KNGCRIMSON committed on
Commit
b5cf002
·
1 Parent(s): 1bfdc85
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .DS_Store +0 -0
  2. README.md +16 -14
  3. __pycache__/run_augmenter.cpython-313.pyc +0 -0
  4. app.py +245 -0
  5. app/.DS_Store +0 -0
  6. app/.gradio/certificate.pem +31 -0
  7. app/.gradio/flagged/dataset1.csv +3 -0
  8. app/__pycache__/run_augmenter.cpython-313.pyc +0 -0
  9. app/app.ipynb +335 -0
  10. create_negative_samples.py +54 -0
  11. data/crossref-preprint-article-relationships-Aug-2023.csv +1001 -0
  12. fetch_positive_samples.py +37 -0
  13. logo.png +0 -0
  14. notebooks/.DS_Store +0 -0
  15. notebooks/1-0-dataset_development.ipynb +1121 -0
  16. positive_augmented_dataset.csv +11 -0
  17. requirements.txt +84 -0
  18. run_augmenter.py +48 -0
  19. scrap.txt +90 -0
  20. src/.DS_Store +0 -0
  21. src/__init__.py +0 -0
  22. src/__pycache__/__init__.cpython-311.pyc +0 -0
  23. src/__pycache__/__init__.cpython-312.pyc +0 -0
  24. src/__pycache__/__init__.cpython-313.pyc +0 -0
  25. src/dataset/DataAugmenter.py +344 -0
  26. src/dataset/Dataset.py +146 -0
  27. src/dataset/GoodDataAugmenter.py +361 -0
  28. src/dataset/GoodDataset.py +248 -0
  29. src/dataset/NegativeSampler.py +325 -0
  30. src/dataset/__init__.py +2 -0
  31. src/dataset/__pycache__/DataAugmenter.cpython-311.pyc +0 -0
  32. src/dataset/__pycache__/DataAugmenter.cpython-312.pyc +0 -0
  33. src/dataset/__pycache__/DataAugmenter.cpython-313.pyc +0 -0
  34. src/dataset/__pycache__/Dataset.cpython-312.pyc +0 -0
  35. src/dataset/__pycache__/Dataset.cpython-313.pyc +0 -0
  36. src/dataset/__pycache__/GoodDataAugmenter.cpython-313.pyc +0 -0
  37. src/dataset/__pycache__/GoodDataset.cpython-313.pyc +0 -0
  38. src/dataset/__pycache__/NegativeSampler.cpython-313.pyc +0 -0
  39. src/dataset/__pycache__/__init__.cpython-311.pyc +0 -0
  40. src/dataset/__pycache__/__init__.cpython-312.pyc +0 -0
  41. src/dataset/__pycache__/__init__.cpython-313.pyc +0 -0
  42. src/dataset/get_dataset.py +41 -0
  43. src/utils/__init__.py +1 -0
  44. src/utils/__pycache__/__init__.cpython-311.pyc +0 -0
  45. src/utils/__pycache__/__init__.cpython-312.pyc +0 -0
  46. src/utils/__pycache__/__init__.cpython-313.pyc +0 -0
  47. src/utils/__pycache__/io_utils.cpython-311.pyc +0 -0
  48. src/utils/__pycache__/io_utils.cpython-312.pyc +0 -0
  49. src/utils/__pycache__/io_utils.cpython-313.pyc +0 -0
  50. src/utils/__pycache__/struct_utils.cpython-313.pyc +0 -0
.DS_Store ADDED
Binary file (8.2 kB). View file
 
README.md CHANGED
@@ -1,14 +1,16 @@
1
- ---
2
- title: MatchPrePrintArticles
3
- emoji: 🌖
4
- colorFrom: green
5
- colorTo: pink
6
- sdk: gradio
7
- sdk_version: 5.8.0
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- short_description: Dataset Creator for Matching PrePrint and Articles
12
- ---
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
1
+ # MatchingPubs
2
+
3
+ ## Dataset
4
+
5
+ The `dataset/` directory contains the following main classes:
6
+
7
+ - `DatasetLoader`: Responsible for loading the dataset from various sources.
8
+ - `DatasetProcessor`: Handles preprocessing and cleaning of the dataset.
9
+ - `DatasetAnalyzer`: Provides methods for analyzing and summarizing the dataset.
10
+
11
+ ## Getting the Dataset
12
+
13
+ To get the dataset, run the following command:
14
+
15
+ ```bash
16
+ PYTHONPATH=$(pwd) python src/dataset/get_dataset.py
17
+ ```
__pycache__/run_augmenter.cpython-313.pyc ADDED
Binary file (2.33 kB). View file
 
app.py ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import pandas as pd
4
+ from src.utils.io_utils import PROJECT_ROOT
5
+ from run_augmenter import negative_sampler , positive_sampler
6
+ from pathlib import Path
7
+
8
def _resolve_input_csv(use_default, csv_file):
    """Return the path of the CSV the samplers should read.

    Args:
        use_default: When truthy, use the bundled Crossref relationships CSV
            under ``PROJECT_ROOT/data`` and ignore ``csv_file``.
        csv_file: Value of the upload widget; either an object exposing a
            ``.name`` path attribute or a plain path string.

    Raises:
        gr.Error: If the default CSV is missing, or if neither the default
            nor an uploaded file was selected.
    """
    if use_default:
        input_csv_path = f"{PROJECT_ROOT}/data/crossref-preprint-article-relationships-Aug-2023.csv"
        if not Path(input_csv_path).exists():
            raise gr.Error("Error: Default CSV file not found!")
        return input_csv_path
    if csv_file is None:
        raise gr.Error("Error: Please select default or upload a CSV file.")
    # Accept both upload objects (with `.name`) and bare path strings.
    return getattr(csv_file, "name", csv_file)


def _export_augmented(augmented_df, output_csv_path):
    """Write ``augmented_df`` to ``output_csv_path`` and build the UI outputs."""
    augmented_df.to_csv(output_csv_path, index=False)
    return output_csv_path, augmented_df.head(), gr.update(visible=True)


def augment_interface(factor, type_or_difficulty, use_default, csv_file=None):
    """Negative Tool Sampler: wrapper to handle negative dataset augmentation.

    Args:
        factor: Number of negative samples per positive sample. It comes from
            a ``gr.Number`` widget and may be a float, so it is cast to int.
        type_or_difficulty: Augmentation type selected in the dropdown; passed
            to ``negative_sampler`` unchanged.
        use_default: Whether to read the bundled default CSV.
        csv_file: Uploaded CSV, used only when ``use_default`` is falsy.

    Returns:
        ``(output_csv_path, preview, visibility_update)`` where ``preview`` is
        ``augmented_df.head()`` and the update makes the preview visible.

    Raises:
        gr.Error: On missing input or any failure while sampling/exporting.
            The first output is wired to a ``gr.File``; returning the message
            there would make Gradio treat it as a file path, so the error is
            raised for Gradio to display instead.
    """
    input_csv_path = _resolve_input_csv(use_default, csv_file)
    try:
        # NOTE(review): assumes negative_sampler wants an integer factor
        # (the widget is labelled "Factor (int)") - confirm.
        augmented_df = negative_sampler(input_csv_path, int(factor), type_or_difficulty)
        return _export_augmented(augmented_df, "augmented_dataset.csv")
    except gr.Error:
        raise
    except Exception as e:
        raise gr.Error(f"Error during processing: {str(e)}") from e


def positive_sampler_interface(use_default, csv_file=None, size=10, random=True, seed=42, full=False):
    """Positive Tool Sampler: wrapper to handle positive dataset augmentation.

    Args:
        use_default: Whether to read the bundled default CSV.
        csv_file: Uploaded CSV, used only when ``use_default`` is falsy.
        size: Number of samples to generate; cast to int because it comes
            from a ``gr.Number`` widget.
        random: Whether to sample randomly; forwarded to ``positive_sampler``.
        seed: Random seed; cast to int for the same reason as ``size``.
        full: "Full dataset mode" flag; forwarded to ``positive_sampler``.

    Returns:
        ``(output_csv_path, preview, visibility_update)``, same layout as
        ``augment_interface``.

    Raises:
        gr.Error: On missing input or any failure while sampling/exporting.
    """
    input_csv_path = _resolve_input_csv(use_default, csv_file)
    try:
        # Call the positive sampler function with additional arguments
        augmented_df = positive_sampler(
            optional_path=input_csv_path,
            size=int(size),
            random=random,
            seed=int(seed),
            full=full,
        )
        return _export_augmented(augmented_df, "positive_augmented_dataset.csv")
    except gr.Error:
        raise
    except Exception as e:
        raise gr.Error(f"Error during processing: {str(e)}") from e
57
+
58
+
59
def reset_output():
    """Clear the outputs: two ``None`` values plus an update hiding the preview."""
    cleared_file = None
    cleared_preview = None
    hide_preview = gr.update(visible=False)
    return cleared_file, cleared_preview, hide_preview
62
+
63
# Stylesheet for the whole page. Kept as a plain string: nothing is
# interpolated, so no brace escaping is needed.
_APP_CSS = """
    .gradio-container {
        font-family: Arial, sans-serif;
        max-width: 900px;
        margin: auto;
    }
    h1 {
        text-align: center;
        color: white;
        font-size: 60px;
        margin-bottom: 0px;
    }
    h2 {
        text-align: center;
        color: #ff0000;
        font-size: 16px;
        font-weight: normal;
        margin-top: 0px;
    }
    .title {
        text-align: center;
        font-size: 40px;
        margin-top: 30px;
        margin-bottom: 20px;
    }
    .title .positive {
        color: #ff0000;
    }
    .title .negative {
        color: #ff0000;
    }
    .title .tool {
        color: white;
    }
    .title .sampler {
        color: #ff0000;
    }
    .description {
        text-align: center;
        margin-bottom: 20px;
    }
    #submit-button {
        background-color: #ff0000;
        color: white;
        font-size: 16px;
        border: none;
        border-radius: 5px;
        padding: 10px 20px;
    }
    #reset-button {
        background-color: #d3d3d3;
        color: black;
        font-size: 16px;
        border: none;
        border-radius: 5px;
        padding: 10px 20px;
    }
"""

with gr.Blocks(css=_APP_CSS) as app:
    # ---- Page header ----
    gr.Markdown("""
    <h1>ENTC</h1>
    <h2>Entrepreneurship and Technology Commercialization · EPFL</h2>
    """)

    # ---- Positive sampler panel ----
    gr.Markdown("""
    <div class="title">
        <span class="positive">Positive</span>
        <span class="tool">Tool</span>
        <span class="sampler">Sampler</span>
    </div>
    """)

    gr.Markdown("""
    <p class="description">
        This tool takes a list of DOIs and augments them using the OpenAlex API.
        It is designed to complement the Negative Tool Sampler, enabling the creation of complete datasets.
    </p>
    """)

    with gr.Group():
        with gr.Row():
            pos_use_default_checkbox = gr.Checkbox(
                label="Use Default Dataset",
                value=True,
            )
            # Hidden until the "use default" box is unticked.
            pos_csv_file_input = gr.File(
                label="Upload CSV (optional)",
                file_types=[".csv"],
                visible=False,
            )

        with gr.Row():
            size_input = gr.Number(
                label="Number of Samples",
                value=10,
                info="Specify the number of samples to generate.",
            )
            random_input = gr.Checkbox(
                label="Sample Randomly",
                value=True,
                info="Whether to sample randomly.",
            )
            seed_input = gr.Number(
                label="Random Seed",
                value=42,
                info="Random seed for reproducibility.",
            )
            full_input = gr.Checkbox(
                label="Full Dataset Mode",
                value=False,
                info="Indicate whether to use the full dataset.",
            )

    with gr.Group():
        pos_output_file = gr.File(label="Download Augmented Dataset")
        pos_dataset_preview = gr.DataFrame(
            label="Dataset Preview",
            interactive=False,
            visible=False,
        )
        with gr.Row():
            pos_submit_button = gr.Button("Submit 🚀", elem_id="submit-button")
            pos_reset_button = gr.Button("Reset 🔄", elem_id="reset-button")

    # The preview appears twice on purpose: handlers return both a value and
    # a visibility update for it.
    pos_submit_button.click(
        fn=positive_sampler_interface,
        inputs=[
            pos_use_default_checkbox,
            pos_csv_file_input,
            size_input,
            random_input,
            seed_input,
            full_input,
        ],
        outputs=[pos_output_file, pos_dataset_preview, pos_dataset_preview],
    )

    pos_reset_button.click(
        fn=reset_output,
        inputs=[],
        outputs=[pos_output_file, pos_dataset_preview, pos_dataset_preview],
    )

    def toggle_pos_csv_input(use_default):
        """Show the positive-panel upload widget only when the default is off."""
        show_upload = not use_default
        return gr.update(visible=show_upload)

    pos_use_default_checkbox.change(
        fn=toggle_pos_csv_input,
        inputs=[pos_use_default_checkbox],
        outputs=[pos_csv_file_input],
    )

    # ---- Negative sampler panel ----
    gr.Markdown("""
    <div class="title">
        <span class="negative">Negative</span>
        <span class="tool">Tool</span>
        <span class="sampler">Sampler</span>
    </div>
    """)

    gr.Markdown("""
    <p class="description">
        This tool generates datasets by creating negative samples from positive matches between preprints and articles.
        Customize the difficulty and the augmentation factor to meet your needs.
    </p>
    """)

    with gr.Group():
        with gr.Row():
            factor_input = gr.Number(
                label="Factor (int)",
                value=1,
                info="Specify the number of negative samples per positive sample.",
            )
            type_dropdown = gr.Dropdown(
                ["random", "similar topics", "overlapping authors", "random authors", "fuzzed title"],
                label="Select Difficulty or Augmentation Type",
            )
        with gr.Row():
            use_default_checkbox = gr.Checkbox(
                label="Use Default Dataset",
                value=True,
            )
            csv_file_input = gr.File(
                label="Upload CSV (optional)",
                file_types=[".csv"],
                visible=False,
            )

    with gr.Group():
        output_file = gr.File(label="Download Augmented Dataset")
        dataset_preview = gr.DataFrame(
            label="Dataset Preview",
            interactive=False,
            visible=False,
        )
        with gr.Row():
            submit_button = gr.Button("Submit 🚀", elem_id="submit-button")
            reset_button = gr.Button("Reset 🔄", elem_id="reset-button")

    submit_button.click(
        fn=augment_interface,
        inputs=[factor_input, type_dropdown, use_default_checkbox, csv_file_input],
        outputs=[output_file, dataset_preview, dataset_preview],
    )

    reset_button.click(
        fn=reset_output,
        inputs=[],
        outputs=[output_file, dataset_preview, dataset_preview],
    )

    def toggle_csv_input(use_default):
        """Show the negative-panel upload widget only when the default is off."""
        show_upload = not use_default
        return gr.update(visible=show_upload)

    use_default_checkbox.change(
        fn=toggle_csv_input,
        inputs=[use_default_checkbox],
        outputs=[csv_file_input],
    )

# Start the server only when run as a script, with a public share link.
if __name__ == "__main__":
    app.launch(share=True)
app/.DS_Store ADDED
Binary file (6.15 kB). View file
 
app/.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
app/.gradio/flagged/dataset1.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Factor (int),Select Augmentation Type or Difficulty,Use Default Dataset,Upload CSV (optional),Download Augmented Dataset,timestamp
2
+ ,,false,,,2024-12-10 22:00:22.460971
3
+ 1,easy,true,,,2024-12-10 22:00:36.882145
app/__pycache__/run_augmenter.cpython-313.pyc ADDED
Binary file (914 Bytes). View file
 
app/app.ipynb ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "/Users/giorgosnikolaou/Library/Python/3.9/lib/python/site-packages/urllib3/__init__.py:35: NotOpenSSLWarning: urllib3 v2 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'LibreSSL 2.8.3'. See: https://github.com/urllib3/urllib3/issues/3020\n",
13
+ " warnings.warn(\n",
14
+ "[nltk_data] Downloading package words to\n",
15
+ "[nltk_data] /Users/giorgosnikolaou/nltk_data...\n",
16
+ "[nltk_data] Package words is already up-to-date!\n"
17
+ ]
18
+ }
19
+ ],
20
+ "source": [
21
+ "import gradio as gr\n",
22
+ "import pandas as pd\n",
23
+ "import pandas as pd\n",
24
+ "from src.utils.io_utils import PROJECT_ROOT\n",
25
+ "from run_augmenter import negative_sampler , positive_sampler\n",
26
+ "from pathlib import Path\n"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": null,
32
+ "metadata": {},
33
+ "outputs": [
34
+ {
35
+ "name": "stdout",
36
+ "output_type": "stream",
37
+ "text": [
38
+ "Running on local URL: http://127.0.0.1:7860\n",
39
+ "Running on public URL: https://85b886469a8c17104c.gradio.live\n",
40
+ "\n",
41
+ "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n"
42
+ ]
43
+ },
44
+ {
45
+ "data": {
46
+ "text/html": [
47
+ "<div><iframe src=\"https://85b886469a8c17104c.gradio.live\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
48
+ ],
49
+ "text/plain": [
50
+ "<IPython.core.display.HTML object>"
51
+ ]
52
+ },
53
+ "metadata": {},
54
+ "output_type": "display_data"
55
+ },
56
+ {
57
+ "name": "stdout",
58
+ "output_type": "stream",
59
+ "text": [
60
+ "random\n"
61
+ ]
62
+ },
63
+ {
64
+ "name": "stderr",
65
+ "output_type": "stream",
66
+ "text": [
67
+ "Negative Sampling: 100%|██████████| 100/100 [00:11<00:00, 8.43it/s]\n"
68
+ ]
69
+ }
70
+ ],
71
+ "source": [
72
+ "\n",
73
+ "def augment_interface(factor, type_or_difficulty, use_default, csv_file=None):\n",
74
+ " \"\"\"Negative Tool Sampler: Wrapper to handle negative dataset augmentation.\"\"\"\n",
75
+ " try:\n",
76
+ " if use_default:\n",
77
+ " input_csv_path = f\"{PROJECT_ROOT}/data/crossref-preprint-article-relationships-Aug-2023.csv\"\n",
78
+ " if not Path(input_csv_path).exists():\n",
79
+ " return \"Error: Default CSV file not found!\", None, gr.update(visible=False)\n",
80
+ " elif csv_file is not None:\n",
81
+ " input_csv_path = csv_file.name\n",
82
+ " else:\n",
83
+ " return \"Error: Please select default or upload a CSV file.\", None, gr.update(visible=False)\n",
84
+ "\n",
85
+ " augmented_df = negative_sampler(input_csv_path, factor, type_or_difficulty)\n",
86
+ " output_csv_path = \"augmented_dataset.csv\"\n",
87
+ " augmented_df.to_csv(output_csv_path, index=False)\n",
88
+ "\n",
89
+ " return output_csv_path, augmented_df.head(), gr.update(visible=True)\n",
90
+ "\n",
91
+ " except Exception as e:\n",
92
+ " return f\"Error during processing: {str(e)}\", None, gr.update(visible=False)\n",
93
+ "\n",
94
+ "\n",
95
+ "def positive_sampler_interface(use_default, csv_file=None, size=10, random=True, seed=42, full=False):\n",
96
+ " \"\"\"Positive Tool Sampler: Wrapper to handle positive dataset augmentation with additional arguments.\"\"\"\n",
97
+ " try:\n",
98
+ " if use_default:\n",
99
+ " input_csv_path = f\"{PROJECT_ROOT}/data/crossref-preprint-article-relationships-Aug-2023.csv\"\n",
100
+ " if not Path(input_csv_path).exists():\n",
101
+ " return \"Error: Default CSV file not found!\", None, gr.update(visible=False)\n",
102
+ " elif csv_file is not None:\n",
103
+ " input_csv_path = csv_file.name\n",
104
+ " else:\n",
105
+ " return \"Error: Please select default or upload a CSV file.\", None, gr.update(visible=False)\n",
106
+ "\n",
107
+ " # Call the positive sampler function with additional arguments\n",
108
+ " augmented_df = positive_sampler(\n",
109
+ " optional_path=input_csv_path, \n",
110
+ " size=size, \n",
111
+ " random=random, \n",
112
+ " seed=seed, \n",
113
+ " full=full\n",
114
+ " )\n",
115
+ " output_csv_path = \"positive_augmented_dataset.csv\"\n",
116
+ " augmented_df.to_csv(output_csv_path, index=False)\n",
117
+ "\n",
118
+ " return output_csv_path, augmented_df.head(), gr.update(visible=True)\n",
119
+ "\n",
120
+ " except Exception as e:\n",
121
+ " return f\"Error during processing: {str(e)}\", None, gr.update(visible=False)\n",
122
+ "\n",
123
+ "\n",
124
+ "def reset_output():\n",
125
+ " \"\"\"Resets the output fields by returning None and hiding the DataFrame.\"\"\"\n",
126
+ " return None, None, gr.update(visible=False)\n",
127
+ "\n",
128
+ "with gr.Blocks(css=f\"\"\"\n",
129
+ " .gradio-container {{\n",
130
+ " font-family: Arial, sans-serif;\n",
131
+ " max-width: 900px;\n",
132
+ " margin: auto;\n",
133
+ " }}\n",
134
+ " h1 {{\n",
135
+ " text-align: center;\n",
136
+ " color: white;\n",
137
+ " font-size: 60px;\n",
138
+ " margin-bottom: 0px;\n",
139
+ " }}\n",
140
+ " h2 {{\n",
141
+ " text-align: center;\n",
142
+ " color: #ff0000;\n",
143
+ " font-size: 16px;\n",
144
+ " font-weight: normal;\n",
145
+ " margin-top: 0px;\n",
146
+ " }}\n",
147
+ " .title {{\n",
148
+ " text-align: center;\n",
149
+ " font-size: 40px;\n",
150
+ " margin-top: 30px;\n",
151
+ " margin-bottom: 20px;\n",
152
+ " }}\n",
153
+ " .title .positive {{\n",
154
+ " color: #ff0000;\n",
155
+ " }}\n",
156
+ " .title .negative {{\n",
157
+ " color: #ff0000;\n",
158
+ " }}\n",
159
+ " .title .tool {{\n",
160
+ " color: white;\n",
161
+ " }}\n",
162
+ " .title .sampler {{\n",
163
+ " color: #ff0000;\n",
164
+ " }}\n",
165
+ " .description {{\n",
166
+ " text-align: center;\n",
167
+ " margin-bottom: 20px;\n",
168
+ " }}\n",
169
+ " #submit-button {{\n",
170
+ " background-color: #ff0000;\n",
171
+ " color: white;\n",
172
+ " font-size: 16px;\n",
173
+ " border: none;\n",
174
+ " border-radius: 5px;\n",
175
+ " padding: 10px 20px;\n",
176
+ " }}\n",
177
+ " #reset-button {{\n",
178
+ " background-color: #d3d3d3;\n",
179
+ " color: black;\n",
180
+ " font-size: 16px;\n",
181
+ " border: none;\n",
182
+ " border-radius: 5px;\n",
183
+ " padding: 10px 20px;\n",
184
+ " }}\n",
185
+ "\"\"\") as app:\n",
186
+ " # Main Title Section\n",
187
+ " gr.Markdown(\"\"\"\n",
188
+ " <h1>ENTC</h1>\n",
189
+ " <h2>Entrepreneurship and Technology Commercialization · EPFL</h2>\n",
190
+ " \"\"\")\n",
191
+ "\n",
192
+ " # Positive Tool Sampler Section\n",
193
+ " gr.Markdown(\"\"\"\n",
194
+ " <div class=\"title\">\n",
195
+ " <span class=\"positive\">Positive</span>\n",
196
+ " <span class=\"tool\">Tool</span>\n",
197
+ " <span class=\"sampler\">Sampler</span>\n",
198
+ " </div>\n",
199
+ " \"\"\")\n",
200
+ "\n",
201
+ " gr.Markdown(\"\"\"\n",
202
+ " <p class=\"description\">\n",
203
+ " This tool takes a list of DOIs and augments them using the OpenAlex API.\n",
204
+ " It is designed to complement the Negative Tool Sampler, enabling the creation of complete datasets.\n",
205
+ " </p>\n",
206
+ " \"\"\")\n",
207
+ "\n",
208
+ " with gr.Group():\n",
209
+ " with gr.Row():\n",
210
+ " pos_use_default_checkbox = gr.Checkbox(label=\"Use Default Dataset\", value=True)\n",
211
+ " pos_csv_file_input = gr.File(label=\"Upload CSV (optional)\", file_types=[\".csv\"], visible=False)\n",
212
+ "\n",
213
+ " with gr.Row():\n",
214
+ " size_input = gr.Number(label=\"Number of Samples\", value=10, info=\"Specify the number of samples to generate.\")\n",
215
+ " random_input = gr.Checkbox(label=\"Sample Randomly\", value=True, info=\"Whether to sample randomly.\")\n",
216
+ " seed_input = gr.Number(label=\"Random Seed\", value=42, info=\"Random seed for reproducibility.\")\n",
217
+ " full_input = gr.Checkbox(label=\"Full Dataset Mode\", value=False, info=\"Indicate whether to use the full dataset.\")\n",
218
+ "\n",
219
+ " with gr.Group():\n",
220
+ " pos_output_file = gr.File(label=\"Download Augmented Dataset\")\n",
221
+ " pos_dataset_preview = gr.DataFrame(label=\"Dataset Preview\", interactive=False, visible=False)\n",
222
+ " with gr.Row():\n",
223
+ " pos_submit_button = gr.Button(\"Submit 🚀\", elem_id=\"submit-button\")\n",
224
+ " pos_reset_button = gr.Button(\"Reset 🔄\", elem_id=\"reset-button\")\n",
225
+ "\n",
226
+ " # Button Actions\n",
227
+ " pos_submit_button.click(\n",
228
+ " positive_sampler_interface,\n",
229
+ " inputs=[pos_use_default_checkbox, pos_csv_file_input, size_input, random_input, seed_input, full_input],\n",
230
+ " outputs=[pos_output_file, pos_dataset_preview, pos_dataset_preview]\n",
231
+ " )\n",
232
+ "\n",
233
+ " pos_reset_button.click(\n",
234
+ " reset_output,\n",
235
+ " inputs=[],\n",
236
+ " outputs=[pos_output_file, pos_dataset_preview, pos_dataset_preview]\n",
237
+ " )\n",
238
+ "\n",
239
+ " # Toggle File Input\n",
240
+ " def toggle_pos_csv_input(use_default):\n",
241
+ " return gr.update(visible=not use_default)\n",
242
+ "\n",
243
+ " pos_use_default_checkbox.change(\n",
244
+ " toggle_pos_csv_input,\n",
245
+ " inputs=[pos_use_default_checkbox],\n",
246
+ " outputs=[pos_csv_file_input]\n",
247
+ " )\n",
248
+ "\n",
249
+ " # Negative Tool Sampler Section\n",
250
+ " gr.Markdown(\"\"\"\n",
251
+ " <div class=\"title\">\n",
252
+ " <span class=\"negative\">Negative</span>\n",
253
+ " <span class=\"tool\">Tool</span>\n",
254
+ " <span class=\"sampler\">Sampler</span>\n",
255
+ " </div>\n",
256
+ " \"\"\")\n",
257
+ "\n",
258
+ " gr.Markdown(\"\"\"\n",
259
+ " <p class=\"description\">\n",
260
+ " This tool generates datasets by creating negative samples from positive matches between preprints and articles.\n",
261
+ " Customize the difficulty and the augmentation factor to meet your needs.\n",
262
+ " </p>\n",
263
+ " \"\"\")\n",
264
+ "\n",
265
+ " with gr.Group():\n",
266
+ " with gr.Row():\n",
267
+ " factor_input = gr.Number(\n",
268
+ " label=\"Factor (int)\", value=1, info=\"Specify the number of negative samples per positive sample.\"\n",
269
+ " )\n",
270
+ " type_dropdown = gr.Dropdown(\n",
271
+ " [\"random\", \"similar topics\", \"overlapping authors\", \"random authors\", \"fuzzed title\"],\n",
272
+ " label=\"Select Difficulty or Augmentation Type\"\n",
273
+ " )\n",
274
+ " with gr.Row():\n",
275
+ " use_default_checkbox = gr.Checkbox(label=\"Use Default Dataset\", value=True)\n",
276
+ " csv_file_input = gr.File(label=\"Upload CSV (optional)\", file_types=[\".csv\"], visible=False)\n",
277
+ "\n",
278
+ " with gr.Group():\n",
279
+ " output_file = gr.File(label=\"Download Augmented Dataset\")\n",
280
+ " dataset_preview = gr.DataFrame(label=\"Dataset Preview\", interactive=False, visible=False)\n",
281
+ " with gr.Row():\n",
282
+ " submit_button = gr.Button(\"Submit 🚀\", elem_id=\"submit-button\")\n",
283
+ " reset_button = gr.Button(\"Reset 🔄\", elem_id=\"reset-button\")\n",
284
+ "\n",
285
+ " # Button Actions\n",
286
+ " submit_button.click(\n",
287
+ " augment_interface,\n",
288
+ " inputs=[factor_input, type_dropdown, use_default_checkbox, csv_file_input],\n",
289
+ " outputs=[output_file, dataset_preview, dataset_preview]\n",
290
+ " )\n",
291
+ "\n",
292
+ " reset_button.click(\n",
293
+ " reset_output,\n",
294
+ " inputs=[],\n",
295
+ " outputs=[output_file, dataset_preview, dataset_preview]\n",
296
+ " )\n",
297
+ "\n",
298
+ " # Toggle File Input\n",
299
+ " def toggle_csv_input(use_default):\n",
300
+ " return gr.update(visible=not use_default)\n",
301
+ "\n",
302
+ " use_default_checkbox.change(\n",
303
+ " toggle_csv_input,\n",
304
+ " inputs=[use_default_checkbox],\n",
305
+ " outputs=[csv_file_input]\n",
306
+ " )\n",
307
+ "\n",
308
+ "# Launch the app\n",
309
+ "if __name__ == \"__main__\":\n",
310
+ " app.launch(share=True)\n"
311
+ ]
312
+ }
313
+ ],
314
+ "metadata": {
315
+ "kernelspec": {
316
+ "display_name": "marple",
317
+ "language": "python",
318
+ "name": "python3"
319
+ },
320
+ "language_info": {
321
+ "codemirror_mode": {
322
+ "name": "ipython",
323
+ "version": 3
324
+ },
325
+ "file_extension": ".py",
326
+ "mimetype": "text/x-python",
327
+ "name": "python",
328
+ "nbconvert_exporter": "python",
329
+ "pygments_lexer": "ipython3",
330
+ "version": "3.9.6"
331
+ }
332
+ },
333
+ "nbformat": 4,
334
+ "nbformat_minor": 2
335
+ }
create_negative_samples.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.dataset.GoodDataset import *
2
+ from src.dataset.NegativeSampler import *
3
+ import argparse
4
+ import os
5
+
6
def main(config):
    """Generate negative samples from stored positives and dump both to CSV.

    Loads ``config.input`` into an ``AugmentedDataset``, lets a
    ``NegativeSampler`` populate its negative samples, then writes the
    positive and negative samples to ``./data/pos.csv`` and
    ``./data/neg.csv`` and prints a preview plus both sample counts.

    Args:
        config: Namespace object containing the script arguments. Only
            ``config.input`` is read here directly; the whole namespace is
            forwarded to ``create_negative_samples``.
    """
    augmented = AugmentedDataset()
    augmented.load(config.input)

    negative_sampler = NegativeSampler(augmented)
    negative_sampler.create_negative_samples(config)

    # Preview of the generated negatives.
    preview_df = custom_struct_to_df(augmented.negative_samples)
    print(preview_df.head())

    # Persist both halves of the dataset as CSV.
    positives_df = custom_struct_to_df(augmented.positive_samples)
    positives_df.to_csv('./data/pos.csv', index=False)
    negatives_df = custom_struct_to_df(augmented.negative_samples)
    negatives_df.to_csv('./data/neg.csv', index=False)

    positive_count = len(augmented.positive_samples)
    print(positive_count)
    negative_count = len(augmented.negative_samples)
    print(negative_count)
24
+
25
+
26
if __name__ == "__main__":
    # Imported here so the project root is only resolved when run as a script.
    from src.utils.io_utils import PROJECT_ROOT

    parser = argparse.ArgumentParser(description="Generate and save a dataset based on the given configuration.")

    # File locations; both default to pickles under PROJECT_ROOT/data.
    default_input = os.path.join(PROJECT_ROOT, "data/positive_samples.pkl")
    default_output = os.path.join(PROJECT_ROOT, "data/negative_samples.pkl")
    parser.add_argument("-i", "--input", type=str, default=default_input, help="Input file path to load the positive samples.")
    parser.add_argument("-o", "--output", type=str, default=default_output, help="Output file path to save the negative samples.")

    parser.add_argument("-s", "--seed", type=int, default=42, help="Random seed for reproducibility.")

    # Boolean switches, registered in the same order as before.
    strategy_switches = (
        ("-r", "--random", "Utilization of `sample_random`"),
        ("-f", "--fuzz_title", "Utilization of `fuzz_title`"),
        ("-ra", "--replace_auth", "Utilization of `sample_authors_overlap_random`"),
        ("-oa", "--overlap_auth", "Utilization of `sample_authors_overlap`"),
        ("-ot", "--overlap_topic", "Utilization of `sample_similar_topic`"),
    )
    for short_flag, long_flag, help_text in strategy_switches:
        parser.add_argument(short_flag, long_flag, action='store_true', help=help_text)

    # Integer tuning knobs.
    integer_options = (
        ("--factor_max", 4, "Maximum number of negative samples to generate per positive sample."),
        ("--authors_to_consider", 1, "Number of authors to consider when overlapping authors."),
        ("--overlapping_authors", 1, "Minimum number of overlapping authors required."),
        ("--fuzz_count", -1, "Number of words to replace when fuzzing titles."),
    )
    for long_flag, default_value, help_text in integer_options:
        parser.add_argument(long_flag, type=int, default=default_value, help=help_text)

    config = parser.parse_args()

    # --overlap_auth and --overlap_topic are mutually exclusive.
    if config.overlap_auth and config.overlap_topic:
        parser.error("Only one of --overlap_auth and --overlap_topic can be used.")
    # One of --overlap_auth, --overlap_topic or --random is required.
    if not any((config.overlap_auth, config.overlap_topic, config.random)):
        parser.error("At least one of --overlap_auth, --overlap_topic, or --random must be specified.")
    main(config)
data/crossref-preprint-article-relationships-Aug-2023.csv ADDED
@@ -0,0 +1,1001 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ preprint_doi,article_doi,deposited_by_article_publisher,deposited_by_preprint_publisher,matching_confidence_score
2
+ 10.5194/wcd-2021-52,10.5194/wcd-2-1245-2021,True,True,0.9919484702093396
3
+ 10.5194/tc-2020-96,10.5194/tc-15-1277-2021,True,True,1.0
4
+ 10.1101/001586,10.1016/j.bica.2014.02.003,False,True,1.0
5
+ 10.2196/preprints.23492,10.2196/23492,False,True,1.0
6
+ 10.5194/acpd-9-11659-2009,10.5194/acp-9-9349-2009,True,True,1.0
7
+ 10.31235/osf.io/nj43g,10.31014/aior.1991.04.01.262,False,True,0.9456161616161616
8
+ 10.2196/preprints.16461,10.2196/16461,False,True,1.0
9
+ 10.5194/acpd-10-523-2010,10.5194/acp-10-4699-2010,True,True,1.0
10
+ 10.20944/preprints201910.0177.v1,10.3390/ma12223739,False,True,
11
+ 10.20944/preprints201903.0234.v1,10.35513/21658005.2019.1.5,False,True,1.0
12
+ 10.31234/osf.io/6ythf,10.1017/s0140525x19002206,False,True,1.0
13
+ 10.2196/preprints.11905,10.2196/11905,False,True,1.0
14
+ 10.1101/665364,10.1016/j.dyepig.2019.107863,False,True,0.936810016689432
15
+ 10.1101/2020.06.10.20067116,10.1098/rspa.2019.0790,False,True,1.0
16
+ 10.2196/preprints.19048,10.2196/19048,False,True,1.0
17
+ 10.2196/preprints.29042,10.2196/29042,False,True,1.0
18
+ 10.21203/rs.2.14007/v2,10.1186/s12885-019-6361-2,False,True,1.0
19
+ 10.21203/rs.2.14007/v3,10.1186/s12885-019-6361-2,False,True,1.0
20
+ 10.21203/rs.2.14007/v1,10.1186/s12885-019-6361-2,False,True,
21
+ 10.2196/preprints.27257,10.2196/27257,False,True,1.0
22
+ 10.5194/hessd-7-7121-2010,10.5194/hess-15-197-2011,True,True,1.0
23
+ 10.20944/preprints201801.0038.v1,10.3390/e20030160,False,True,1.0
24
+ 10.20944/preprints201810.0314.v1,10.3390/v10110603,False,True,0.9962962962962963
25
+ 10.1101/2020.03.21.001347,10.21914/anziamj.v61i0.15040,False,True,0.9722222222222224
26
+ 10.1101/853283,10.1016/j.brainres.2019.146627,False,True,1.0
27
+ 10.2196/preprints.19021,10.2196/19021,False,True,1.0
28
+ 10.5194/gmd-2019-113,10.5194/gmd-13-4845-2020,True,True,1.0
29
+ 10.20944/preprints201711.0016.v1,10.3390/sym9120292,False,True,0.9916666666666668
30
+ 10.5194/osd-12-1567-2015,10.5194/os-12-39-2016,True,True,1.0
31
+ 10.20944/preprints201807.0566.v1,10.3390/sym10100451,False,True,
32
+ 10.5194/hessd-5-1371-2008,10.5194/hess-14-2243-2010,True,True,0.8916666666666666
33
+ 10.31219/osf.io/s93tx,10.1080/19419899.2021.1875595,False,True,1.0
34
+ 10.31730/osf.io/cxety,10.35409/ijbmer.2019.2421,False,True,0.9907407407407408
35
+ 10.1101/675090,10.1371/journal.pone.0219107,False,True,1.0
36
+ 10.20944/preprints202101.0270.v1,10.3390/s21051871,False,True,0.9498525073746312
37
+ 10.5194/acp-2020-1010,10.5194/acp-21-9585-2021,True,True,0.998531571218796
38
+ 10.31219/osf.io/j28d4,10.31014/aior.1992.03.04.310,False,True,
39
+ 10.31222/osf.io/c5xu8,10.31014/aior.1992.03.04.310,False,True,0.8857142857142857
40
+ 10.5194/os-2016-41,10.5194/os-12-1279-2016,True,True,1.0
41
+ 10.1101/790352,10.1016/j.neuroscience.2020.02.016,False,True,0.9743589743589745
42
+ 10.2196/preprints.10722,10.2196/10722,False,True,1.0
43
+ 10.20944/preprints201710.0189.v1,10.3390/soc8010005,False,True,1.0
44
+ 10.2196/preprints.17323,10.2196/17323,False,True,1.0
45
+ 10.20944/preprints202005.0002.v1,10.3390/app10113953,False,True,1.0
46
+ 10.2196/preprints.6578,10.2196/pediatrics.6578,False,True,1.0
47
+ 10.21034/dp.44,10.1016/s0927-5398(01)00040-8,False,True,
48
+ 10.20944/preprints201608.0072.v1,10.3390/fermentation2030016,False,True,1.0
49
+ 10.5194/esurf-2021-105,10.5194/esurf-10-875-2022,True,True,0.9392265193370166
50
+ 10.20944/preprints201809.0407.v1,10.3390/sym10100514,False,True,1.0
51
+ 10.20944/preprints201807.0609.v1,10.3390/en11082171,False,True,0.9858156028368796
52
+ 10.20944/preprints201806.0018.v1,10.3390/su10072312,False,True,0.978494623655914
53
+ 10.5194/bg-2021-170,10.5194/bg-18-6167-2021,True,True,0.9986824769433466
54
+ 10.21203/rs.2.454/v1,10.1186/s13063-019-3712-x,False,True,0.9910714285714284
55
+ 10.21203/rs.2.454/v2,10.1186/s13063-019-3712-x,False,True,0.9910714285714284
56
+ 10.1101/2020.02.04.934588,10.1016/j.cnsns.2020.105373,False,True,0.9941520467836256
57
+ 10.31235/osf.io/9uw6j,10.1017/aap.2019.4,False,True,1.0
58
+ 10.20944/preprints201802.0123.v1,10.3390/su10040947,False,True,0.9989350372736956
59
+ 10.31219/osf.io/nvz85,10.13189/ujer.2020.082273,False,True,0.975438596491228
60
+ 10.21203/rs.3.rs-68784/v2,10.1186/s13018-020-02188-2,False,True,1.0
61
+ 10.21203/rs.3.rs-68784/v1,10.1186/s13018-020-02188-2,False,True,1.0
62
+ 10.5194/angeo-2021-38,10.5194/angeo-39-1005-2021,True,True,0.996376811594203
63
+ 10.1101/213397,10.1093/molbev/msy059,False,True,0.9847494553376906
64
+ 10.21203/rs.3.rs-122948/v1,10.1186/s12879-021-05787-4,False,True,1.0
65
+ 10.1101/468959,10.1080/1062936x.2012.742136,False,True,
66
+ 10.2196/preprints.10213,10.2196/10213,False,True,0.9743589743589745
67
+ 10.20944/preprints201810.0228.v1,10.3390/children5110151,False,True,1.0
68
+ 10.21203/rs.3.rs-47855/v2,10.1186/s12879-021-05889-z,False,True,1.0
69
+ 10.21203/rs.3.rs-47855/v1,10.1186/s12879-021-05889-z,False,True,0.9910824834496332
70
+ 10.1002/essoar.10511860.1,10.1007/s11356-022-22561-4,False,True,1.0
71
+ 10.1101/139642,10.1371/journal.pone.0192081,False,True,0.9770609318996416
72
+ 10.2196/preprints.39166,10.2196/39166,False,True,1.0
73
+ 10.20944/preprints202212.0232.v1,10.3390/v15020406,False,True,1.0
74
+ 10.5194/amt-2022-196,10.5194/amt-16-707-2023,True,True,1.0
75
+ 10.2196/preprints.19397,10.2196/19397,False,True,1.0
76
+ 10.1101/496752,10.1016/j.optom.2019.10.001,False,True,0.9983660130718954
77
+ 10.1101/261370,10.1167/18.6.9,False,True,1.0
78
+ 10.5194/amt-2017-92,10.5194/amt-11-17-2018,True,True,1.0
79
+ 10.20944/preprints202102.0387.v1,10.3390/foods10030678,False,True,1.0
80
+ 10.20944/preprints201910.0017.v1,10.3390/sym11111390,False,True,1.0
81
+ 10.20944/preprints201707.0061.v1,10.3390/rel8080155,False,True,0.9885057471264368
82
+ 10.5194/wes-2018-35,10.5194/wes-3-615-2018,True,True,0.9987029831387808
83
+ 10.31235/osf.io/t25hr,10.18408/ahuri-7115001,False,True,1.0
84
+ 10.5194/se-2019-99,10.5194/se-11-241-2020,True,True,0.9732770745428972
85
+ 10.5194/amt-2021-113,10.5194/amt-14-6379-2021,True,True,0.9866666666666668
86
+ 10.2196/preprints.17782,10.2196/17782,False,True,1.0
87
+ 10.21203/rs.2.12126/v3,10.1186/s12898-019-0270-8,False,True,0.9933333333333332
88
+ 10.21203/rs.2.12126/v1,10.1186/s12898-019-0270-8,False,True,0.9909178743961352
89
+ 10.21203/rs.2.12126/v2,10.1186/s12898-019-0270-8,False,True,0.9909178743961352
90
+ 10.31235/osf.io/8k7sp,10.4303/jdar/235992,False,True,1.0
91
+ 10.2196/preprints.23357,10.2196/23357,False,True,1.0
92
+ 10.5194/os-2022-15,10.5194/os-18-1163-2022,True,True,0.999250936329588
93
+ 10.5194/npgd-1-1133-2014,10.5194/npg-22-53-2015,True,False,1.0
94
+ 10.2196/preprints.46339,10.2196/46339,False,True,1.0
95
+ 10.2196/preprints.38176,10.2196/38176,False,True,1.0
96
+ 10.2196/preprints.44602,10.2196/44602,False,True,1.0
97
+ 10.2196/preprints.20571,10.2196/20571,False,True,1.0
98
+ 10.2196/preprints.12603,10.2196/12603,False,True,1.0
99
+ 10.20944/preprints201809.0144.v1,10.3390/resources7040076,False,True,0.9727626459143968
100
+ 10.5194/cp-2017-68,10.5194/cp-15-377-2019,True,True,1.0
101
+ 10.1101/402750,10.1098/rsif.2018.0792,False,True,1.0
102
+ 10.31234/osf.io/5bm8r,10.1163/22105832-00902006,False,True,1.0
103
+ 10.21203/rs.3.rs-2156656/v1,10.1038/s41388-023-02692-9,False,True,1.0
104
+ 10.5194/os-2020-51,10.5194/os-17-131-2021,True,True,0.9956140350877192
105
+ 10.20944/preprints202102.0336.v1,10.3390/app11062801,False,True,0.9324894514767932
106
+ 10.2196/preprints.16294,10.2196/16294,False,True,0.9784172661870504
107
+ 10.20944/preprints201809.0477.v1,10.3390/medicina54060099,False,True,0.9989417989417988
108
+ 10.2196/preprints.23400,10.2196/23400,False,True,1.0
109
+ 10.5194/hgss-2023-1,10.5194/hgss-14-61-2023,True,True,1.0
110
+ 10.26434/chemrxiv-2022-k7k0h-v6,10.1021/acs.jpcb.2c03638,False,True,0.9807692307692308
111
+ 10.5194/amt-2022-263,10.5194/amt-16-355-2023,True,True,0.9877675840978594
112
+ 10.1101/446310,10.1371/journal.pntd.0006927,False,True,0.9273689273689274
113
+ 10.1101/537035,10.1021/acs.jctc.0c00476,False,True,1.0
114
+ 10.21203/rs.3.rs-39716/v1,10.1186/s12913-021-06123-x,False,True,0.9987029831387808
115
+ 10.21203/rs.3.rs-39716/v2,10.1186/s12913-021-06123-x,False,True,1.0
116
+ 10.5194/cp-2021-15,10.5194/cp-17-2327-2021,True,True,0.8992248062015503
117
+ 10.21203/rs.3.rs-114221/v1,10.1186/s12889-021-10594-2,False,True,1.0
118
+ 10.21203/rs.3.rs-2014302/v1,10.1038/s41372-023-01642-3,False,True,1.0
119
+ 10.1101/2021.10.21.465319,10.1002/oby.23441,False,True,0.959660297239915
120
+ 10.5194/acpd-14-14637-2014,10.5194/acp-14-12683-2014,True,True,1.0
121
+ 10.32942/osf.io/mxg6q,10.1111/1440-1703.12294,False,True,1.0
122
+ 10.5194/acpd-11-813-2011,10.5194/acp-11-8017-2011,True,True,0.938818565400844
123
+ 10.5194/acp-2016-365,10.5194/acp-17-575-2017,True,True,0.9300395256916996
124
+ 10.5194/amtd-8-8385-2015,10.5194/amt-9-359-2016,True,True,0.9983660130718954
125
+ 10.5194/acpd-8-15101-2008,10.5194/acp-9-1639-2009,True,True,1.0
126
+ 10.5194/acp-2018-739,10.5194/acp-18-17225-2018,True,True,0.9977324263038548
127
+ 10.5194/amt-2016-398,10.5194/amt-10-1911-2017,True,True,1.0
128
+ 10.5194/cpd-9-1703-2013,10.5194/cp-9-1749-2013,True,True,1.0
129
+ 10.5194/bg-2017-74,10.5194/bg-14-3883-2017,True,True,1.0
130
+ 10.5194/acp-2016-178,10.5194/acp-16-11617-2016,True,True,0.9722222222222224
131
+ 10.5194/bgd-12-3211-2015,10.5194/bg-12-3225-2015,True,True,1.0
132
+ 10.5194/tc-2016-111,10.5194/tc-10-2317-2016,True,True,1.0
133
+ 10.5194/acpd-10-21931-2010,10.5194/acp-11-5603-2011,True,True,1.0
134
+ 10.1101/2020.02.29.970913,10.3389/fmicb.2020.01037,False,True,1.0
135
+ 10.5194/tc-2017-2,10.5194/tc-11-2265-2017,True,True,0.943502824858757
136
+ 10.21203/rs.3.rs-35627/v2,10.1186/s12889-020-09979-6,False,True,0.9965277777777776
137
+ 10.21203/rs.3.rs-35627/v3,10.1186/s12889-020-09979-6,False,True,1.0
138
+ 10.21203/rs.3.rs-35627/v4,10.1186/s12889-020-09979-6,False,True,1.0
139
+ 10.21203/rs.3.rs-35627/v5,10.1186/s12889-020-09979-6,False,True,0.9965277777777776
140
+ 10.21203/rs.3.rs-35627/v1,10.1186/s12889-020-09979-6,False,True,0.9910873440285204
141
+ 10.5194/acpd-15-27501-2015,10.5194/acp-16-2477-2016,True,True,0.9437328918048964
142
+ 10.5194/acpd-11-11809-2011,10.5194/acp-12-3627-2012,True,False,1.0
143
+ 10.5194/acpd-14-9801-2014,10.5194/acp-14-9917-2014,True,True,0.9572649572649572
144
+ 10.5194/bgd-11-14269-2014,10.5194/bg-12-1131-2015,True,False,0.9866666666666668
145
+ 10.5194/amt-2016-87,10.5194/amt-9-3769-2016,True,True,0.9743589743589745
146
+ 10.5194/acpd-15-34361-2015,10.5194/acp-16-10501-2016,True,True,0.954940867939686
147
+ 10.5194/acpd-13-2125-2013,10.5194/acp-13-11089-2013,True,True,1.0
148
+ 10.5194/acp-2016-806,10.5194/acp-18-2243-2018,True,True,0.9950980392156864
149
+ 10.5194/cp-2017-57,10.5194/cp-13-1539-2017,True,True,1.0
150
+ 10.5194/tc-2016-29,10.5194/tc-10-2241-2016,True,True,0.9985693848354792
151
+ 10.5194/acpd-9-15747-2009,10.5194/acp-9-8857-2009,True,True,1.0
152
+ 10.26434/chemrxiv.7464803.v1,10.1021/acs.langmuir.9b02574,False,True,0.954861111111111
153
+ 10.5194/nhessd-2-4685-2014,10.5194/nhess-15-109-2015,True,True,0.9206349206349206
154
+ 10.31234/osf.io/5v4wt,10.1016/j.beproc.2017.04.017,False,True,
155
+ 10.5194/hessd-12-9003-2015,10.5194/hess-20-605-2016,True,False,0.9679291983488598
156
+ 10.5194/acp-2016-692,10.5194/acp-17-3279-2017,True,True,0.9863013698630138
157
+ 10.5194/acpd-10-7469-2010,10.5194/acp-10-9017-2010,True,True,1.0
158
+ 10.1101/115253,10.1093/cercor/bhx259,False,True,1.0
159
+ 10.5194/acpd-11-25709-2011,10.5194/acp-11-12959-2011,True,True,1.0
160
+ 10.5194/acpd-12-28765-2012,10.5194/acp-13-2857-2013,True,True,1.0
161
+ 10.5194/gmdd-8-5315-2015,10.5194/gmd-9-17-2016,True,True,1.0
162
+ 10.5194/gmd-2016-63,10.5194/gmd-9-3199-2016,True,True,1.0
163
+ 10.5194/acp-2017-319,10.5194/acp-17-13699-2017,True,True,0.8735930735930736
164
+ 10.5194/bgd-10-2415-2013,10.5194/bg-10-5171-2013,True,True,0.9637681159420288
165
+ 10.5194/bgd-10-9315-2013,10.5194/bg-10-7347-2013,True,True,0.9975669099756692
166
+ 10.5194/soild-2-29-2015,10.5194/soil-1-475-2015,True,True,0.9957446808510638
167
+ 10.5194/se-2017-18,10.5194/se-8-955-2017,True,True,1.0
168
+ 10.1101/012195,10.1007/s10827-015-0574-4,False,True,0.988095238095238
169
+ 10.5194/gmdd-6-2491-2013,10.5194/gmd-7-1183-2014,True,True,0.993127147766323
170
+ 10.5194/gmd-2017-293,10.5194/gmd-11-1971-2018,True,True,0.998272884283247
171
+ 10.5194/gmdd-5-1381-2012,10.5194/gmd-6-57-2013,True,True,1.0
172
+ 10.1101/2021.04.28.441869,10.1016/j.ymthe.2022.01.030,False,True,1.0
173
+ 10.1101/568790,10.3389/fimmu.2019.01066,False,True,0.9709639953542392
174
+ 10.1101/560144,10.26508/lsa.201900358,False,True,1.0
175
+ 10.1101/2020.06.22.164814,10.1152/jn.00110.2021,False,True,1.0
176
+ 10.5194/amtd-7-11345-2014,10.5194/amt-8-2491-2015,True,True,1.0
177
+ 10.20944/preprints201803.0185.v1,10.3390/ijms19051364,False,True,0.9803921568627452
178
+ 10.5194/acpd-8-3895-2008,10.5194/acp-8-4655-2008,True,True,1.0
179
+ 10.5194/hess-2016-400,10.5194/hess-20-5049-2016,True,True,1.0
180
+ 10.5194/soild-2-647-2015,10.5194/soil-2-1-2016,True,True,1.0
181
+ 10.5194/nhess-2016-210,10.5194/nhess-16-2347-2016,True,True,0.9388560157790928
182
+ 10.5194/gmdd-6-1085-2013,10.5194/gmd-6-1641-2013,True,True,1.0
183
+ 10.5194/gmd-2016-114,10.5194/gmd-9-3605-2016,True,True,0.933719101675758
184
+ 10.5194/acpd-14-19837-2014,10.5194/acp-15-913-2015,True,True,0.9904761904761904
185
+ 10.5194/amtd-8-10755-2015,10.5194/amt-9-1613-2016,True,True,0.9866666666666668
186
+ 10.5194/acpd-15-931-2015,10.5194/acp-15-6535-2015,True,True,0.9878542510121456
187
+ 10.5194/acpd-14-2277-2014,10.5194/acp-14-6557-2014,True,True,0.998003992015968
188
+ 10.5194/acpd-12-16647-2012,10.5194/acp-12-11795-2012,True,True,1.0
189
+ 10.20944/preprints201805.0070.v1,10.3390/diagnostics8020041,False,True,0.9658260233918128
190
+ 10.20944/preprints202105.0096.v1,10.3390/nu13061875,False,True,0.993992722253592
191
+ 10.5194/acp-2018-37,10.5194/acp-18-6761-2018,True,True,1.0
192
+ 10.5194/amtd-5-2111-2012,10.5194/amt-5-1719-2012,True,True,1.0
193
+ 10.26434/chemrxiv.8220599.v1,10.1021/acs.chemrestox.9b00255,False,True,0.996031746031746
194
+ 10.5194/osd-11-1719-2014,10.5194/os-11-269-2015,True,True,1.0
195
+ 10.5194/osd-11-693-2014,10.5194/os-10-587-2014,True,True,1.0
196
+ 10.5194/cp-2017-49,10.5194/cp-14-157-2018,True,True,0.987468671679198
197
+ 10.5194/acpd-11-4533-2011,10.5194/acp-11-6721-2011,True,False,1.0
198
+ 10.5194/sed-6-2567-2014,10.5194/se-5-1169-2014,True,True,0.9886264216972878
199
+ 10.5194/cpd-8-1523-2012,10.5194/cp-8-1801-2012,True,True,0.9982547993019196
200
+ 10.5194/gmd-2016-37,10.5194/gmd-9-3111-2016,True,True,0.9759036144578314
201
+ 10.5194/soil-2016-63,10.5194/soil-3-67-2017,True,True,0.9228395061728396
202
+ 10.5194/acp-2016-7,10.5194/acp-16-7653-2016,True,True,0.978593272171254
203
+ 10.1101/165357,10.1371/journal.pcbi.1005868,False,True,1.0
204
+ 10.5194/nhess-2016-66,10.5194/nhess-16-2247-2016,True,True,1.0
205
+ 10.2196/preprints.9154,10.2196/resprot.9154,False,True,1.0
206
+ 10.1101/568212,10.1098/rspb.2019.1818,False,True,1.0
207
+ 10.5194/gmdd-7-931-2014,10.5194/gmd-7-2411-2014,True,True,0.9975669099756692
208
+ 10.1101/2021.06.19.449118,10.1158/2767-9764.crc-22-0003,False,True,
209
+ 10.2196/preprints.20509,10.2196/20509,False,True,1.0
210
+ 10.5194/osd-10-691-2013,10.5194/os-9-885-2013,True,True,0.9607843137254902
211
+ 10.20944/preprints202007.0409.v1,10.3390/en13174331,False,True,1.0
212
+ 10.5194/se-2016-55,10.5194/se-7-1085-2016,True,True,0.9526748971193416
213
+ 10.5194/nhessd-1-3891-2013,10.5194/nhess-14-1257-2014,True,True,0.930117899249732
214
+ 10.5194/acpd-13-32291-2013,10.5194/acp-14-7485-2014,True,True,1.0
215
+ 10.5194/acpd-6-3099-2006,10.5194/acp-6-3243-2006,True,True,0.9629629629629628
216
+ 10.26434/chemrxiv-2022-dnl9p,10.1021/acs.inorgchem.2c01171,False,True,1.0
217
+ 10.5194/bgd-10-19311-2013,10.5194/bg-11-4015-2014,True,True,1.0
218
+ 10.5194/bgd-10-7013-2013,10.5194/bg-10-6807-2013,True,True,
219
+ 10.5194/bgd-10-2305-2013,10.5194/bg-10-7263-2013,True,True,1.0
220
+ 10.1101/2022.01.11.475674,10.1016/j.nbd.2022.105879,False,True,1.0
221
+ 10.1101/517243,10.1016/j.neuroimage.2019.116175,False,True,1.0
222
+ 10.20944/preprints201808.0196.v1,10.3390/e20110840,False,True,0.949874686716792
223
+ 10.5194/acpd-14-24573-2014,10.5194/acp-15-6047-2015,True,True,0.9977324263038548
224
+ 10.5194/hessd-9-2717-2012,10.5194/hess-16-3817-2012,True,True,0.8919753086419754
225
+ 10.2196/preprints.42403,10.2196/42403,False,True,1.0
226
+ 10.2196/preprints.15105,10.2196/15105,False,True,0.9085858585858584
227
+ 10.5194/acpd-12-14115-2012,10.5194/acp-12-11037-2012,True,True,1.0
228
+ 10.5194/acpd-12-6593-2012,10.5194/acp-12-10331-2012,True,True,0.9982547993019196
229
+ 10.5194/acpd-12-20007-2012,10.5194/acp-13-3849-2013,True,True,1.0
230
+ 10.2196/preprints.9966,10.2196/jmir.9966,False,True,1.0
231
+ 10.5194/osd-3-939-2006,10.5194/os-3-129-2007,True,True,1.0
232
+ 10.1101/2020.02.16.951954,10.15252/embj.2020104708,False,True,0.9681704260651628
233
+ 10.5194/bgd-8-941-2011,10.5194/bg-8-2523-2011,True,True,0.9915764139590856
234
+ 10.5194/bgd-8-7165-2011,10.5194/bg-8-3609-2011,True,True,1.0
235
+ 10.1101/791319,10.1523/jneurosci.2416-19.2020,False,True,0.990138067061144
236
+ 10.5194/tcd-8-5361-2014,10.5194/tc-9-103-2015,True,True,1.0
237
+ 10.5194/tcd-6-5119-2012,10.5194/tc-7-1139-2013,True,True,0.9551724137931036
238
+ 10.1101/2021.10.23.465582,10.1523/jneurosci.2145-21.2022,False,True,1.0
239
+ 10.5194/tcd-6-2265-2012,10.5194/tc-7-1-2013,True,True,0.989384288747346
240
+ 10.2196/preprints.13662,10.2196/13662,False,True,1.0
241
+ 10.31231/osf.io/3pxzd,10.1037/ccp0000092,False,True,0.9423740510697032
242
+ 10.5194/hessd-7-9173-2010,10.5194/hess-15-1339-2011,True,True,1.0
243
+ 10.5194/bgd-9-14291-2012,10.5194/bg-10-5079-2013,True,True,1.0
244
+ 10.5194/acpd-11-4631-2011,10.5194/acp-11-7629-2011,True,True,1.0
245
+ 10.2196/preprints.17542,10.2196/17542,False,True,1.0
246
+ 10.5194/hessd-7-621-2010,10.5194/hess-14-719-2010,True,True,1.0
247
+ 10.5194/acpd-12-1451-2012,10.5194/acp-12-5755-2012,True,True,0.998015873015873
248
+ 10.5194/gmdd-6-3655-2013,10.5194/gmd-6-2153-2013,True,True,0.9890611279972982
249
+ 10.1101/2021.07.14.452404,10.1111/2041-210x.13821,False,True,0.9930555555555556
250
+ 10.5194/bgd-10-17071-2013,10.5194/bg-11-3131-2014,True,True,0.99457111834962
251
+ 10.1101/600312,10.1111/tpj.14670,False,True,0.9085648148148148
252
+ 10.5194/cpd-7-775-2011,10.5194/cp-7-917-2011,True,True,1.0
253
+ 10.5194/acpd-13-27779-2013,10.5194/acp-14-2383-2014,True,True,0.9473684210526316
254
+ 10.5194/esdd-5-779-2014,10.5194/esd-5-423-2014,True,False,1.0
255
+ 10.21203/rs.2.11587/v2,10.1186/s12884-019-2590-2,False,True,1.0
256
+ 10.21203/rs.2.11587/v1,10.1186/s12884-019-2590-2,False,True,1.0
257
+ 10.21203/rs.2.11587/v4,10.1186/s12884-019-2590-2,False,True,1.0
258
+ 10.21203/rs.2.11587/v3,10.1186/s12884-019-2590-2,False,True,1.0
259
+ 10.21203/rs.2.11587/v5,10.1186/s12884-019-2590-2,False,True,1.0
260
+ 10.2196/preprints.12347,10.2196/12347,False,True,1.0
261
+ 10.5194/se-2016-11,10.5194/se-7-599-2016,True,True,0.9022946859903382
262
+ 10.5194/bg-2015-647,10.5194/bg-13-5511-2016,True,True,1.0
263
+ 10.1101/087577,10.1371/journal.pgen.1006793,False,True,0.912630579297246
264
+ 10.5194/acpd-9-13327-2009,10.5194/acp-9-8651-2009,True,True,1.0
265
+ 10.5194/hessd-6-4307-2009,10.5194/hess-13-2151-2009,True,True,1.0
266
+ 10.5194/acpd-11-4807-2011,10.5194/acp-11-6297-2011,True,True,0.9947916666666666
267
+ 10.5194/acpd-11-8337-2011,10.5194/acp-11-8415-2011,True,True,1.0
268
+ 10.1101/233924,10.1002/ece3.3872,False,True,0.9987129987129988
269
+ 10.21203/rs.3.rs-997649/v1,10.1007/s10637-022-01218-6,False,True,0.9977477477477475
270
+ 10.1101/359018,10.1152/jn.00601.2018,False,True,1.0
271
+ 10.1101/247189,10.1111/evo.13573,False,True,1.0
272
+ 10.5194/bgd-10-14093-2013,10.5194/bg-10-8223-2013,True,True,0.9901960784313726
273
+ 10.20944/preprints202009.0582.v1,10.3390/jmse8100756,False,True,1.0
274
+ 10.1101/370155,10.1371/journal.pone.0201329,False,True,0.9853249475890984
275
+ 10.2196/preprints.18338,10.2196/18338,False,True,1.0
276
+ 10.5194/acpd-11-13867-2011,10.5194/acp-11-10911-2011,True,True,1.0
277
+ 10.21203/rs.3.rs-61529/v2,10.1186/s13049-020-00818-6,False,True,0.98989898989899
278
+ 10.21203/rs.3.rs-61529/v1,10.1186/s13049-020-00818-6,False,True,1.0
279
+ 10.1101/868307,10.1093/cercor/bhaa146,False,True,1.0
280
+ 10.5194/acpd-11-9887-2011,10.5194/acp-11-11867-2011,True,True,0.9621952608794714
281
+ 10.5194/acpd-11-11649-2011,10.5194/acp-11-12751-2011,True,True,1.0
282
+ 10.5194/bgd-12-15495-2015,10.5194/bg-13-3619-2016,True,True,0.903925364758698
283
+ 10.5194/bgd-12-7705-2015,10.5194/bg-12-5277-2015,True,True,1.0
284
+ 10.1101/2020.04.14.041145,10.1158/1078-0432.ccr-20-1762,False,True,0.9487179487179488
285
+ 10.5194/acpd-10-24245-2010,10.5194/acp-11-767-2011,True,True,0.9993238674780256
286
+ 10.31234/osf.io/k4anx,10.1186/s12887-019-1818-7,False,True,1.0
287
+ 10.5194/bgd-10-19005-2013,10.5194/bg-11-2793-2014,True,True,1.0
288
+ 10.20944/preprints202004.0309.v1,10.3390/rs12111748,False,True,1.0
289
+ 10.5194/acpd-9-16715-2009,10.5194/acp-10-1269-2010,True,True,1.0
290
+ 10.5194/hessd-12-12615-2015,10.5194/hess-20-2691-2016,True,True,
291
+ 10.5194/amtd-7-1917-2014,10.5194/amt-7-2097-2014,True,True,1.0
292
+ 10.5194/amtd-6-1771-2013,10.5194/amt-6-1903-2013,True,True,1.0
293
+ 10.5194/hessd-10-2373-2013,10.5194/hess-18-595-2014,True,True,
294
+ 10.5194/acpd-13-2913-2013,10.5194/acp-13-6473-2013,True,True,0.9876543209876544
295
+ 10.5194/bgd-10-17043-2013,10.5194/bg-11-2519-2014,True,True,
296
+ 10.5194/acpd-15-6125-2015,10.5194/acp-15-9003-2015,True,True,1.0
297
+ 10.5194/acpd-9-6397-2009,10.5194/acp-9-5093-2009,True,True,0.9914529914529916
298
+ 10.5194/acpd-10-23657-2010,10.5194/acp-11-1621-2011,True,True,1.0
299
+ 10.5194/bgd-11-10917-2014,10.5194/bg-11-7025-2014,True,True,0.9966329966329966
300
+ 10.5194/acpd-13-18951-2013,10.5194/acp-13-11169-2013,True,True,1.0
301
+ 10.5194/hessd-4-3087-2007,10.5194/hess-12-405-2008,True,True,1.0
302
+ 10.5194/bgd-9-19121-2012,10.5194/bg-10-2315-2013,True,False,0.9976359338061466
303
+ 10.2196/preprints.11364,10.2196/11364,False,True,1.0
304
+ 10.5194/acpd-11-19011-2011,10.5194/acp-12-11085-2012,True,True,
305
+ 10.5194/sed-5-257-2013,10.5194/se-4-255-2013,True,True,1.0
306
+ 10.31234/osf.io/84uqz,10.1111/psyp.14242,False,True,0.9692307692307692
307
+ 10.5194/esurfd-1-745-2013,10.5194/esurf-2-363-2014,True,True,1.0
308
+ 10.21203/rs.3.rs-474980/v1,10.1038/s42003-021-02885-6,False,True,0.998148148148148
309
+ 10.1101/2021.10.15.464543,10.1021/acs.jcim.1c01269,False,True,1.0
310
+ 10.5194/hessd-10-9847-2013,10.5194/hess-17-5213-2013,True,True,1.0
311
+ 10.5194/acpd-11-30757-2011,10.5194/acp-12-4885-2012,True,True,1.0
312
+ 10.5194/tcd-8-3367-2014,10.5194/tc-9-65-2015,True,True,1.0
313
+ 10.5194/acpd-14-19515-2014,10.5194/acp-15-99-2015,True,False,1.0
314
+ 10.5194/bgd-8-5849-2011,10.5194/bg-9-593-2012,True,True,1.0
315
+ 10.5194/bgd-10-17549-2013,10.5194/bg-11-4459-2014,True,True,1.0
316
+ 10.5194/acpd-14-4189-2014,10.5194/acp-14-7075-2014,True,True,1.0
317
+ 10.5194/hessd-9-5531-2012,10.5194/hess-16-3749-2012,True,False,0.9987325728770596
318
+ 10.5194/bgd-9-2153-2012,10.5194/bg-9-2301-2012,True,True,0.9831649831649832
319
+ 10.5194/acpd-9-5809-2009,10.5194/acp-9-6479-2009,True,False,1.0
320
+ 10.5194/cpd-9-1735-2013,10.5194/cp-9-1773-2013,True,True,0.9965277777777776
321
+ 10.5194/osd-7-995-2010,10.5194/os-7-175-2011,True,True,0.9966666666666668
322
+ 10.1101/116426,10.1088/1478-3975/aa6b67,False,True,1.0
323
+ 10.5194/tc-2016-199,10.5194/tc-11-47-2017,True,True,1.0
324
+ 10.5194/tcd-8-4823-2014,10.5194/tc-9-53-2015,True,True,1.0
325
+ 10.5194/cp-2016-131,10.5194/cp-13-1153-2017,True,True,1.0
326
+ 10.26434/chemrxiv.11514189.v1,10.1002/anie.201915493,False,True,0.9611046776853706
327
+ 10.26434/chemrxiv.11514189.v2,10.1002/anie.201915493,False,True,0.9611046776853706
328
+ 10.1101/2020.12.18.423427,10.1038/s41422-021-00495-9,False,True,0.9445194182036288
329
+ 10.5194/cpd-11-3277-2015,10.5194/cp-12-455-2016,True,True,
330
+ 10.21203/rs.2.14334/v2,10.1186/s12909-019-1876-4,False,True,1.0
331
+ 10.21203/rs.2.14334/v3,10.1186/s12909-019-1876-4,False,True,1.0
332
+ 10.21203/rs.2.14334/v1,10.1186/s12909-019-1876-4,False,True,1.0
333
+ 10.1101/2021.04.26.441285,10.1021/acsnano.1c06488,False,True,1.0
334
+ 10.5194/esd-2020-80,10.5194/esd-12-367-2021,True,True,0.9983249581239532
335
+ 10.20944/preprints201810.0612.v1,10.3390/electronics7120347,False,True,0.925925925925926
336
+ 10.1101/209718,10.1038/s41592-018-0002-6,False,True,0.9055876685934487
337
+ 10.5194/cpd-11-3143-2015,10.5194/cp-12-819-2016,True,True,0.9324444444444444
338
+ 10.5194/cpd-9-5837-2013,10.5194/cp-10-759-2014,True,True,0.9959349593495936
339
+ 10.2196/preprints.9498,10.2196/jmir.9498,False,True,1.0
340
+ 10.2196/preprints.9498.a,10.2196/jmir.9498,False,True,1.0
341
+ 10.5194/cp-2018-60,10.5194/cp-15-1063-2019,True,True,0.8722741433021807
342
+ 10.5194/acp-2022-387,10.5194/acp-22-13897-2022,True,True,0.9963099630996308
343
+ 10.5194/acp-2021-870,10.5194/acp-22-12961-2022,True,True,0.9591397849462364
344
+ 10.5194/bg-2019-145,10.5194/bg-16-3377-2019,True,True,1.0
345
+ 10.5194/tc-2018-131,10.5194/tc-13-219-2019,True,True,0.9820193637621024
346
+ 10.5194/gmdd-8-7063-2015,10.5194/gmd-9-1293-2016,True,True,0.91999806765503
347
+ 10.1101/223248,10.1093/jnci/djy081,False,True,0.9974747474747474
348
+ 10.5194/hess-2021-2,10.5194/hess-25-5749-2021,True,True,0.9780786589297228
349
+ 10.5194/hessd-10-14705-2013,10.5194/hess-19-389-2015,True,True,0.9164912280701756
350
+ 10.5194/hessd-9-10563-2012,10.5194/hess-17-817-2013,True,False,1.0
351
+ 10.5194/hessd-11-9183-2014,10.5194/hess-19-1247-2015,True,False,1.0
352
+ 10.1101/542282,10.1093/jxb/erz182,False,True,0.971326164874552
353
+ 10.5194/essd-2022-239,10.5194/essd-15-1675-2023,True,True,0.9861239592969472
354
+ 10.1101/2020.08.04.237156,10.1002/jev2.12079,False,True,0.9973544973544972
355
+ 10.1101/198671,10.1038/s41593-019-0359-6,False,True,1.0
356
+ 10.1101/2020.07.13.200360,10.1371/journal.pone.0236612,False,True,0.9743589743589745
357
+ 10.1101/462861,10.1371/journal.pone.0207555,False,True,0.9920496894409938
358
+ 10.20944/preprints201608.0123.v1,10.3390/s16081290,False,True,0.9979423868312756
359
+ 10.5194/acpd-15-14889-2015,10.5194/acp-15-11165-2015,True,True,1.0
360
+ 10.26434/chemrxiv-2021-70pvw,10.1021/acs.chemmater.1c04167,False,True,1.0
361
+ 10.5194/hessd-8-5319-2011,10.5194/hess-15-2839-2011,True,True,1.0
362
+ 10.5194/hessd-12-1809-2015,10.5194/hess-20-3873-2016,True,True,0.9321789321789322
363
+ 10.5194/hess-2022-60,10.5194/hess-26-6399-2022,True,True,0.9314420803782508
364
+ 10.5194/hessd-2-2427-2005,10.5194/hess-10-535-2006,True,True,1.0
365
+ 10.5194/hessd-11-6881-2014,10.5194/hess-19-1225-2015,True,False,1.0
366
+ 10.5194/hess-2019-461,10.5194/hess-24-3015-2020,True,True,0.9784172661870504
367
+ 10.1101/2021.06.16.448617,10.1084/jem.20211112,False,True,0.96
368
+ 10.1101/2020.02.16.942904,10.3390/cancers12051171,False,True,0.9784172661870504
369
+ 10.5194/gmd-2022-173,10.5194/gmd-16-1617-2023,True,True,0.9973958333333334
370
+ 10.5194/hessd-8-9961-2011,10.5194/hess-16-1445-2012,True,False,1.0
371
+ 10.21034/sr.410,10.1086/666589,False,True,1.0
372
+ 10.5194/acpd-11-32601-2011,10.5194/acp-12-3273-2012,True,True,1.0
373
+ 10.5194/hess-2018-334,10.5194/hess-22-5987-2018,True,True,0.9670781893004116
374
+ 10.1101/2022.07.01.498411,10.7554/elife.81184,True,False,0.8990378213475783
375
+ 10.1101/374660,10.1016/j.celrep.2018.10.079,False,True,0.9166666666666666
376
+ 10.21203/rs.3.rs-87483/v1,10.1186/s40658-020-00350-7,False,True,1.0
377
+ 10.1101/2020.08.27.269647,10.1186/s40478-020-01068-4,False,True,0.9691358024691358
378
+ 10.1101/028886,10.1038/nature17661,False,True,0.992156862745098
379
+ 10.1101/2021.08.11.455980,10.7554/elife.83652,True,False,0.9791666666666666
380
+ 10.1101/596569,10.1016/j.celrep.2019.10.056,False,True,1.0
381
+ 10.5194/mr-2020-13,10.5194/mr-1-209-2020,True,True,0.9714285714285714
382
+ 10.26434/chemrxiv-2021-t1b6t,10.1021/jacs.2c03024,False,True,
383
+ 10.5194/mr-2021-9,10.5194/mr-2-375-2021,True,True,1.0
384
+ 10.5194/mr-2020-5,10.5194/mr-1-59-2020,True,True,1.0
385
+ 10.5194/acpd-15-10899-2015,10.5194/acp-15-8751-2015,True,True,1.0
386
+ 10.1101/529156,10.1371/journal.pgen.1008458,False,True,0.9487922705314008
387
+ 10.1101/2021.09.14.460327,10.1523/eneuro.0373-21.2022,False,True,0.875
388
+ 10.5194/bg-2018-512,10.5194/bg-16-2635-2019,True,True,1.0
389
+ 10.5194/acp-2020-17,10.5194/acp-20-9281-2020,True,True,0.9971014492753624
390
+ 10.1101/2021.11.03.467174,10.7554/elife.75272,True,True,1.0
391
+ 10.26434/chemrxiv.7990910.v2,10.1021/acs.jcim.9b00325,False,True,0.98635477582846
392
+ 10.26434/chemrxiv.7990910.v1,10.1021/acs.jcim.9b00325,False,True,0.98635477582846
393
+ 10.21203/rs.3.rs-677091/v1,10.1038/s41467-021-26199-7,False,True,0.993103448275862
394
+ 10.1101/2020.04.02.022541,10.1021/acschembio.0c00348,False,True,1.0
395
+ 10.5194/wes-2021-156,10.5194/wes-7-2307-2022,True,True,1.0
396
+ 10.1101/050237,10.1371/journal.pone.0170622,False,True,0.9890453834115808
397
+ 10.5194/osd-12-135-2015,10.5194/os-11-629-2015,True,True,1.0
398
+ 10.1101/072470,10.1038/s41586-018-0124-0,False,True,
399
+ 10.31234/osf.io/dbkj6,10.1111/bjso.12399,False,True,0.9777777777777776
400
+ 10.21203/rs.3.rs-136113/v1,10.1186/s13011-021-00358-x,False,True,1.0
401
+ 10.26434/chemrxiv.9756785.v1,10.1021/acsmedchemlett.9b00399,False,True,0.9969135802469136
402
+ 10.1101/2021.04.23.441115,10.7554/elife.69223,True,True,0.9528769841269842
403
+ 10.1101/328211,10.1007/s00339-019-2480-5,False,True,0.9629629629629628
404
+ 10.1101/2022.10.06.511106,10.7554/elife.83761,True,False,1.0
405
+ 10.1101/146852,10.1038/s41589-018-0013-8,False,True,0.9523809523809524
406
+ 10.31234/osf.io/vxa86,10.1037/met0000179,False,True,0.9985569985569984
407
+ 10.21203/rs.2.20459/v1,10.1186/s12870-020-2311-z,False,True,0.9845288326300984
408
+ 10.21203/rs.2.20459/v2,10.1186/s12870-020-2311-z,False,True,1.0
409
+ 10.21203/rs.2.20459/v3,10.1186/s12870-020-2311-z,False,True,1.0
410
+ 10.1101/122044,10.1371/journal.pcbi.1005890,False,True,1.0
411
+ 10.2196/preprints.14675,10.2196/14675,False,True,1.0
412
+ 10.32942/osf.io/s5dnr,10.1111/jeb.13728,False,True,0.942857142857143
413
+ 10.1101/2020.10.28.358846,10.1071/fp21337,False,True,0.93
414
+ 10.1101/2021.06.26.449853,10.1002/glia.24190,False,True,1.0
415
+ 10.1101/2021.04.13.439588,10.7554/elife.69377,True,True,0.9671445639187576
416
+ 10.5194/acp-2020-1041,10.5194/acp-21-9909-2021,True,True,0.9969418960244648
417
+ 10.5194/sed-4-1069-2012,10.5194/se-3-355-2012,True,True,1.0
418
+ 10.5194/hess-2021-41,10.5194/hess-25-4917-2021,True,True,0.9851387437594336
419
+ 10.21203/rs.3.rs-35889/v1,10.1186/s12883-020-01958-z,False,True,0.9662234998203376
420
+ 10.21203/rs.3.rs-35889/v2,10.1186/s12883-020-01958-z,False,True,1.0
421
+ 10.1101/2022.05.10.491316,10.7554/elife.78810,True,False,1.0
422
+ 10.2196/preprints.17997,10.2196/17997,False,True,1.0
423
+ 10.1101/317552,10.1038/s41396-018-0240-8,False,True,
424
+ 10.1101/719922,10.7554/elife.88350,True,False,
425
+ 10.2196/preprints.12957,10.2196/12957,False,True,1.0
426
+ 10.1101/322388,10.1016/j.celrep.2019.05.006,False,True,0.9878183831672204
427
+ 10.1101/431718,10.1016/j.neuroimage.2019.03.019,False,True,1.0
428
+ 10.21203/rs.3.rs-97961/v2,10.1186/s12933-021-01222-9,False,True,1.0
429
+ 10.21203/rs.3.rs-97961/v1,10.1186/s12933-021-01222-9,False,True,1.0
430
+ 10.5194/gmd-2019-295,10.5194/gmd-13-873-2020,True,True,0.988835725677831
431
+ 10.1101/294587,10.1016/j.eclinm.2019.06.003,False,True,0.9437857708706062
432
+ 10.1101/426957,10.1186/s12864-018-5299-0,False,True,1.0
433
+ 10.21203/rs.2.16448/v1,10.1186/s12864-020-6471-x,False,True,1.0
434
+ 10.21203/rs.2.16448/v2,10.1186/s12864-020-6471-x,False,True,1.0
435
+ 10.1101/2021.02.09.430442,10.1523/jneurosci.0556-21.2021,False,True,0.9092592592592592
436
+ 10.5194/essd-2020-16,10.5194/essd-12-1789-2020,True,True,0.9985693848354792
437
+ 10.21203/rs.3.rs-65516/v1,10.1186/s12864-021-07431-6,False,True,1.0
438
+ 10.21203/rs.3.rs-65516/v2,10.1186/s12864-021-07431-6,False,True,1.0
439
+ 10.1101/370874,10.1186/s41073-019-0069-3,False,True,
440
+ 10.20944/preprints201908.0008.v1,10.3390/ma12182960,False,True,0.9965635738831616
441
+ 10.1101/2020.04.24.059840,10.15252/embj.2019104136,False,True,0.9988344988344988
442
+ 10.5194/hessd-5-2791-2008,10.5194/hess-13-467-2009,True,True,1.0
443
+ 10.21034/wp.274,10.2307/1391384,False,True,0.9977324263038548
444
+ 10.21034/sr.498,10.1086/707735,False,True,
445
+ 10.5194/hessd-12-8091-2015,10.5194/hess-20-175-2016,True,True,0.9803921568627452
446
+ 10.1101/2020.03.02.972521,10.1016/j.foreco.2020.118344,False,True,1.0
447
+ 10.5194/acp-2015-1028,10.5194/acp-16-6041-2016,True,True,0.9331369079944484
448
+ 10.20944/preprints202011.0348.v1,10.3390/en14030635,False,True,1.0
449
+ 10.20944/preprints201609.0106.v2,10.3390/mca22010017,False,True,0.9957446808510638
450
+ 10.20944/preprints201609.0106.v1,10.3390/mca22010017,False,True,0.9957446808510638
451
+ 10.21203/rs.3.rs-38976/v2,10.1186/s12960-020-00532-5,False,True,0.9913644214162348
452
+ 10.21203/rs.3.rs-38976/v1,10.1186/s12960-020-00532-5,False,True,0.9913644214162348
453
+ 10.21203/rs.3.rs-38976/v3,10.1186/s12960-020-00532-5,False,True,1.0
454
+ 10.5194/acpd-4-399-2004,10.5194/acp-4-801-2004,True,True,0.9197530864197532
455
+ 10.5194/acp-2021-58,10.5194/acp-21-13483-2021,True,True,1.0
456
+ 10.5194/cpd-7-4173-2011,10.5194/cp-8-855-2012,True,True,1.0
457
+ 10.5194/hess-2022-117,10.5194/hess-26-4953-2022,True,True,1.0
458
+ 10.1101/059329,10.1093/nar/gkw627,False,True,1.0
459
+ 10.1101/054247,10.1016/j.neuron.2016.08.007,False,True,1.0
460
+ 10.5194/bg-2017-53,10.5194/bg-15-13-2018,True,True,1.0
461
+ 10.5194/acpd-2-1735-2002,10.5194/acp-3-303-2003,True,True,1.0
462
+ 10.1101/2021.06.09.447533,10.7554/elife.71569,True,False,0.979381443298969
463
+ 10.26434/chemrxiv.14541432.v1,10.1021/acscentsci.1c00592,False,True,1.0
464
+ 10.5194/esurfd-1-1-2013,10.5194/esurf-1-1-2013,True,True,1.0
465
+ 10.21203/rs.3.rs-995821/v1,10.1007/s10533-022-00915-x,False,True,1.0
466
+ 10.26434/chemrxiv.11985357.v1,10.1021/acschemneuro.0c00479,False,True,0.9789397240377632
467
+ 10.26434/chemrxiv.11985357,10.1021/acschemneuro.0c00479,False,True,0.9984567901234568
468
+ 10.1101/185520,10.1038/npp.2017.250,False,True,1.0
469
+ 10.21203/rs.3.rs-39782/v1,10.1186/s12885-021-07994-3,False,True,
470
+ 10.21203/rs.3.rs-39782/v2,10.1186/s12885-021-07994-3,False,True,1.0
471
+ 10.21034/sr.516,10.1257/aer.20151260,False,True,1.0
472
+ 10.2196/preprints.16665,10.2196/16665,False,True,1.0
473
+ 10.20944/preprints201912.0205.v1,10.3390/ijerph17020616,False,True,1.0
474
+ 10.20944/preprints201912.0205.v2,10.3390/ijerph17020616,False,True,1.0
475
+ 10.1101/465096,10.1371/journal.ppat.1007460,False,True,0.9405399726862064
476
+ 10.1101/031260,10.1103/physrevlett.116.248101,False,True,1.0
477
+ 10.31234/osf.io/p5gns,10.1017/s0033291721001306,False,True,0.9601748959617086
478
+ 10.1101/2020.07.16.207662,10.1002/advs.202001572,False,True,1.0
479
+ 10.1101/135814,10.1093/nar/gkx607,False,True,0.9864208543958768
480
+ 10.1101/2021.05.03.442388,10.1523/jneurosci.0933-21.2021,False,True,1.0
481
+ 10.1101/542381,10.15252/msb.20209880,False,True,0.9792843691148776
482
+ 10.1101/2020.09.28.316653,10.1016/j.molcel.2020.10.031,False,True,1.0
483
+ 10.1101/370775,10.1136/bmjopen-2018-026211,False,True,
484
+ 10.20944/preprints202009.0192.v1,10.3390/cancers12102798,False,True,0.9914529914529916
485
+ 10.1101/430124,10.26508/lsa.201800162,False,True,1.0
486
+ 10.1101/2022.09.05.506603,10.7554/elife.83153,True,False,0.9729729729729728
487
+ 10.5194/acp-2020-543,10.5194/acp-21-3395-2021,True,True,0.963226571767497
488
+ 10.5194/essd-2019-118,10.5194/essd-12-789-2020,True,True,1.0
489
+ 10.1101/2020.05.04.077040,10.1016/j.nicl.2020.102353,False,True,1.0
490
+ 10.5194/acp-2020-909,10.5194/acp-21-8915-2021,True,True,1.0
491
+ 10.1101/2020.05.06.081356,10.1002/bit.27473,False,True,0.9810874704491724
492
+ 10.1101/2020.10.29.355859,10.1038/s41388-021-01876-5,False,True,0.9841269841269842
493
+ 10.5194/acp-2020-674,10.5194/acp-21-2305-2021,True,True,0.9862258953168044
494
+ 10.26434/chemrxiv.9684470.v1,10.1021/acsmacrolett.9b00717,False,True,0.9807852965747702
495
+ 10.5194/acp-2019-580,10.5194/acp-20-753-2020,True,True,0.9407407407407408
496
+ 10.5194/tc-2022-217,10.5194/tc-17-3593-2023,True,True,1.0
497
+ 10.5194/amt-2018-397,10.5194/amt-12-2819-2019,True,True,0.9985569985569984
498
+ 10.5194/acpd-11-17879-2011,10.5194/acp-11-9237-2011,True,True,1.0
499
+ 10.1101/2020.03.13.990887,10.1038/s41594-020-0465-x,False,True,0.9803921568627452
500
+ 10.5194/acpd-14-7141-2014,10.5194/acp-14-10411-2014,True,True,0.948073701842546
501
+ 10.26434/chemrxiv.12743720,10.1021/acssensors.0c02264,False,True,0.9817042606516292
502
+ 10.26434/chemrxiv.12743720.v1,10.1021/acssensors.0c02264,False,True,0.986466165413534
503
+ 10.21203/rs.3.rs-2145653/v1,10.1038/s41388-022-02585-3,False,True,0.914092014536055
504
+ 10.1101/2020.05.20.106575,10.15252/embj.2020106230,False,True,0.9282787454386976
505
+ 10.31234/osf.io/j2bzc,10.1016/j.jad.2022.12.162,False,True,1.0
506
+ 10.2196/preprints.20457,10.2196/20457,False,True,1.0
507
+ 10.5194/acp-2018-761,10.5194/acp-19-233-2019,True,True,0.980213089802131
508
+ 10.5194/amt-2018-258,10.5194/amt-12-955-2019,True,True,0.99860529986053
509
+ 10.20944/preprints202010.0084.v2,10.3390/cancers12113327,False,True,1.0
510
+ 10.20944/preprints202010.0084.v1,10.3390/cancers12113327,False,True,1.0
511
+ 10.21203/rs.3.rs-32295/v1,10.1186/s13287-020-02000-2,False,True,1.0
512
+ 10.21203/rs.3.rs-32295/v2,10.1186/s13287-020-02000-2,False,True,1.0
513
+ 10.1101/760777,10.1007/s13205-020-2084-y,False,True,0.9902370990237098
514
+ 10.1101/591065,10.1371/journal.pgen.1008501,False,True,1.0
515
+ 10.2196/preprints.25469,10.2196/25469,False,True,1.0
516
+ 10.1101/276618,10.1186/s12885-018-4757-z,False,True,1.0
517
+ 10.1101/055863,10.1038/nmeth.4108,False,True,1.0
518
+ 10.1101/2022.02.11.479825,10.1172/jci159402,False,True,0.9957805907172996
519
+ 10.1101/549873,10.1111/oik.07213,False,True,0.9989615784008308
520
+ 10.5194/tc-2019-30,10.5194/tc-13-1709-2019,True,True,1.0
521
+ 10.20944/preprints202103.0467.v1,10.3390/rs13081581,False,True,1.0
522
+ 10.5194/amtd-7-5491-2014,10.5194/amt-8-1701-2015,True,True,0.9948717948717948
523
+ 10.1101/2020.04.28.066605,10.3389/fcell.2020.00617,False,True,1.0
524
+ 10.1101/2021.12.22.473713,10.1002/advs.202200315,False,True,0.9963369963369964
525
+ 10.21203/rs.3.rs-2240657/v1,10.1038/s41467-023-35915-4,False,True,1.0
526
+ 10.5194/acp-2021-182,10.5194/acp-21-9329-2021,True,True,1.0
527
+ 10.1101/416305,10.1021/jacs.8b10840,False,True,1.0
528
+ 10.1101/127761,10.1186/s13059-017-1218-y,False,True,1.0
529
+ 10.5194/bg-2022-101,10.5194/bg-19-4655-2022,True,True,0.996376811594203
530
+ 10.1101/2020.01.13.905471,10.1523/jneurosci.2809-19.2020,False,True,0.9659090909090908
531
+ 10.1101/857987,10.1523/jneurosci.1468-19.2020,False,True,0.89788748538998
532
+ 10.21203/rs.3.rs-38299/v1,10.1186/s13018-020-02039-0,False,True,
533
+ 10.21203/rs.3.rs-38299/v2,10.1186/s13018-020-02039-0,False,True,1.0
534
+ 10.20944/preprints202007.0501.v1,10.3390/en13174422,False,True,1.0
535
+ 10.21203/rs.3.rs-1523403/v1,10.1038/s41591-022-02202-6,False,True,0.9626833586851126
536
+ 10.26434/chemrxiv.9994940.v1,10.1021/acs.jpclett.0c00121,False,True,0.992248062015504
537
+ 10.21203/rs.3.rs-32573/v1,10.1186/s12985-020-01417-8,False,True,1.0
538
+ 10.21203/rs.3.rs-32573/v2,10.1186/s12985-020-01417-8,False,True,1.0
539
+ 10.21203/rs.3.rs-17623/v1,10.1007/s40145-020-0410-9,False,True,0.9696969696969696
540
+ 10.21203/rs.3.rs-17623/v2,10.1007/s40145-020-0410-9,False,True,0.9696969696969696
541
+ 10.5194/tcd-9-2597-2015,10.5194/tc-9-2201-2015,True,True,1.0
542
+ 10.5194/hessd-10-15771-2013,10.5194/hess-18-2287-2014,True,True,0.9458128078817736
543
+ 10.21203/rs.2.16987/v2,10.1186/s12881-020-01156-1,False,True,0.9550997150997153
544
+ 10.21203/rs.2.16987/v1,10.1186/s12881-020-01156-1,False,True,0.9550997150997153
545
+ 10.21203/rs.2.16987/v4,10.1186/s12881-020-01156-1,False,True,0.9550997150997153
546
+ 10.21203/rs.2.16987/v3,10.1186/s12881-020-01156-1,False,True,0.9743589743589745
547
+ 10.5194/acp-2021-173,10.5194/acp-21-15023-2021,True,True,0.9869061137513844
548
+ 10.20944/preprints201808.0402.v1,10.3390/s18113670,False,True,1.0
549
+ 10.26434/chemrxiv.13055873.v2,10.1021/acsomega.0c04691,False,True,1.0
550
+ 10.26434/chemrxiv.13055873,10.1021/acsomega.0c04691,False,True,1.0
551
+ 10.26434/chemrxiv.13055873.v1,10.1021/acsomega.0c04691,False,True,1.0
552
+ 10.5194/acpd-11-163-2011,10.5194/acp-11-9683-2011,True,True,0.994017094017094
553
+ 10.5194/acpd-13-2795-2013,10.5194/acp-13-8607-2013,True,False,1.0
554
+ 10.2196/preprints.16513,10.2196/16513,False,True,1.0
555
+ 10.5194/bgd-6-11035-2009,10.5194/bg-7-1443-2010,True,True,0.9955357142857144
556
+ 10.1101/2021.03.16.435577,10.1002/glia.24106,False,True,0.9912609238451936
557
+ 10.21203/rs.3.rs-65568/v2,10.1186/s13643-021-01612-w,False,True,1.0
558
+ 10.21203/rs.3.rs-65568/v1,10.1186/s13643-021-01612-w,False,True,1.0
559
+ 10.21034/wp.742,10.1257/mac.20170367,False,True,1.0
560
+ 10.26434/chemrxiv-2021-k4v9r,10.1021/jacs.1c09321,False,True,1.0
561
+ 10.1101/2021.06.10.447962,10.1002/smll.202103552,False,True,0.971118761485915
562
+ 10.21203/rs.2.13144/v4,10.1186/s12879-019-4618-7,False,True,1.0
563
+ 10.21203/rs.2.13144/v3,10.1186/s12879-019-4618-7,False,True,1.0
564
+ 10.21203/rs.2.13144/v2,10.1186/s12879-019-4618-7,False,True,1.0
565
+ 10.21203/rs.2.13144/v1,10.1186/s12879-019-4618-7,False,True,1.0
566
+ 10.26434/chemrxiv.12369758.v1,10.1021/acschembio.0c00426,False,True,0.903858024691358
567
+ 10.26434/chemrxiv.12369758,10.1021/acschembio.0c00426,False,True,0.903858024691358
568
+ 10.5194/bgd-5-3157-2008,10.5194/bg-6-405-2009,True,True,1.0
569
+ 10.1101/2020.05.03.20089383,10.1016/j.bbi.2020.08.021,False,True,1.0
570
+ 10.21203/rs.3.rs-1293101/v1,10.1038/s41562-023-01540-w,False,True,0.8857142857142857
571
+ 10.5194/gmd-2018-20,10.5194/gmd-11-2813-2018,True,True,1.0
572
+ 10.1101/2020.08.12.248005,10.7554/elife.57436,True,True,0.9803921568627452
573
+ 10.1101/173146,10.1099/mgen.0.000166,False,True,1.0
574
+ 10.2196/preprints.23254,10.2196/23254,False,True,1.0
575
+ 10.5194/gmd-2017-263,10.5194/gmd-11-3187-2018,True,True,1.0
576
+ 10.2196/preprints.24006,10.2196/24006,False,True,1.0
577
+ 10.1101/2021.02.08.21251234,10.1093/ajcn/nqab276,False,True,1.0
578
+ 10.5194/acp-2016-770,10.5194/acp-17-7067-2017,True,True,1.0
579
+ 10.5194/cpd-11-3187-2015,10.5194/cp-12-91-2016,True,True,1.0
580
+ 10.2196/preprints.14369,10.2196/14369,False,True,1.0
581
+ 10.5194/mr-2023-2,10.5194/mr-4-153-2023,True,True,0.9080459770114944
582
+ 10.2196/preprints.19018,10.2196/19018,False,True,1.0
583
+ 10.5194/gmd-2017-206,10.5194/gmd-11-2975-2018,True,True,0.9471620227038184
584
+ 10.1101/481507,10.1096/fj.201902811rr,False,True,0.9894179894179894
585
+ 10.21203/rs.3.rs-57499/v1,10.1186/s10020-020-00230-x,False,True,0.9711286089238844
586
+ 10.21203/rs.3.rs-57499/v2,10.1186/s10020-020-00230-x,False,True,1.0
587
+ 10.1101/151522,10.1016/j.jneumeth.2017.08.033,False,True,1.0
588
+ 10.5194/acpd-7-6767-2007,10.5194/acp-7-4553-2007,True,True,1.0
589
+ 10.26434/chemrxiv.7322330.v1,10.1021/jacs.8b13127,False,True,0.92018779342723
590
+ 10.1101/2020.02.19.955609,10.1177/2331216520964068,False,True,1.0
591
+ 10.5194/amt-2018-209,10.5194/amt-12-977-2019,True,True,1.0
592
+ 10.1101/2020.03.09.20033423,10.1002/jia2.25546,False,True,0.9957624290957624
593
+ 10.1101/342592,10.1186/s12864-018-5032-z,False,True,0.985685071574642
594
+ 10.31234/osf.io/qp4ev,10.1016/j.beth.2019.09.005,False,True,1.0
595
+ 10.21203/rs.2.12994/v1,10.1186/s12879-020-4873-7,False,True,0.9984567901234568
596
+ 10.21203/rs.2.12994/v2,10.1186/s12879-020-4873-7,False,True,0.9984567901234568
597
+ 10.21203/rs.2.12994/v3,10.1186/s12879-020-4873-7,False,True,0.9984567901234568
598
+ 10.21203/rs.2.12994/v4,10.1186/s12879-020-4873-7,False,True,0.9984567901234568
599
+ 10.20944/preprints202011.0543.v1,10.3390/pathogens9121037,False,True,0.9727626459143968
600
+ 10.21203/rs.3.rs-76084/v1,10.1186/s12960-021-00558-3,False,True,1.0
601
+ 10.1101/776237,10.7554/elife.70469,True,True,1.0
602
+ 10.1101/2021.10.03.462935,10.7554/elife.74183,True,True,0.9227481919789612
603
+ 10.5194/tc-2016-250,10.5194/tc-11-949-2017,True,True,1.0
604
+ 10.1101/192245,10.1162/jocn_a_01200,False,True,1.0
605
+ 10.2196/preprints.15960,10.2196/15960,False,True,1.0
606
+ 10.2196/preprints.22795,10.2196/22795,False,True,1.0
607
+ 10.5194/hessd-8-8291-2011,10.5194/hess-16-167-2012,True,True,1.0
608
+ 10.5194/gmd-2016-87,10.5194/gmd-9-3655-2016,True,True,0.9993412384716732
609
+ 10.1101/2020.02.17.952895,10.3389/fcimb.2020.00405,False,True,
610
+ 10.1101/2021.02.15.21251449,10.1093/ajcn/nqab279,False,True,1.0
611
+ 10.5194/essd-2020-280,10.5194/essd-13-2995-2021,True,True,0.98989898989899
612
+ 10.21203/rs.3.rs-23615/v3,10.1186/s12985-020-01451-6,False,True,1.0
613
+ 10.21203/rs.3.rs-23615/v2,10.1186/s12985-020-01451-6,False,True,1.0
614
+ 10.21203/rs.3.rs-23615/v1,10.1186/s12985-020-01451-6,False,True,1.0
615
+ 10.5194/gmd-2018-123,10.5194/gmd-11-4843-2018,True,True,0.9679545950665794
616
+ 10.2196/preprints.43101,10.2196/43101,False,True,1.0
617
+ 10.5194/angeo-2019-65,10.5194/angeo-37-689-2019,True,True,1.0
618
+ 10.5194/os-2020-66,10.5194/os-17-59-2021,True,True,1.0
619
+ 10.5194/acp-2022-410,10.5194/acp-23-1963-2023,True,True,0.9946524064171124
620
+ 10.21203/rs.3.rs-42553/v2,10.1186/s13756-020-00864-w,False,True,1.0
621
+ 10.21203/rs.3.rs-42553/v1,10.1186/s13756-020-00864-w,False,True,0.9852941176470588
622
+ 10.20944/preprints201905.0040.v1,10.3390/ijms20112780,False,True,1.0
623
+ 10.1101/2020.03.17.20037515,10.1093/cid/ciaa443,False,True,0.9732868757259
624
+ 10.20944/preprints202002.0288.v1,10.3390/brainsci10030143,False,True,0.9984848484848484
625
+ 10.20944/preprints202007.0130.v1,10.3390/biomedicines8080275,False,True,1.0
626
+ 10.1101/128645,10.1002/hbm.23843,False,True,0.987962962962963
627
+ 10.1101/419994,10.1111/jfb.13989,False,True,
628
+ 10.1101/2020.03.20.000000,10.1096/fj.202001281rr,False,True,0.9926091269841272
629
+ 10.26434/chemrxiv-2022-c1ctc-v2,10.1039/d2sc05997e,False,True,1.0
630
+ 10.2196/preprints.12664,10.2196/12664,False,True,1.0
631
+ 10.5194/bg-2018-477,10.5194/bg-16-2147-2019,True,True,1.0
632
+ 10.21034/wp.75,10.2307/1991332,False,True,1.0
633
+ 10.1101/2021.12.13.472383,10.7554/elife.78092,True,True,1.0
634
+ 10.1101/105874,10.1016/j.neuroimage.2017.04.063,False,True,1.0
635
+ 10.1101/2022.10.25.513707,10.7554/elife.83908,True,False,0.9799631120053656
636
+ 10.26434/chemrxiv.12587537,10.1021/acs.jctc.0c00715,False,True,1.0
637
+ 10.26434/chemrxiv.12587537.v1,10.1021/acs.jctc.0c00715,False,True,1.0
638
+ 10.1101/378497,10.1093/nar/gkz169,False,True,1.0
639
+ 10.31223/osf.io/3mjc2,10.1016/j.precamres.2020.105849,False,True,0.9765684051398336
640
+ 10.1101/439687,10.1002/2211-5463.12744,False,True,0.987987987987988
641
+ 10.26434/chemrxiv-2023-6tgkh,10.1021/acs.jcim.3c00732,False,True,1.0
642
+ 10.5194/gmd-2020-179,10.5194/gmd-13-6077-2020,True,True,0.9718076285240466
643
+ 10.2196/preprints.22564,10.2196/22564,False,True,1.0
644
+ 10.1101/617019,10.3390/genes10060468,False,True,0.8909691867586605
645
+ 10.1101/068346,10.1371/journal.pcbi.1005260,False,True,0.8888888888888888
646
+ 10.1101/2020.08.15.252494,10.1161/atvbaha.120.315556,False,True,0.915073340051506
647
+ 10.1101/612010,10.3390/su11102787,False,True,0.9824561403508772
648
+ 10.1101/2020.03.28.013672,10.1016/j.devcel.2020.05.012,False,True,0.9743589743589745
649
+ 10.1101/122945,10.1162/netn_a_00031,False,True,1.0
650
+ 10.1101/2020.12.16.423042,10.1007/s10334-022-01033-3,False,True,1.0
651
+ 10.1101/2021.07.20.453033,10.7554/elife.73348,True,True,1.0
652
+ 10.1101/2021.03.22.21254119,10.1021/acs.jproteome.1c00326,False,True,1.0
653
+ 10.1101/208223,10.1038/s41380-018-0023-7,False,True,1.0
654
+ 10.1101/354829,10.1002/hbm.24788,False,True,0.9956140350877192
655
+ 10.1101/501221,10.1523/jneurosci.0601-21.2021,False,True,1.0
656
+ 10.31219/osf.io/y6mkh,10.1111/nous.12265,False,True,
657
+ 10.5194/bgd-11-7615-2014,10.5194/bg-11-6323-2014,True,True,1.0
658
+ 10.1101/641159,10.1038/s41593-021-00821-9,False,True,1.0
659
+ 10.5194/hessd-11-1343-2014,10.5194/hess-18-3259-2014,True,True,0.9955555555555556
660
+ 10.26434/chemrxiv-2021-rg4wj-v2,10.1016/j.jcis.2022.07.164,False,True,0.983974358974359
661
+ 10.21203/rs.3.rs-515297/v1,10.1016/j.bbamem.2021.183794,False,True,0.9827586206896552
662
+ 10.1101/2020.05.06.081562,10.1021/acs.jpcb.0c04139,False,True,1.0
663
+ 10.1101/2020.02.17.952457,10.1371/journal.pbio.3000687,False,True,0.9978213507625272
664
+ 10.21203/rs.3.rs-31943/v4,10.1186/s12876-020-01553-z,False,True,1.0
665
+ 10.21203/rs.3.rs-31943/v3,10.1186/s12876-020-01553-z,False,True,1.0
666
+ 10.21203/rs.3.rs-31943/v2,10.1186/s12876-020-01553-z,False,True,1.0
667
+ 10.21203/rs.3.rs-31943/v1,10.1186/s12876-020-01553-z,False,True,0.9862258953168044
668
+ 10.1101/067876,10.15252/embj.201696038,False,True,0.8719135802469135
669
+ 10.1101/2020.06.24.169334,10.1021/acscentsci.1c01293,False,True,0.9753872555660932
670
+ 10.1101/138834,10.3389/fncel.2017.00214,False,True,1.0
671
+ 10.21203/rs.3.rs-70874/v1,10.1016/j.wasman.2022.01.022,False,True,0.9464007899260638
672
+ 10.5194/essd-2017-134,10.5194/essd-10-985-2018,True,True,1.0
673
+ 10.5194/hess-2021-68,10.5194/hess-25-6495-2021,True,True,0.9129097148266476
674
+ 10.20944/preprints201612.0138.v1,10.3390/ma10030297,False,True,1.0
675
+ 10.1101/505032,10.1038/s41592-020-01023-0,False,True,0.996078431372549
676
+ 10.1101/671230,10.1038/s41556-020-0485-0,False,True,1.0
677
+ 10.21203/rs.3.rs-66113/v1,10.1186/s13287-020-02056-0,False,True,0.9678362573099416
678
+ 10.21203/rs.3.rs-66113/v2,10.1186/s13287-020-02056-0,False,True,1.0
679
+ 10.21203/rs.3.rs-66113/v3,10.1186/s13287-020-02056-0,False,True,1.0
680
+ 10.1101/038117,10.1038/nmeth.3991,False,True,0.9095238095238096
681
+ 10.20944/preprints201808.0322.v1,10.3390/molecules23102549,False,True,1.0
682
+ 10.1101/2020.07.30.228924,10.1038/s41592-021-01136-0,False,True,1.0
683
+ 10.21203/rs.3.rs-904665/v1,10.1038/s41556-022-00953-5,False,True,0.9195027195027196
684
+ 10.2196/preprints.22488,10.2196/22488,False,True,1.0
685
+ 10.1101/2022.03.04.483005,10.7554/elife.78385,True,False,0.9462465245597776
686
+ 10.1101/848846,10.1182/blood.2020004801,False,True,0.9846216768916156
687
+ 10.2196/preprints.10755,10.2196/10755,False,True,1.0
688
+ 10.1101/057976,10.1093/bioinformatics/btw390,False,True,0.9118723052546582
689
+ 10.5194/tc-2018-175,10.5194/tc-13-895-2019,True,True,0.9646464646464646
690
+ 10.5194/esurf-2020-59,10.5194/esurf-9-1153-2021,True,True,0.9936507936507936
691
+ 10.5194/cp-2017-151,10.5194/cp-14-1079-2018,True,True,1.0
692
+ 10.1101/2020.04.07.029140,10.1371/journal.ppat.1008530,False,True,1.0
693
+ 10.2196/preprints.26309,10.2196/26309,False,True,1.0
694
+ 10.1101/2021.10.14.464354,10.7554/elife.74565,True,True,1.0
695
+ 10.1101/860874,10.1088/1741-2552/ab9dba,False,True,1.0
696
+ 10.1101/207076,10.1167/18.6.10,False,True,1.0
697
+ 10.1101/376863,10.1152/jn.00680.2018,False,True,0.9885129490392648
698
+ 10.5194/bg-2019-482,10.5194/bg-17-4247-2020,True,True,0.9792843691148776
699
+ 10.2196/preprints.13802,10.2196/13802,False,True,1.0
700
+ 10.2196/preprints.17740,10.2196/17740,False,True,1.0
701
+ 10.2196/preprints.11334,10.2196/11334,False,True,1.0
702
+ 10.31235/osf.io/hfr96,10.1038/nclimate3271,False,True,0.9826839826839828
703
+ 10.1101/2021.04.30.442171,10.1523/jneurosci.1575-21.2022,False,True,1.0
704
+ 10.1101/228668,10.1099/mgen.0.000165,False,True,0.9875222816399286
705
+ 10.5194/nhess-2017-152,10.5194/nhess-17-2199-2017,True,True,0.9588652482269504
706
+ 10.5194/amt-2019-282,10.5194/amt-13-323-2020,True,True,0.9693251533742332
707
+ 10.5194/acpd-12-24847-2012,10.5194/acp-13-3345-2013,True,True,1.0
708
+ 10.5194/hessd-9-9809-2012,10.5194/hess-17-3127-2013,True,True,0.9061032863849764
709
+ 10.5194/bgd-11-14699-2014,10.5194/bg-12-863-2015,True,True,0.9968253968253968
710
+ 10.2196/preprints.24851,10.2196/24851,False,True,1.0
711
+ 10.1101/263939,10.1038/nbt.4266,False,True,
712
+ 10.1101/2020.05.05.078196,10.1002/brb3.1786,False,True,0.9953703703703703
713
+ 10.1101/191809,10.1002/pld3.47,False,True,0.96640826873385
714
+ 10.26434/chemrxiv.10003412.v1,10.1021/acs.jctc.9b01066,False,True,1.0
715
+ 10.20944/preprints201706.0002.v1,10.3390/e19060286,False,True,1.0
716
+ 10.31235/osf.io/fw4er,10.1093/aje/kwy218,False,True,1.0
717
+ 10.1101/191494,10.1111/ejn.13816,False,True,1.0
718
+ 10.5194/acp-2021-784,10.5194/acp-22-1951-2022,True,True,0.9819277108433736
719
+ 10.1101/2020.06.29.20143180,10.1371/journal.pone.0242758,False,True,1.0
720
+ 10.2196/preprints.25456,10.2196/25456,False,True,1.0
721
+ 10.5194/cpd-9-3239-2013,10.5194/cp-10-487-2014,True,True,0.9954954954954954
722
+ 10.1002/essoar.10510350.1,10.1029/2022gl098158,False,True,0.9988505747126436
723
+ 10.1101/844712,10.1371/journal.pone.0237189,False,True,1.0
724
+ 10.5194/se-2017-35,10.5194/se-8-789-2017,True,True,0.9942726231386024
725
+ 10.1101/111070,10.1037/xlm0000518,False,True,0.9186480453521352
726
+ 10.5194/se-2021-6,10.5194/se-12-2523-2021,True,True,0.9875311720698252
727
+ 10.1101/158113,10.1371/journal.pone.0214311,False,True,1.0
728
+ 10.1101/2021.05.06.21256789,10.1213/ane.0000000000005730,False,True,0.9975490196078431
729
+ 10.1101/867168,10.1016/j.jneumeth.2020.108756,False,True,0.875
730
+ 10.1101/661207,10.3389/fmicb.2019.02558,False,True,1.0
731
+ 10.1101/391243,10.1534/g3.118.200662,False,True,0.9444444444444444
732
+ 10.2196/preprints.44548,10.2196/44548,False,True,1.0
733
+ 10.1101/101535,10.1098/rsos.171308,False,True,0.9969418960244648
734
+ 10.5194/bg-2018-430,10.5194/bg-16-1225-2019,True,True,1.0
735
+ 10.1101/534206,10.1093/nar/gkz306,False,True,0.9427618157089428
736
+ 10.2196/preprints.18662,10.2196/18662,False,True,1.0
737
+ 10.5194/nhess-2021-31,10.5194/nhess-21-1759-2021,True,True,1.0
738
+ 10.21203/rs.3.rs-60829/v1,10.1186/s13046-020-01796-4,False,True,0.9891156462585036
739
+ 10.21203/rs.3.rs-60829/v2,10.1186/s13046-020-01796-4,False,True,0.9986772486772488
740
+ 10.1101/2020.06.12.20127944,10.1001/jama.2020.15580,False,True,0.978494623655914
741
+ 10.20944/preprints201801.0107.v1,10.3390/nu10020238,False,True,1.0
742
+ 10.5194/hess-2016-351,10.5194/hess-21-1741-2017,True,True,0.9983579638752051
743
+ 10.5194/essd-2020-303,10.5194/essd-13-3337-2021,True,True,1.0
744
+ 10.1101/2021.10.07.463355,10.1021/acssensors.1c02201,False,True,1.0
745
+ 10.5194/hess-2021-506,10.5194/hess-26-2899-2022,True,True,0.912
746
+ 10.1101/219113,10.1016/j.sbi.2018.01.009,False,True,1.0
747
+ 10.1101/632810,10.3390/cancers12061568,False,True,0.9866666666666668
748
+ 10.2196/preprints.41446,10.2196/41446,False,True,1.0
749
+ 10.21203/rs.3.rs-2209582/v1,10.1007/s13146-023-00880-y,False,True,1.0
750
+ 10.21203/rs.3.rs-2440941/v1,10.1007/s13146-023-00882-w,False,True,1.0
751
+ 10.21203/rs.3.rs-2597108/v1,10.1007/s10238-023-01049-6,False,True,0.9743589743589745
752
+ 10.1101/2020.05.27.119438,10.1371/journal.pcbi.1008625,False,True,0.9843400447427294
753
+ 10.20944/preprints201902.0019.v1,10.3390/a12030060,False,True,1.0
754
+ 10.1101/816694,10.1182/bloodadvances.2019001393,False,True,0.9716981132075472
755
+ 10.5194/acp-2016-430,10.5194/acp-17-11041-2017,True,True,0.9962546816479402
756
+ 10.31234/osf.io/hv28a,10.1037/pspa0000098,False,True,1.0
757
+ 10.20944/preprints201612.0042.v1,10.3390/ijms18020347,False,True,0.9297052154195012
758
+ 10.1101/2019.12.15.876847,10.1093/sleep/zsaa111,False,True,1.0
759
+ 10.5194/acpd-14-25533-2014,10.5194/acp-15-4179-2015,True,True,1.0
760
+ 10.5194/acp-2016-308,10.5194/acp-16-12397-2016,True,True,1.0
761
+ 10.1101/235176,10.1038/s41592-018-0171-3,False,True,0.9691282491742363
762
+ 10.21034/wp.730,10.1257/aer.20121524,False,True,1.0
763
+ 10.1101/636803,10.1002/ece3.6313,False,True,0.9895833333333334
764
+ 10.5194/cpd-5-1367-2009,10.5194/cp-5-585-2009,True,True,0.989010989010989
765
+ 10.36227/techrxiv.21758660,10.1109/tim.2023.3256468,False,True,1.0
766
+ 10.36227/techrxiv.21758660.v1,10.1109/tim.2023.3256468,False,True,1.0
767
+ 10.31234/osf.io/y27vc,10.1080/13548506.2017.1385818,False,True,1.0
768
+ 10.5194/cp-2017-26,10.5194/cp-13-1007-2017,True,True,1.0
769
+ 10.5194/acpd-15-12007-2015,10.5194/acp-15-11861-2015,True,True,1.0
770
+ 10.5194/wes-2018-49,10.5194/wes-3-845-2018,True,True,1.0
771
+ 10.31219/osf.io/cv2bn,10.3758/s13428-018-1035-6,False,True,1.0
772
+ 10.1101/803346,10.1016/j.ajhg.2020.06.010,False,True,0.9377207977207976
773
+ 10.36227/techrxiv.21674759.v1,10.1109/ojcoms.2023.3282814,False,True,1.0
774
+ 10.36227/techrxiv.21674759,10.1109/ojcoms.2023.3282814,False,True,1.0
775
+ 10.31219/osf.io/bwm4k,10.3390/ma14051106,False,True,1.0
776
+ 10.1101/084418,10.15252/msb.20188497,False,True,1.0
777
+ 10.5194/egusphere-2022-180,10.5194/se-13-1755-2022,True,True,0.9826224328593997
778
+ 10.1101/537001,10.1016/j.cell.2019.07.038,False,True,
779
+ 10.21203/rs.3.rs-136528/v1,10.1186/s13287-021-02223-x,False,True,0.9743589743589745
780
+ 10.1101/640557,10.1111/oik.06957,False,True,0.9883190883190884
781
+ 10.21034/sr.361,10.1257/mac.1.1.146,False,True,
782
+ 10.26434/chemrxiv.8289812.v1,10.1021/acsnano.9b06019,False,True,0.9314194577352471
783
+ 10.5194/amtd-2-489-2009,10.5194/amt-2-379-2009,True,True,1.0
784
+ 10.1101/2020.08.27.267880,10.1186/s13229-022-00511-8,False,True,0.9561904761904764
785
+ 10.5194/acp-2019-1026,10.5194/acp-20-8727-2020,True,True,0.983606557377049
786
+ 10.5194/se-2019-49,10.5194/se-10-987-2019,True,True,1.0
787
+ 10.5194/acp-2020-263,10.5194/acp-21-1697-2021,True,True,0.9936073059360732
788
+ 10.21034/sr.186,10.1007/bf01213946,False,True,0.974910394265233
789
+ 10.5194/acpd-8-8009-2008,10.5194/acp-8-6169-2008,True,True,1.0
790
+ 10.5194/acpd-7-10799-2007,10.5194/acp-8-901-2008,True,True,1.0
791
+ 10.5194/acpd-11-8665-2011,10.5194/acp-11-6207-2011,True,True,0.9209742194584792
792
+ 10.5194/acpd-6-9003-2006,10.5194/acp-7-685-2007,True,True,1.0
793
+ 10.5194/acpd-14-19791-2014,10.5194/acp-15-253-2015,True,True,1.0
794
+ 10.5194/acpd-4-4545-2004,10.5194/acp-4-2227-2004,True,True,1.0
795
+ 10.1101/863621,10.21105/joss.01994,False,True,0.927811176648518
796
+ 10.5194/acpd-4-2569-2004,10.5194/acp-4-1895-2004,True,True,1.0
797
+ 10.2196/preprints.9633,10.2196/resprot.9633,False,True,1.0
798
+ 10.2196/preprints.19601,10.2196/19601,False,True,1.0
799
+ 10.5194/bg-2019-237,10.5194/bg-17-215-2020,True,True,0.9548387096774192
800
+ 10.20944/preprints202103.0379.v1,10.3390/genes12040544,False,True,1.0
801
+ 10.21203/rs.3.rs-72276/v1,10.1186/s13643-021-01652-2,False,True,0.978593272171254
802
+ 10.5194/acpd-13-20677-2013,10.5194/acp-14-1423-2014,True,True,1.0
803
+ 10.5194/acpd-10-10219-2010,10.5194/acp-10-7169-2010,True,True,1.0
804
+ 10.1101/2021.09.06.21263001,10.1111/nmo.14331,False,True,0.942927545452176
805
+ 10.1101/230938,10.1016/j.yjmcc.2018.06.007,False,True,0.9944444444444444
806
+ 10.5194/gchron-2020-11,10.5194/gchron-3-181-2021,True,True,1.0
807
+ 10.5194/hessd-8-4459-2011,10.5194/hess-15-2581-2011,True,True,1.0
808
+ 10.1101/2020.04.05.026005,10.1016/j.molliq.2020.113612,False,True,0.9987029831387808
809
+ 10.1101/2022.12.07.519455,10.7554/elife.85069,True,False,
810
+ 10.20944/preprints202003.0433.v1,10.1016/j.micpath.2020.104236,False,True,1.0
811
+ 10.5194/acp-2016-332,10.5194/acp-16-13185-2016,True,True,1.0
812
+ 10.5194/amt-2020-257,10.5194/amt-14-945-2021,True,True,0.9913644214162348
813
+ 10.20944/preprints202010.0453.v1,10.3390/ani10122196,False,True,0.9775910364145658
814
+ 10.20944/preprints202010.0453.v2,10.3390/ani10122196,False,True,0.9716981132075472
815
+ 10.21034/wp.741,10.1257/aer.20181499,False,True,1.0
816
+ 10.1101/2020.04.25.20079996,10.3389/fpsyg.2020.551004,False,True,0.9986504723346828
817
+ 10.1101/443127,10.1007/s00415-019-09340-x,False,True,1.0
818
+ 10.5194/essd-2022-16,10.5194/essd-14-3743-2022,True,True,1.0
819
+ 10.5194/soil-2017-28,10.5194/soil-4-37-2018,True,True,0.9715242881072026
820
+ 10.5194/hess-2016-323,10.5194/hess-21-1149-2017,True,True,0.9643605870020964
821
+ 10.1101/511683,10.1186/s40168-019-0665-y,False,True,0.9893444246385422
822
+ 10.1101/088666,10.7717/peerj.3889,False,True,
823
+ 10.1101/2021.03.28.21254404,10.1021/acs.estlett.1c00375,False,True,0.959078814570144
824
+ 10.5194/gmd-2017-103,10.5194/gmd-11-257-2018,True,True,0.9954415954415956
825
+ 10.5194/acpd-8-21229-2008,10.5194/acp-9-5905-2009,True,True,0.9964912280701754
826
+ 10.5194/acpd-4-3699-2004,10.5194/acp-4-2337-2004,True,True,0.9767441860465116
827
+ 10.20944/preprints202010.0447.v1,10.3390/cancers12123524,False,True,1.0
828
+ 10.5194/bgd-10-19509-2013,10.5194/bg-11-2069-2014,True,True,0.9797979797979798
829
+ 10.5194/acpd-13-18345-2013,10.5194/acp-13-12271-2013,True,True,0.9987515605493132
830
+ 10.1101/296061,10.1016/j.dcn.2018.09.003,False,True,1.0
831
+ 10.21034/sr.249,10.1080/07474939908800428,False,True,0.988155668358714
832
+ 10.21203/rs.3.rs-93388/v1,10.1186/s12872-020-01827-0,False,True,1.0
833
+ 10.2196/preprints.8954,10.2196/jmir.8954,False,True,1.0
834
+ 10.5194/bgd-7-3335-2010,10.5194/bg-7-2613-2010,True,True,0.9688888888888888
835
+ 10.1002/essoar.10506462.1,10.1029/2021jc017734,False,True,0.9967320261437908
836
+ 10.5194/acpd-2-2209-2002,10.5194/acp-3-417-2003,True,True,0.9904761904761904
837
+ 10.5194/acpd-8-18727-2008,10.5194/acp-9-5489-2009,True,True,1.0
838
+ 10.5194/tc-2020-164,10.5194/tc-15-1097-2021,True,True,1.0
839
+ 10.5194/acpd-13-10621-2013,10.5194/acp-14-765-2014,True,True,0.9565217391304348
840
+ 10.21203/rs.3.rs-507826/v1,10.1007/s10924-021-02297-x,False,True,0.9696969696969696
841
+ 10.5194/acp-2018-209,10.5194/acp-18-12207-2018,True,True,0.9986504723346828
842
+ 10.1101/2021.04.19.440546,10.1038/s42003-021-02874-9,False,True,1.0
843
+ 10.5194/acpd-5-509-2005,10.5194/acp-5-1557-2005,True,True,1.0
844
+ 10.26434/chemrxiv.13513731.v2,10.1021/acs.jpca.1c02872,False,True,
845
+ 10.5194/acpd-6-3135-2006,10.5194/acp-6-3377-2006,True,True,1.0
846
+ 10.5194/bgd-11-7991-2014,10.5194/bg-11-6173-2014,True,True,0.9916161616161616
847
+ 10.5194/gchron-2019-3,10.5194/gchron-1-17-2019,True,True,0.9760765550239232
848
+ 10.1101/430447,10.15252/embj.2019103667,False,True,
849
+ 10.5194/acpd-9-16549-2009,10.5194/acp-10-431-2010,True,True,1.0
850
+ 10.5194/bg-2016-101,10.5194/bg-13-4491-2016,True,True,0.9313034188034188
851
+ 10.1101/2020.05.17.100255,10.1016/j.cortex.2020.09.004,False,True,1.0
852
+ 10.5194/acpd-3-5139-2003,10.5194/acp-4-391-2004,True,True,1.0
853
+ 10.5194/bg-2016-357,10.5194/bg-14-2781-2017,True,True,1.0
854
+ 10.5194/acp-2020-91,10.5194/acp-20-8641-2020,True,True,0.9826224328593997
855
+ 10.5194/hess-2019-600,10.5194/hess-24-4413-2020,True,True,
856
+ 10.1101/476960,10.15252/embj.2018101153,False,True,0.973765903307888
857
+ 10.5194/cpd-10-3327-2014,10.5194/cp-11-327-2015,True,True,1.0
858
+ 10.5194/acpd-10-12713-2010,10.5194/acp-10-9039-2010,True,True,0.9947916666666666
859
+ 10.5194/acpd-9-24587-2009,10.5194/acp-10-5573-2010,True,True,0.9382716049382716
860
+ 10.5194/tc-2016-161,10.5194/tc-10-2981-2016,True,True,1.0
861
+ 10.26434/chemrxiv-2022-kgxfk-v2,10.1016/j.eml.2022.101929,False,True,1.0
862
+ 10.26434/chemrxiv.8061650.v1,10.1021/acs.chemmater.9b03267,False,True,0.9822281959378736
863
+ 10.5194/tc-2019-293,10.5194/tc-14-2775-2020,True,True,1.0
864
+ 10.5194/tcd-2-111-2008,10.5194/tc-2-95-2008,True,True,0.9919678714859438
865
+ 10.26434/chemrxiv.7322183.v1,10.1021/acs.jctc.8b01041,False,True,1.0
866
+ 10.21203/rs.3.rs-2241246/v1,10.1016/j.resconrec.2023.106873,False,True,1.0
867
+ 10.26434/chemrxiv.7851587.v1,10.1021/acs.jpcc.8b11092,False,True,1.0
868
+ 10.20944/preprints201609.0095.v1,10.1007/s11356-016-8321-6,False,True,0.9662618083670717
869
+ 10.5194/bg-2016-172,10.5194/bg-14-597-2017,True,True,1.0
870
+ 10.20944/preprints202006.0275.v1,10.3855/jidc.13692,False,True,0.9977324263038548
871
+ 10.5194/tc-2021-382,10.5194/tc-16-3313-2022,True,True,1.0
872
+ 10.5194/sed-5-789-2013,10.5194/se-4-373-2013,True,True,0.9851380042462846
873
+ 10.2196/preprints.19159,10.2196/19159,False,True,1.0
874
+ 10.5194/amt-2021-90,10.5194/amt-14-5625-2021,True,True,1.0
875
+ 10.21203/rs.3.rs-206773/v1,10.1186/s43058-021-00128-7,False,True,1.0
876
+ 10.2196/preprints.10078,10.2196/10078,False,True,1.0
877
+ 10.1101/2020.07.06.190314,10.3390/metabo10120488,False,True,1.0
878
+ 10.5194/essd-2018-3,10.5194/essd-10-1427-2018,True,True,1.0
879
+ 10.7287/peerj.preprints.2795v1,10.7717/peerj.3500,False,True,1.0
880
+ 10.31227/osf.io/kxdf6,10.22216/jen.v2i3.2357,False,True,1.0
881
+ 10.2196/preprints.40038,10.2196/40038,False,True,1.0
882
+ 10.1101/392761,10.1371/journal.pone.0223183,False,True,1.0
883
+ 10.2196/preprints.39264,10.2196/39264,False,True,1.0
884
+ 10.2196/preprints.33793,10.2196/33793,False,True,1.0
885
+ 10.21203/rs.2.12491/v2,10.1186/s13063-019-3833-2,False,True,0.9719974309569684
886
+ 10.21203/rs.2.12491/v1,10.1186/s13063-019-3833-2,False,True,0.9645951035781544
887
+ 10.21203/rs.3.rs-61509/v1,10.1186/s12944-020-01428-y,False,True,1.0
888
+ 10.21203/rs.3.rs-61509/v4,10.1186/s12944-020-01428-y,False,True,1.0
889
+ 10.21203/rs.3.rs-61509/v5,10.1186/s12944-020-01428-y,False,True,1.0
890
+ 10.21203/rs.3.rs-61509/v2,10.1186/s12944-020-01428-y,False,True,1.0
891
+ 10.21203/rs.3.rs-61509/v3,10.1186/s12944-020-01428-y,False,True,1.0
892
+ 10.5194/acpd-11-24813-2011,10.5194/acp-12-5429-2012,True,True,0.9028871391076116
893
+ 10.5194/amtd-4-3055-2011,10.5194/amt-4-1593-2011,True,True,1.0
894
+ 10.21203/rs.3.rs-58058/v2,10.1186/s12909-021-02570-6,False,True,0.9968253968253968
895
+ 10.21203/rs.3.rs-58058/v1,10.1186/s12909-021-02570-6,False,True,1.0
896
+ 10.20944/preprints201802.0069.v1,10.3390/f9030100,False,True,0.9824561403508772
897
+ 10.21203/rs.2.15987/v1,10.1186/s12887-019-1863-2,False,True,1.0
898
+ 10.21203/rs.2.15987/v2,10.1186/s12887-019-1863-2,False,True,1.0
899
+ 10.21203/rs.2.15987/v3,10.1186/s12887-019-1863-2,False,True,1.0
900
+ 10.2196/preprints.12797,10.2196/12797,False,True,1.0
901
+ 10.20944/preprints201907.0118.v1,10.3390/ijerph16162815,False,True,1.0
902
+ 10.2196/preprints.11824,10.2196/11824,False,True,1.0
903
+ 10.5194/hess-2020-46,10.5194/hess-24-5015-2020,True,True,1.0
904
+ 10.31219/osf.io/w9unj,10.32520/jtp.v8i2.941,False,True,
905
+ 10.20944/preprints201908.0123.v1,10.15517/rbt.v68i1.38555,False,True,0.925004016451385
906
+ 10.5194/amtd-5-8579-2012,10.5194/amt-6-1359-2013,True,True,0.9941520467836256
907
+ 10.5194/acpd-15-19045-2015,10.5194/acp-16-7681-2016,True,True,0.9204142368936375
908
+ 10.20944/preprints201804.0244.v1,10.3390/min8050192,False,True,0.9482758620689654
909
+ 10.31235/osf.io/7t6w3,10.18523/kmlpj153255.2018-4.99-118,False,True,1.0
910
+ 10.5194/essd-2021-239,10.5194/essd-14-3915-2022,True,True,0.996078431372549
911
+ 10.5194/acp-2019-639,10.5194/acp-20-4445-2020,True,True,1.0
912
+ 10.2196/preprints.14501,10.2196/14501,False,True,1.0
913
+ 10.21203/rs.3.rs-132353/v1,10.1186/s13018-020-02191-7,False,True,0.992248062015504
914
+ 10.2196/preprints.10665,10.2196/10665,False,True,1.0
915
+ 10.5194/gmd-2021-395,10.5194/gmd-15-7557-2022,True,True,0.9958333333333332
916
+ 10.5194/acpd-10-10969-2010,10.5194/acp-10-8669-2010,True,False,1.0
917
+ 10.5194/amt-2019-481,10.5194/amt-13-3661-2020,True,True,1.0
918
+ 10.5194/wes-2020-51,10.5194/wes-5-855-2020,True,True,1.0
919
+ 10.5194/npg-2020-4,10.5194/npg-27-391-2020,True,True,1.0
920
+ 10.1101/2020.01.09.900050,10.1371/journal.pone.0228121,False,True,1.0
921
+ 10.5194/nhess-2016-46,10.5194/nhess-16-1807-2016,True,True,1.0
922
+ 10.31219/osf.io/j67kq,10.31014/aior.1991.03.04.241,False,True,
923
+ 10.31235/osf.io/wsh64,10.31014/aior.1991.03.04.241,False,True,
924
+ 10.5194/acp-2021-207,10.5194/acp-21-13119-2021,True,True,1.0
925
+ 10.21034/wp.415,10.1080/07350015.1990.10509768,False,True,
926
+ 10.1101/670257,10.1016/j.nlm.2020.107225,False,True,1.0
927
+ 10.2196/preprints.18258,10.2196/18258,False,True,1.0
928
+ 10.5194/amtd-6-3545-2013,10.5194/amt-6-1981-2013,True,True,1.0
929
+ 10.31220/osf.io/pg3v9,10.29255/aksara.v31i2.364.251-268,False,True,1.0
930
+ 10.33767/osf.io/y4s3w,10.7560/vlt8102,False,True,
931
+ 10.20944/preprints202010.0346.v1,10.3390/biom10111564,False,True,1.0
932
+ 10.1101/056044,10.1186/s12918-016-0380-2,False,True,1.0
933
+ 10.5194/angeo-2019-119,10.5194/angeo-38-467-2020,True,True,1.0
934
+ 10.31219/osf.io/vmu6q,10.24269/ars.v6i1.780,False,True,0.8888888888888888
935
+ 10.20944/preprints201805.0072.v1,10.3390/electronics7060079,False,True,1.0
936
+ 10.31224/osf.io/8s59e,10.1504/ijvp.2017.081276,False,True,0.9213085764809904
937
+ 10.20944/preprints201807.0061.v1,10.3390/mti2030044,False,True,1.0
938
+ 10.5194/gmd-2016-315,10.5194/gmd-10-1927-2017,True,True,1.0
939
+ 10.5194/amt-2019-252,10.5194/amt-13-1735-2020,True,True,1.0
940
+ 10.21203/rs.3.rs-41396/v2,10.1186/s12882-020-02158-0,False,True,1.0
941
+ 10.21203/rs.3.rs-41396/v1,10.1186/s12882-020-02158-0,False,True,1.0
942
+ 10.21203/rs.3.rs-41396/v3,10.1186/s12882-020-02158-0,False,True,1.0
943
+ 10.5194/gc-2021-26,10.5194/gc-5-101-2022,True,True,0.9716312056737588
944
+ 10.21203/rs.3.rs-104730/v1,10.1186/s12957-021-02152-2,False,True,0.9976359338061466
945
+ 10.21203/rs.3.rs-104730/v2,10.1186/s12957-021-02152-2,False,True,0.9976359338061466
946
+ 10.1101/634006,10.1093/nargab/lqaa022,False,True,1.0
947
+ 10.1101/481952,10.2174/1568026619666181220111059,False,True,0.9923664122137404
948
+ 10.21203/rs.3.rs-127854/v1,10.1186/s13019-021-01444-8,False,True,1.0
949
+ 10.2196/preprints.17064,10.2196/17064,False,True,1.0
950
+ 10.20944/preprints202102.0539.v1,10.3390/molecules26061667,False,True,0.9855072463768116
951
+ 10.20944/preprints201701.0068.v1,10.3390/su9010122,False,True,1.0
952
+ 10.31219/osf.io/byjhc,10.25046/aj0505120,False,True,
953
+ 10.20944/preprints202005.0163.v1,10.1186/s41205-020-00086-1,False,True,0.9867724867724867
954
+ 10.2196/preprints.12968,10.2196/12968,False,True,1.0
955
+ 10.1101/2020.06.26.169458,10.3390/ijms21217980,False,True,0.9946236559139784
956
+ 10.2196/preprints.11219,10.2196/11219,False,True,1.0
957
+ 10.5194/egusphere-2022-682,10.5194/bg-19-5617-2022,True,True,1.0
958
+ 10.21203/rs.2.11941/v2,10.1186/s13104-019-4593-5,False,True,1.0
959
+ 10.21203/rs.2.11941/v1,10.1186/s13104-019-4593-5,False,True,1.0
960
+ 10.21203/rs.2.11941/v3,10.1186/s13104-019-4593-5,False,True,1.0
961
+ 10.1101/460337,10.1093/cercor/bhaa081,False,True,0.9984917043740572
962
+ 10.1101/199687,10.1016/j.neuropsychologia.2018.06.010,False,True,1.0
963
+ 10.1101/19009589,10.1371/journal.pone.0230274,False,True,1.0
964
+ 10.5194/acp-2017-666,10.5194/acp-18-3779-2018,True,True,0.9942857142857144
965
+ 10.1101/149716,10.1016/j.ymben.2017.11.011,False,True,0.977777777777778
966
+ 10.1101/2020.04.16.044842,10.1523/jneurosci.0875-20.2020,False,True,1.0
967
+ 10.5194/acp-2016-998,10.5194/acp-17-4419-2017,True,True,1.0
968
+ 10.5194/bg-2019-165,10.5194/bg-16-4097-2019,True,True,0.9865591397849464
969
+ 10.5194/bg-2017-173,10.5194/bg-15-953-2018,True,True,1.0
970
+ 10.5194/acp-2020-875,10.5194/acp-21-10337-2021,True,True,1.0
971
+ 10.31224/osf.io/5atbz,10.1016/j.flowmeasinst.2018.07.003,False,True,
972
+ 10.1101/251843,10.1523/eneuro.0381-18.2018,False,True,0.9583333333333334
973
+ 10.21203/rs.3.rs-253126/v1,10.1002/adpr.202100285,False,True,0.989010989010989
974
+ 10.20944/preprints201710.0032.v1,10.3390/environments4040088,False,True,0.9069781480140046
975
+ 10.20944/preprints201704.0135.v1,10.3390/ijms18050923,False,True,1.0
976
+ 10.1101/2022.01.14.476419,10.1111/mec.16469,False,True,1.0
977
+ 10.5194/bgd-4-3343-2007,10.5194/bg-5-371-2008,True,True,0.9743589743589745
978
+ 10.5194/amt-2017-408,10.5194/amt-11-3251-2018,True,True,1.0
979
+ 10.5194/osd-11-1543-2014,10.5194/os-11-187-2015,True,True,1.0
980
+ 10.20944/preprints201906.0228.v1,10.3390/cancers11070942,False,True,0.9832134292565948
981
+ 10.5194/cp-2016-46,10.5194/cp-12-1829-2016,True,True,1.0
982
+ 10.5194/amt-2017-287,10.5194/amt-11-4465-2018,True,True,1.0
983
+ 10.1101/2021.01.28.428594,10.1172/jci.insight.147700,False,True,0.9786096256684492
984
+ 10.1101/852434,10.1093/nar/gkaa032,False,True,0.9975308641975308
985
+ 10.1101/2021.06.21.449154,10.7554/elife.73153,True,True,1.0
986
+ 10.5194/angeo-2018-21,10.5194/angeo-36-891-2018,True,True,0.9968847352024922
987
+ 10.31230/osf.io/3b2c9,10.3354/meps12774,False,True,
988
+ 10.1101/2022.05.17.492323,10.7554/elife.78877,True,False,1.0
989
+ 10.5194/osd-11-1213-2014,10.5194/os-10-881-2014,True,True,0.9857295482295484
990
+ 10.5194/acp-2020-1095,10.5194/acp-21-5289-2021,True,True,1.0
991
+ 10.5194/egusphere-2022-481,10.5194/os-18-1665-2022,True,True,1.0
992
+ 10.5194/amt-2020-348,10.5194/amt-14-5349-2021,True,True,0.9696969696969696
993
+ 10.31223/osf.io/5wakg,10.1111/j.1365-246x.2006.03017.x,False,True,
994
+ 10.5194/amt-2020-28,10.5194/amt-13-6559-2020,True,True,1.0
995
+ 10.5194/tc-2019-28,10.5194/tc-13-3337-2019,True,True,1.0
996
+ 10.1101/2020.12.17.423361,10.7554/elife.66194,True,True,1.0
997
+ 10.20944/preprints201808.0242.v1,10.3390/nano8090725,False,True,0.9592592592592591
998
+ 10.1101/2021.06.09.21258556,10.1016/s2213-2600(21)00409-4,False,True,
999
+ 10.5194/hess-2016-505,10.5194/hess-21-765-2017,True,True,1.0
1000
+ 10.5194/amtd-7-4481-2014,10.5194/amt-7-3549-2014,True,True,1.0
1001
+ 10.21203/rs.3.rs-2267501/v1,10.1007/s11356-023-27197-6,False,True,0.9909297052154196
fetch_positive_samples.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.dataset.GoodDataset import *
2
+
3
+ import argparse
4
+
5
def main(config):
    """
    Fetch positive samples and save the resulting dataset.

    Args:
        config: Namespace object containing the script arguments. The
            attributes read here are ``size``, ``random``, ``seed``,
            ``full`` and ``output``.
    """
    # Initialize the dataset
    dataset = AugmentedDataset()

    # The return value is intentionally not bound: nothing in this function
    # uses it, only the dataset object itself is saved afterwards.
    # NOTE(review): presumably the fetch stores the samples on `dataset` --
    # confirm against AugmentedDataset.
    dataset.fetch_positive_samples_parallel(
        num_samples=config.size,
        random=config.random,
        seed=config.seed,
        full=config.full,
    )

    # NOTE(review): the on-disk format is decided by `dataset.save`, which is
    # defined elsewhere; the script's default path ends in .pkl -- confirm.
    dataset.save(config.output)
22
+
23
+
24
def str_to_bool(value):
    """
    Convert a command-line token into a real boolean.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, so any
    non-empty value (including ``"False"``) becomes ``True``. This converter
    interprets the text instead.

    Args:
        value: A bool (returned unchanged) or a string such as
            ``"true"``/``"false"``, ``"yes"``/``"no"``, ``"1"``/``"0"``,
            ``"on"``/``"off"`` (case-insensitive, surrounding spaces ignored).

    Returns:
        The corresponding boolean.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ("true", "t", "yes", "y", "1", "on"):
        return True
    if text in ("false", "f", "no", "n", "0", "off"):
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}.")


if __name__ == "__main__":
    # `os` is needed for the default output path below; import it here rather
    # than relying on it leaking in through a star import.
    import os

    from src.utils.io_utils import PROJECT_ROOT

    # Parse command-line arguments
    parser = argparse.ArgumentParser(description="Generate and save a dataset based on the given configuration.")

    parser.add_argument("-s", "--size", type=int, default=10, help="Number of samples to generate.")
    # type=str_to_bool (not type=bool) so that "-r False" really disables random sampling.
    parser.add_argument("-r", "--random", type=str_to_bool, default=True, help="Whether to sample randomly.")
    parser.add_argument("--seed", type=int, default=42, help="Random seed for reproducibility.")
    parser.add_argument("--full", action="store_true", help="Boolean flag to indicate full dataset mode.")
    parser.add_argument("-o", "--output", type=str, default=os.path.join(PROJECT_ROOT, "data/dataset.pkl"), help="Output file path to save the dataset.")

    # Parse the arguments and pass to the main function
    config = parser.parse_args()
    main(config)
logo.png ADDED
notebooks/.DS_Store ADDED
Binary file (6.15 kB). View file
 
notebooks/1-0-dataset_development.ipynb ADDED
@@ -0,0 +1,1121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 16,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from enum import Enum\n",
10
+ "from typing import List, Dict, Any\n",
11
+ "from dataclasses import dataclass\n",
12
+ "from tqdm import tqdm\n",
13
+ "\n",
14
+ "import os\n",
15
+ "import yaml\n",
16
+ "import pandas as pd\n",
17
+ "import numpy as np\n",
18
+ "\n",
19
+ "import pyalex\n",
20
+ "from pyalex import Works\n",
21
+ "from src.utils.io_utils import PROJECT_ROOT\n",
22
+ "from src.dataset.Dataset import *"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "markdown",
27
+ "metadata": {},
28
+ "source": [
29
+ "# Configurations"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": 2,
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": [
38
+ "@dataclass\n",
39
+ "class ConfigAugmentation:\n",
40
+ " \"\"\"Configuration for OpenAlex features\"\"\"\n",
41
+ " basic: Dict[str, bool] = None # id, doi, title, etc\n",
42
+ " source: Dict[str, bool] = None # journal info\n",
43
+ " authors: Dict[str, bool] = None # author details\n",
44
+ " metrics: Dict[str, bool] = None # citations, fwci, etc\n",
45
+ " classification: Dict[str, bool] = None # topics, concepts\n",
46
+ " access: Dict[str, bool] = None # OA status\n",
47
+ " related_works: Dict[str, bool] = None # references\n",
48
+ " abstract: bool = False"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "markdown",
53
+ "metadata": {},
54
+ "source": [
55
+ "# Dataset Loading "
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": 4,
61
+ "metadata": {},
62
+ "outputs": [],
63
+ "source": [
64
+ "# load the dataset \n",
65
+ "\n",
66
+ "class DatasetType(Enum):\n",
67
+ " FULL_RAW = \"full_raw\"\n",
68
+ " PARTIAL_RAW = \"partial_raw\"\n",
69
+ " FULL_AUGMENTED = \"full_augmented\"\n",
70
+ " PARTIAL_AUGMENTED = \"partial_augmented\""
71
+ ]
72
+ },
73
+ {
74
+ "cell_type": "code",
75
+ "execution_count": 5,
76
+ "metadata": {},
77
+ "outputs": [],
78
+ "source": [
79
+ "@dataclass\n",
80
+ "class Field:\n",
81
+ " \"\"\"Field configuration for data extraction\"\"\"\n",
82
+ " name: str\n",
83
+ " path: List[str]\n",
84
+ " default: Any = None\n",
85
+ "\n",
86
+ "class AlexFields:\n",
87
+ " \"\"\"OpenAlex field definitions\"\"\"\n",
88
+ " \n",
89
+ " BASIC = [\n",
90
+ " Field(\"id\", [\"id\"]),\n",
91
+ " Field(\"doi\", [\"doi\"]),\n",
92
+ " Field(\"title\", [\"title\"]),\n",
93
+ " Field(\"display_name\", [\"display_name\"]),\n",
94
+ " Field(\"publication_year\", [\"publication_year\"]),\n",
95
+ " Field(\"publication_date\", [\"publication_date\"]),\n",
96
+ " Field(\"language\", [\"language\"]),\n",
97
+ " Field(\"type\", [\"type\"]),\n",
98
+ " Field(\"type_crossref\", [\"type_crossref\"])\n",
99
+ " ]\n",
100
+ " \n",
101
+ " SOURCE = [\n",
102
+ " Field(\"journal_name\", [\"primary_location\", \"source\", \"display_name\"]),\n",
103
+ " Field(\"issn\", [\"primary_location\", \"source\", \"issn\"]),\n",
104
+ " Field(\"issn_l\", [\"primary_location\", \"source\", \"issn_l\"]),\n",
105
+ " Field(\"publisher\", [\"primary_location\", \"source\", \"host_organization_name\"]),\n",
106
+ " Field(\"type\", [\"primary_location\", \"source\", \"type\"])\n",
107
+ " ]\n",
108
+ "\n",
109
+ " METRICS = [\n",
110
+ " Field(\"cited_by_count\", [\"cited_by_count\"]),\n",
111
+ " Field(\"cited_by_percentile\", [\"citation_normalized_percentile\"]),\n",
112
+ " Field(\"is_retracted\", [\"is_retracted\"]),\n",
113
+ " Field(\"fwci\", [\"fwci\"]),\n",
114
+ " Field(\"referenced_works_count\", [\"referenced_works_count\"])\n",
115
+ " ]\n",
116
+ "\n",
117
+ " ACCESS = [\n",
118
+ " Field(\"is_oa\", [\"open_access\", \"is_oa\"]),\n",
119
+ " Field(\"oa_status\", [\"open_access\", \"oa_status\"]),\n",
120
+ " Field(\"oa_url\", [\"open_access\", \"oa_url\"]),\n",
121
+ " Field(\"pdf_url\", [\"primary_location\", \"pdf_url\"]),\n",
122
+ " Field(\"license\", [\"primary_location\", \"license\"]) \n",
123
+ " ]\n",
124
+ "\n",
125
+ "def get_nested_value(data: Dict, path: List[str], default: Any = None) -> Any:\n",
126
+ " \"\"\"Extract nested value from dictionary using path\"\"\"\n",
127
+ " value = data\n",
128
+ " for key in path:\n",
129
+ " try:\n",
130
+ " value = value[key]\n",
131
+ " except (KeyError, TypeError):\n",
132
+ " return default\n",
133
+ " return value\n",
134
+ "\n",
135
+ "class DataAugmenter:\n",
136
+ " \"\"\"Class for augmenting data with OpenAlex features\"\"\"\n",
137
+ "\n",
138
+ " def __init__(self):\n",
139
+ " \"\"\"Initialize augmenter with API credentials\"\"\"\n",
140
+ " self.profile = self._load_profile()\n",
141
+ " self.email = self.profile[\"email\"]\n",
142
+ " self.filters = ConfigAugmentation(\n",
143
+ " basic={\n",
144
+ " \"id\": True,\n",
145
+ " \"doi\": True,\n",
146
+ " \"title\": True,\n",
147
+ " \"display_name\": True,\n",
148
+ " \"publication_year\": True,\n",
149
+ " \"publication_date\": True,\n",
150
+ " \"language\": True,\n",
151
+ " \"type\": True,\n",
152
+ " \"type_crossref\": True\n",
153
+ " },\n",
154
+ " source={\n",
155
+ " \"journal_name\": True,\n",
156
+ " \"issn\": True,\n",
157
+ " \"issn_l\": True,\n",
158
+ " \"publisher\": True,\n",
159
+ " \"type\": True\n",
160
+ " },\n",
161
+ " authors={\n",
162
+ " \"position\": True,\n",
163
+ " \"name\": True,\n",
164
+ " \"id\": True,\n",
165
+ " \"orcid\": True,\n",
166
+ " \"is_corresponding\": True,\n",
167
+ " \"affiliations\": False\n",
168
+ " },\n",
169
+ " metrics={\n",
170
+ " \"cited_by_count\": True,\n",
171
+ " \"cited_by_percentile\": False,\n",
172
+ " \"is_retracted\": True,\n",
173
+ " \"fwci\": True,\n",
174
+ " \"referenced_works_count\": True\n",
175
+ " },\n",
176
+ " classification={\n",
177
+ " \"primary_topic\": True,\n",
178
+ " \"topics\": False,\n",
179
+ " \"concepts\": False,\n",
180
+ " },\n",
181
+ " access={\n",
182
+ " \"is_oa\": True,\n",
183
+ " \"oa_status\": True,\n",
184
+ " \"oa_url\": True,\n",
185
+ " \"pdf_url\": True,\n",
186
+ " \"license\": True\n",
187
+ " },\n",
188
+ " related_works={\n",
189
+ " \"references\": True,\n",
190
+ " \"referenced_by_count\": True,\n",
191
+ " \"related\": True\n",
192
+ " },\n",
193
+ " abstract=True\n",
194
+ " )\n",
195
+ " \n",
196
+ " pyalex.config.email = self.email\n",
197
+ " \n",
198
+ " def _load_profile(self) -> Dict[str, str]:\n",
199
+ " \"\"\"Load API credentials from profile\"\"\"\n",
200
+ " profile_path = f\"{PROJECT_ROOT}/user_information/profile.yaml\"\n",
201
+ " \n",
202
+ " assert str(PROJECT_ROOT).split(\"/\")[-1] == \"MatchingPubs\", \"Please run this script in the github repo folder \"\n",
203
+ " assert os.path.exists(profile_path), \"create a profile.yaml with your email (email:) and your api key (api_key:). Go here to get one https://dev.elsevier.com/\"\n",
204
+ "\n",
205
+ " \n",
206
+ " with open(profile_path, \"r\") as f:\n",
207
+ " profile = yaml.safe_load(f)\n",
208
+ " \n",
209
+ " return {\n",
210
+ " \"email\": profile[\"email\"]\n",
211
+ " }\n",
212
+ "\n",
213
+ " def get_alex_features(self, doi: str) -> Dict:\n",
214
+ " \"\"\"Extract all OpenAlex features for a DOI\"\"\"\n",
215
+ " try:\n",
216
+ " work = Works()[f\"https://doi.org/{doi}\"]\n",
217
+ " result = {}\n",
218
+ "\n",
219
+ " # Basic metadata\n",
220
+ " result[\"basic\"] = {\n",
221
+ " field.name: get_nested_value(work, field.path, None)\n",
222
+ " for field in AlexFields.BASIC\n",
223
+ " }\n",
224
+ " \n",
225
+ " # Source/journal info\n",
226
+ " result[\"source\"] = {\n",
227
+ " field.name: get_nested_value(work, field.path, None)\n",
228
+ " for field in AlexFields.SOURCE\n",
229
+ " }\n",
230
+ " \n",
231
+ " # Authors with affiliations\n",
232
+ " try:\n",
233
+ " result[\"authors\"] = [\n",
234
+ " {\n",
235
+ " \"position\": auth.get(\"author_position\", None),\n",
236
+ " \"name\": auth.get(\"author\", {}).get(\"display_name\", None),\n",
237
+ " \"id\": auth.get(\"author\", {}).get(\"id\", None),\n",
238
+ " \"orcid\": auth.get(\"author\", {}).get(\"orcid\", None),\n",
239
+ " \"is_corresponding\": auth.get(\"is_corresponding\", None),\n",
240
+ " \"affiliations\": [\n",
241
+ " {\n",
242
+ " \"name\": inst.get(\"display_name\", None),\n",
243
+ " \"id\": inst.get(\"id\", None),\n",
244
+ " \"country\": inst.get(\"country_code\", None),\n",
245
+ " \"type\": inst.get(\"type\", None),\n",
246
+ " \"ror\": inst.get(\"ror\", None)\n",
247
+ " }\n",
248
+ " for inst in auth.get(\"institutions\", [])\n",
249
+ " ]\n",
250
+ " }\n",
251
+ " for auth in work.get(\"authorships\", [])\n",
252
+ " ]\n",
253
+ " except:\n",
254
+ " result[\"authors\"] = None\n",
255
+ "\n",
256
+ " # Topics and classifications \n",
257
+ " try:\n",
258
+ " result[\"classification\"] = {\n",
259
+ " \"primary_topic\": {\n",
260
+ " \"name\": work.get(\"primary_topic\", {}).get(\"display_name\", None),\n",
261
+ " \"score\": work.get(\"primary_topic\", {}).get(\"score\", None),\n",
262
+ " \"field\": work.get(\"primary_topic\", {}).get(\"field\", {}).get(\"display_name\", None),\n",
263
+ " \"subfield\": work.get(\"primary_topic\", {}).get(\"subfield\", {}).get(\"display_name\", None)\n",
264
+ " },\n",
265
+ " \"topics\": [\n",
266
+ " {\n",
267
+ " \"name\": topic.get(\"display_name\", None),\n",
268
+ " \"score\": topic.get(\"score\", None),\n",
269
+ " \"field\": topic.get(\"field\", {}).get(\"display_name\", None)\n",
270
+ " }\n",
271
+ " for topic in work.get(\"topics\", [])\n",
272
+ " ],\n",
273
+ " \"concepts\": [\n",
274
+ " {\n",
275
+ " \"name\": concept.get(\"display_name\", None),\n",
276
+ " \"level\": concept.get(\"level\", None),\n",
277
+ " \"score\": concept.get(\"score\", None),\n",
278
+ " \"wikidata\": concept.get(\"wikidata\", None)\n",
279
+ " }\n",
280
+ " for concept in work.get(\"concepts\", [])\n",
281
+ " ]\n",
282
+ " }\n",
283
+ " except:\n",
284
+ " result[\"classification\"] = None\n",
285
+ "\n",
286
+ " # Metrics\n",
287
+ " result[\"metrics\"] = {\n",
288
+ " field.name: get_nested_value(work, field.path, None)\n",
289
+ " for field in AlexFields.METRICS\n",
290
+ " }\n",
291
+ "\n",
292
+ " # Access info\n",
293
+ " result[\"access\"] = {\n",
294
+ " field.name: get_nested_value(work, field.path, None)\n",
295
+ " for field in AlexFields.ACCESS\n",
296
+ " }\n",
297
+ "\n",
298
+ " # Abstract\n",
299
+ " try:\n",
300
+ " if \"abstract_inverted_index\" in work:\n",
301
+ " abstract_dict = work[\"abstract_inverted_index\"]\n",
302
+ " if abstract_dict:\n",
303
+ " max_pos = max(max(positions) for positions in abstract_dict.values())\n",
304
+ " words = [\"\"] * (max_pos + 1)\n",
305
+ " for word, positions in abstract_dict.items():\n",
306
+ " for pos in positions:\n",
307
+ " words[pos] = word\n",
308
+ " result[\"abstract\"] = \" \".join(words)\n",
309
+ " else:\n",
310
+ " result[\"abstract\"] = None\n",
311
+ " else:\n",
312
+ " result[\"abstract\"] = None\n",
313
+ " except:\n",
314
+ " result[\"abstract\"] = None\n",
315
+ "\n",
316
+ " return result\n",
317
+ "\n",
318
+ " except Exception as e:\n",
319
+ " print(f\"OpenAlex error for DOI {doi}: {e}\")\n",
320
+ " return {}\n",
321
+ " \n",
322
+ " def filter_augmented_data(self, data: Dict[str, Any], config: ConfigAugmentation = None) -> Dict[str, Any]:\n",
323
+ " \"\"\"Filter data based on configuration\n",
324
+ " \n",
325
+ " Args:\n",
326
+ " data: Dictionary containing raw data\n",
327
+ " config: Configuration specifying which features to include\n",
328
+ " \n",
329
+ " Returns:\n",
330
+ " Filtered dictionary containing only the configured features\n",
331
+ " \"\"\"\n",
332
+ " config = config or self.filters\n",
333
+ " \n",
334
+ " def filter_section(section_data: Dict[str, Any], section_config: Dict[str, bool]) -> Dict[str, Any]:\n",
335
+ " \"\"\"Filter a section of the data based on the section configuration\"\"\"\n",
336
+ " return {k: v for k, v in section_data.items() if k in section_config and section_config[k]}\n",
337
+ " \n",
338
+ " filtered_data = {}\n",
339
+ " \n",
340
+ " # Filter OpenAlex data\n",
341
+ " alex_filtered = {}\n",
342
+ " \n",
343
+ " # Basic metadata\n",
344
+ " if config.basic:\n",
345
+ " alex_filtered[\"basic\"] = filter_section(data.get(\"basic\", {}), config.basic)\n",
346
+ " \n",
347
+ " # Source/journal info\n",
348
+ " if config.source:\n",
349
+ " alex_filtered[\"source\"] = filter_section(data.get(\"source\", {}), config.source)\n",
350
+ " \n",
351
+ " # Authors\n",
352
+ " if config.authors:\n",
353
+ " authors_data = data.get(\"authors\", [])\n",
354
+ " filtered_authors = []\n",
355
+ " for author in authors_data:\n",
356
+ " filtered_author = filter_section(author, config.authors)\n",
357
+ " if config.authors.get(\"affiliations\", False):\n",
358
+ " print(author.get(\"affiliations\", []))\n",
359
+ " filtered_author[\"affiliations\"] = [\n",
360
+ " filter_section(aff, config.authors[\"affiliations\"])\n",
361
+ " for aff in author.get(\"affiliations\", [])\n",
362
+ " ]\n",
363
+ " filtered_authors.append(filtered_author)\n",
364
+ " alex_filtered[\"authors\"] = filtered_authors\n",
365
+ " \n",
366
+ " # Metrics\n",
367
+ " if config.metrics:\n",
368
+ " alex_filtered[\"metrics\"] = filter_section(data.get(\"metrics\", {}), config.metrics)\n",
369
+ " \n",
370
+ " # Classification\n",
371
+ " if config.classification:\n",
372
+ " classification_data = data.get(\"classification\", {})\n",
373
+ " alex_filtered[\"classification\"] = {\n",
374
+ " k: v for k, v in classification_data.items() if k in config.classification and config.classification[k]\n",
375
+ " }\n",
376
+ " \n",
377
+ " # Access info\n",
378
+ " if config.access:\n",
379
+ " alex_filtered[\"access\"] = filter_section(data.get(\"access\", {}), config.access)\n",
380
+ " \n",
381
+ " # Related works\n",
382
+ " if config.related_works:\n",
383
+ " alex_filtered[\"related_works\"] = filter_section(data.get(\"related_works\", {}), config.related_works)\n",
384
+ " \n",
385
+ " # Abstract\n",
386
+ " if config.abstract and \"abstract\" in data:\n",
387
+ " alex_filtered[\"abstract\"] = data[\"abstract\"]\n",
388
+ " \n",
389
+ " filtered_data = alex_filtered\n",
390
+ " \n",
391
+ " return filtered_data"
392
+ ]
393
+ },
394
+ {
395
+ "cell_type": "code",
396
+ "execution_count": 6,
397
+ "metadata": {},
398
+ "outputs": [
399
+ {
400
+ "data": {
401
+ "text/plain": [
402
+ "{'primary_topic': {'name': 'Gait Analysis and Fall Prevention in Elderly',\n",
403
+ " 'score': 0.9994,\n",
404
+ " 'field': 'Health Professions',\n",
405
+ " 'subfield': 'Physical Therapy, Sports Therapy and Rehabilitation'}}"
406
+ ]
407
+ },
408
+ "execution_count": 6,
409
+ "metadata": {},
410
+ "output_type": "execute_result"
411
+ }
412
+ ],
413
+ "source": [
414
+ "doi = \"10.2196/41082\"\n",
415
+ "a = DataAugmenter()\n",
416
+ "info = a.get_alex_features(doi)\n",
417
+ "filtered_info = a.filter_augmented_data(info)\n",
418
+ "filtered_info[\"classification\"]"
419
+ ]
420
+ },
421
+ {
422
+ "cell_type": "code",
423
+ "execution_count": 7,
424
+ "metadata": {},
425
+ "outputs": [],
426
+ "source": [
427
+ "class FullAugmentedDataset: \n",
428
+ "\n",
429
+ " def __init__(self):\n",
430
+ " self.augmenter = DataAugmenter()\n",
431
+ " self.full_raw_dataset = self._load_the_dataset()\n",
432
+ "\n",
433
+ " def _load_the_dataset(self, type: DatasetType = DatasetType.FULL_RAW) -> pd.DataFrame:\n",
434
+ " \"\"\"Load as csv file one of the datasets for training.\"\"\"\n",
435
+ " assert str(PROJECT_ROOT).split(\"/\")[-1] == \"MatchingPubs\", \"Please run this script in the github repo folder \"\n",
436
+ " \n",
437
+ " if type == DatasetType.FULL_RAW:\n",
438
+ " return pd.read_csv(f\"{PROJECT_ROOT}/data/crossref-preprint-article-relationships-Aug-2023.csv\")\n",
439
+ "\n",
440
+ " def retrieve_dois_couple(self, len: int = 1, random: bool = False, seed: bool = None, full: bool = False):\n",
441
+ " \"\"\"Retrieve two DOIs from the dataset\"\"\"\n",
442
+ " if random:\n",
443
+ " dois = self.full_raw_dataset.sample(n=len, random_state=seed)[[\"preprint_doi\", \"article_doi\"]]\n",
444
+ " else:\n",
445
+ " dois = self.full_raw_dataset.head(len)[[\"preprint_doi\", \"article_doi\"]]\n",
446
+ " if full:\n",
447
+ " dois = self.full_raw_dataset[[\"preprint_doi\", \"article_doi\"]]\n",
448
+ " return dois.to_numpy()\n",
449
+ " \n",
450
+ " @staticmethod\n",
451
+ " def _flatten_list(lst):\n",
452
+ " \"\"\"\n",
453
+ " Flattens a nested list into a single list. If the input is not nested, it returns the original list.\n",
454
+ " Handles cases where some elements are lists and others are not.\n",
455
+ " \"\"\"\n",
456
+ " if not isinstance(lst, list): # Ensure the input is a list\n",
457
+ " raise ValueError(\"Input must be a list\")\n",
458
+ "\n",
459
+ " def _flatten(sublist):\n",
460
+ " for item in sublist:\n",
461
+ " if isinstance(item, list): # Check if the item is a list\n",
462
+ " yield from _flatten(item) # Recursively flatten the list\n",
463
+ " else:\n",
464
+ " yield item # Yield the non-list item\n",
465
+ "\n",
466
+ " return list(_flatten(lst))\n",
467
+ " \n",
468
+ " def _augmented_data_to_row(self, filtered_data: Dict[str, Any], preprint: bool = True) -> pd.Series:\n",
469
+ "        \"\"\"Transform filtered augmented data into a single-row pandas DataFrame\n",
470
+ " \n",
471
+ " Args:\n",
472
+ " filtered_data: Dictionary containing filtered OpenAlex and Elsevier data\n",
473
+ " preprint: If True, use prpnt_ prefix, else use article_ prefix\n",
474
+ " \n",
475
+ " Returns:\n",
476
+ "            pd.DataFrame: Flattened data as a single row\n",
477
+ " \"\"\"\n",
478
+ "\n",
479
+ " additional_part = FullAugmentedDataset.filter_author(filtered_data.get(\"authors\",{}))\n",
480
+ " # modify the key of additional part by adding authors_ at the beginning\n",
481
+ " additional_part = {f\"authors_{k}\": v for k, v in additional_part.items()} \n",
482
+ "        # remove the authors key from filtered_data\n",
483
+ " filtered_data.pop(\"authors\")\n",
484
+ " # append the additional part to the filtered_info\n",
485
+ " filtered_data.update(additional_part)\n",
486
+ " final_dictionary = FullAugmentedDataset.flatten_dict(filtered_data, preprint=preprint)\n",
487
+ "\n",
488
+ " for k, v in final_dictionary.items():\n",
489
+ " final_dictionary[k] = \"$@$\".join(map(str, FullAugmentedDataset._flatten_list(v))) if isinstance(v, list) else [v]\n",
490
+ "\n",
491
+ " return pd.DataFrame(final_dictionary)\n",
492
+ "\n",
493
+ " @staticmethod\n",
494
+ " def filter_author(authors_info : list) -> dict:\n",
495
+ "\n",
496
+ " try:\n",
497
+ " relevant_keys = authors_info[0].keys()\n",
498
+ " new_dict = {}\n",
499
+ " for key in relevant_keys:\n",
500
+ " new_dict[key] = [author[key] for author in authors_info]\n",
501
+ " return new_dict\n",
502
+ " except:\n",
503
+ " return {}\n",
504
+ " \n",
505
+ " @staticmethod\n",
506
+ " def flatten_dict(d: dict, parent_key: str = '', sep: str = '_', preprint = True) -> dict:\n",
507
+ " \"\"\"Flatten a nested dictionary.\n",
508
+ " \n",
509
+ " Args:\n",
510
+ " d (dict): The dictionary to flatten.\n",
511
+ " parent_key (str): The base key string to use for the flattened keys.\n",
512
+ " sep (str): The separator to use between parent and child keys.\n",
513
+ " \n",
514
+ " Returns:\n",
515
+ " dict: The flattened dictionary.\n",
516
+ " \"\"\"\n",
517
+ " addition = \"prpnt_\" if preprint else \"article_\"\n",
518
+ " def _flatten_dict(d: dict, parent_key: str = '', sep: str = '_') -> dict:\n",
519
+ " items = []\n",
520
+ " for k, v in d.items():\n",
521
+ " new_key = f\"{parent_key}{sep}{k}\" if parent_key else k\n",
522
+ " if isinstance(v, dict):\n",
523
+ " items.extend(_flatten_dict(v, new_key, sep=sep).items())\n",
524
+ " else:\n",
525
+ " items.append((new_key, v))\n",
526
+ " return dict(items)\n",
527
+ " return {f\"{addition}{k}\": v for k, v in _flatten_dict(d, parent_key, sep).items()}\n",
528
+ "\n",
529
+ " def process_pair(self, dois) -> pd.DataFrame:\n",
530
+ "        \"\"\"Process each DOI pair and return a list of [preprint_row, article_row] DataFrames\"\"\"\n",
531
+ " assert len(dois) > 0\n",
532
+ " rows = []\n",
533
+ " for preprint_doi, article_doi in tqdm(dois):\n",
534
+ " # Get preprint features\n",
535
+ " preprint_features = self.augmenter.get_alex_features(preprint_doi) # augment with all the features\n",
536
+ " preprint_filtered = self.augmenter.filter_augmented_data(preprint_features) # filter the relevant features\n",
537
+ " preprint_row = self._augmented_data_to_row(preprint_filtered, True)\n",
538
+ "\n",
539
+ " # Get article features\n",
540
+ " article_features = self.augmenter.get_alex_features(article_doi) # augment with all the features\n",
541
+ " article_filtered = self.augmenter.filter_augmented_data(article_features)\n",
542
+ " article_row = self._augmented_data_to_row(article_filtered, False)\n",
543
+ "\n",
544
+ " # print(article_row.columns)\n",
545
+ " # print(len(preprint_row.columns))\n",
546
+ "\n",
547
+ " # combined_row = pd.concat([preprint_row, article_row], axis=1)\n",
548
+ " # rows.append(combined_row)\n",
549
+ " rows.append([preprint_row, article_row])\n",
550
+ "\n",
551
+ " return rows\n",
552
+ "\n",
553
+ " @staticmethod\n",
554
+ " def transform_array(input_array, factor):\n",
555
+ " output_list = []\n",
556
+ " \n",
557
+ " for i, row in enumerate(input_array):\n",
558
+ " other_indices = np.array([j for j in range(len(input_array)) if j != i])\n",
559
+ " sampled_indices = np.random.choice(other_indices, size=factor, replace=False)\n",
560
+ " sampled_rows = [input_array[j] for j in sampled_indices]\n",
561
+ "\n",
562
+ " output_list.append(pd.concat([row[0], row[1], pd.DataFrame(data=[1], columns=['label'])], axis=1))\n",
563
+ " for B in sampled_rows:\n",
564
+ " output_list.append(pd.concat([row[0], B[1], pd.DataFrame(data=[0], columns=['label'])], axis=1))\n",
565
+ "\n",
566
+ " return pd.concat(output_list).reset_index(drop=True)\n",
567
+ "\n",
568
+ " def get_full_dataset(self, len: int = 1, random: bool = True, seed: int = 42, full: bool = True) -> pd.DataFrame:\n",
569
+ " \"\"\"Process all DOI pairs and return full dataset\"\"\"\n",
570
+ " dois = self.retrieve_dois_couple(len, random, seed, full)\n",
571
+ " self.augmented_df = FullAugmentedDataset.transform_array(self.process_pair(dois), factor=3)\n",
572
+ " return self.augmented_df"
573
+ ]
574
+ },
575
+ {
576
+ "cell_type": "markdown",
577
+ "metadata": {},
578
+ "source": [
579
+ "# TRYING STUFF"
580
+ ]
581
+ },
582
+ {
583
+ "cell_type": "code",
584
+ "execution_count": 8,
585
+ "metadata": {},
586
+ "outputs": [],
587
+ "source": [
588
+ "# Create dataset with new configs\n",
589
+ "dataset = FullAugmentedDataset()"
590
+ ]
591
+ },
592
+ {
593
+ "cell_type": "code",
594
+ "execution_count": 9,
595
+ "metadata": {},
596
+ "outputs": [
597
+ {
598
+ "data": {
599
+ "text/plain": [
600
+ "(5, 2)"
601
+ ]
602
+ },
603
+ "execution_count": 9,
604
+ "metadata": {},
605
+ "output_type": "execute_result"
606
+ }
607
+ ],
608
+ "source": [
609
+ "dois = dataset.retrieve_dois_couple(5, random = True, seed = 42)\n",
610
+ "dois.shape"
611
+ ]
612
+ },
613
+ {
614
+ "cell_type": "code",
615
+ "execution_count": 10,
616
+ "metadata": {},
617
+ "outputs": [],
618
+ "source": [
619
+ "info = dataset.augmenter.get_alex_features(dois[0][0]) # augment with all the features"
620
+ ]
621
+ },
622
+ {
623
+ "cell_type": "code",
624
+ "execution_count": 11,
625
+ "metadata": {},
626
+ "outputs": [
627
+ {
628
+ "data": {
629
+ "text/plain": [
630
+ "{'basic': {'id': 'https://openalex.org/W4213260597',\n",
631
+ " 'doi': 'https://doi.org/10.31234/osf.io/6fps2',\n",
632
+ " 'title': 'Distance perception in virtual reality: A meta-analysis of the effect of head-mounted display characteristics',\n",
633
+ " 'display_name': 'Distance perception in virtual reality: A meta-analysis of the effect of head-mounted display characteristics',\n",
634
+ " 'publication_year': 2022,\n",
635
+ " 'publication_date': '2022-02-12',\n",
636
+ " 'language': 'en',\n",
637
+ " 'type': 'preprint',\n",
638
+ " 'type_crossref': 'posted-content'},\n",
639
+ " 'source': {'journal_name': None,\n",
640
+ " 'issn': None,\n",
641
+ " 'issn_l': None,\n",
642
+ " 'publisher': None,\n",
643
+ " 'type': None},\n",
644
+ " 'authors': [{'position': 'first',\n",
645
+ " 'name': 'Jonathan W. Kelly',\n",
646
+ " 'id': 'https://openalex.org/A5011931977',\n",
647
+ " 'orcid': 'https://orcid.org/0000-0002-4317-273X',\n",
648
+ " 'is_corresponding': True,\n",
649
+ " 'affiliations': [{'name': 'Iowa State University',\n",
650
+ " 'id': 'https://openalex.org/I173911158',\n",
651
+ " 'country': 'US',\n",
652
+ " 'type': 'education',\n",
653
+ " 'ror': 'https://ror.org/04rswrd78'}]}],\n",
654
+ " 'classification': {'primary_topic': {'name': 'Virtual Presence and Embodiment in VR Research',\n",
655
+ " 'score': 0.9982,\n",
656
+ " 'field': 'Computer Science',\n",
657
+ " 'subfield': 'Human-Computer Interaction'},\n",
658
+ " 'topics': [{'name': 'Virtual Presence and Embodiment in VR Research',\n",
659
+ " 'score': 0.9982,\n",
660
+ " 'field': 'Computer Science'},\n",
661
+ " {'name': 'Neural Mechanisms of Visual Perception and Processing',\n",
662
+ " 'score': 0.9906,\n",
663
+ " 'field': 'Neuroscience'},\n",
664
+ " {'name': 'Spatial Ability for STEM Domains',\n",
665
+ " 'score': 0.9727,\n",
666
+ " 'field': 'Engineering'}],\n",
667
+ " 'concepts': [{'name': 'Virtual reality',\n",
668
+ " 'level': 2,\n",
669
+ " 'score': 0.74525213,\n",
670
+ " 'wikidata': 'https://www.wikidata.org/wiki/Q170519'},\n",
671
+ " {'name': 'Perception',\n",
672
+ " 'level': 2,\n",
673
+ " 'score': 0.69497585,\n",
674
+ " 'wikidata': 'https://www.wikidata.org/wiki/Q160402'},\n",
675
+ " {'name': 'Optical head-mounted display',\n",
676
+ " 'level': 2,\n",
677
+ " 'score': 0.64143133,\n",
678
+ " 'wikidata': 'https://www.wikidata.org/wiki/Q17105103'},\n",
679
+ " {'name': 'Computer science',\n",
680
+ " 'level': 0,\n",
681
+ " 'score': 0.4773505,\n",
682
+ " 'wikidata': 'https://www.wikidata.org/wiki/Q21198'},\n",
683
+ " {'name': 'Psychology',\n",
684
+ " 'level': 0,\n",
685
+ " 'score': 0.3757282,\n",
686
+ " 'wikidata': 'https://www.wikidata.org/wiki/Q9418'},\n",
687
+ " {'name': 'Computer vision',\n",
688
+ " 'level': 1,\n",
689
+ " 'score': 0.3722988,\n",
690
+ " 'wikidata': 'https://www.wikidata.org/wiki/Q844240'},\n",
691
+ " {'name': 'Artificial intelligence',\n",
692
+ " 'level': 1,\n",
693
+ " 'score': 0.35102686,\n",
694
+ " 'wikidata': 'https://www.wikidata.org/wiki/Q11660'},\n",
695
+ " {'name': 'Neuroscience',\n",
696
+ " 'level': 1,\n",
697
+ " 'score': 0.0,\n",
698
+ " 'wikidata': 'https://www.wikidata.org/wiki/Q207011'}]},\n",
699
+ " 'metrics': {'cited_by_count': 6,\n",
700
+ " 'cited_by_percentile': {'value': 0.997093,\n",
701
+ " 'is_in_top_1_percent': True,\n",
702
+ " 'is_in_top_10_percent': True},\n",
703
+ " 'is_retracted': False,\n",
704
+ " 'fwci': None,\n",
705
+ " 'referenced_works_count': 89},\n",
706
+ " 'access': {'is_oa': True,\n",
707
+ " 'oa_status': 'green',\n",
708
+ " 'oa_url': 'https://psyarxiv.com/6fps2/download',\n",
709
+ " 'pdf_url': 'https://psyarxiv.com/6fps2/download',\n",
710
+ " 'license': None},\n",
711
+ " 'abstract': 'Distances are commonly underperceived in virtual reality (VR), and this finding has been documented repeatedly over more than two decades of research. Yet, there is evidence that perceived distance is more accurate in modern compared to older head-mounted displays (HMDs). This meta-analysis of 131 studies describes egocentric distance perception across 20 HMDs, and also examines the relationship between perceived distance and technical HMD characteristics. Judged distance was positively associated with HMD field of view (FOV), positively associated with HMD resolution, and negatively associated with HMD weight. The effects of FOV and resolution were more pronounced among heavier HMDs. These findings suggest that future improvements in these technical characteristics may be central to resolving the problem of distance underperception in VR.'}"
712
+ ]
713
+ },
714
+ "execution_count": 11,
715
+ "metadata": {},
716
+ "output_type": "execute_result"
717
+ }
718
+ ],
719
+ "source": [
720
+ "info"
721
+ ]
722
+ },
723
+ {
724
+ "cell_type": "code",
725
+ "execution_count": 12,
726
+ "metadata": {},
727
+ "outputs": [],
728
+ "source": [
729
+ "filtered_info = dataset.augmenter.filter_augmented_data(info)"
730
+ ]
731
+ },
732
+ {
733
+ "cell_type": "code",
734
+ "execution_count": 13,
735
+ "metadata": {},
736
+ "outputs": [
737
+ {
738
+ "data": {
739
+ "text/html": [
740
+ "<div>\n",
741
+ "<style scoped>\n",
742
+ " .dataframe tbody tr th:only-of-type {\n",
743
+ " vertical-align: middle;\n",
744
+ " }\n",
745
+ "\n",
746
+ " .dataframe tbody tr th {\n",
747
+ " vertical-align: top;\n",
748
+ " }\n",
749
+ "\n",
750
+ " .dataframe thead th {\n",
751
+ " text-align: right;\n",
752
+ " }\n",
753
+ "</style>\n",
754
+ "<table border=\"1\" class=\"dataframe\">\n",
755
+ " <thead>\n",
756
+ " <tr style=\"text-align: right;\">\n",
757
+ " <th></th>\n",
758
+ " <th>prpnt_basic_id</th>\n",
759
+ " <th>prpnt_basic_doi</th>\n",
760
+ " <th>prpnt_basic_title</th>\n",
761
+ " <th>prpnt_basic_display_name</th>\n",
762
+ " <th>prpnt_basic_publication_year</th>\n",
763
+ " <th>prpnt_basic_publication_date</th>\n",
764
+ " <th>prpnt_basic_language</th>\n",
765
+ " <th>prpnt_basic_type</th>\n",
766
+ " <th>prpnt_basic_type_crossref</th>\n",
767
+ " <th>prpnt_source_journal_name</th>\n",
768
+ " <th>...</th>\n",
769
+ " <th>prpnt_access_oa_status</th>\n",
770
+ " <th>prpnt_access_oa_url</th>\n",
771
+ " <th>prpnt_access_pdf_url</th>\n",
772
+ " <th>prpnt_access_license</th>\n",
773
+ " <th>prpnt_abstract</th>\n",
774
+ " <th>prpnt_authors_position</th>\n",
775
+ " <th>prpnt_authors_name</th>\n",
776
+ " <th>prpnt_authors_id</th>\n",
777
+ " <th>prpnt_authors_orcid</th>\n",
778
+ " <th>prpnt_authors_is_corresponding</th>\n",
779
+ " </tr>\n",
780
+ " </thead>\n",
781
+ " <tbody>\n",
782
+ " <tr>\n",
783
+ " <th>0</th>\n",
784
+ " <td>https://openalex.org/W4213260597</td>\n",
785
+ " <td>https://doi.org/10.31234/osf.io/6fps2</td>\n",
786
+ " <td>Distance perception in virtual reality: A meta...</td>\n",
787
+ " <td>Distance perception in virtual reality: A meta...</td>\n",
788
+ " <td>2022</td>\n",
789
+ " <td>2022-02-12</td>\n",
790
+ " <td>en</td>\n",
791
+ " <td>preprint</td>\n",
792
+ " <td>posted-content</td>\n",
793
+ " <td>None</td>\n",
794
+ " <td>...</td>\n",
795
+ " <td>green</td>\n",
796
+ " <td>https://psyarxiv.com/6fps2/download</td>\n",
797
+ " <td>https://psyarxiv.com/6fps2/download</td>\n",
798
+ " <td>None</td>\n",
799
+ " <td>Distances are commonly underperceived in virtu...</td>\n",
800
+ " <td>first</td>\n",
801
+ " <td>Jonathan W. Kelly</td>\n",
802
+ " <td>https://openalex.org/A5011931977</td>\n",
803
+ " <td>https://orcid.org/0000-0002-4317-273X</td>\n",
804
+ " <td>True</td>\n",
805
+ " </tr>\n",
806
+ " </tbody>\n",
807
+ "</table>\n",
808
+ "<p>1 rows × 33 columns</p>\n",
809
+ "</div>"
810
+ ],
811
+ "text/plain": [
812
+ " prpnt_basic_id prpnt_basic_doi \\\n",
813
+ "0 https://openalex.org/W4213260597 https://doi.org/10.31234/osf.io/6fps2 \n",
814
+ "\n",
815
+ " prpnt_basic_title \\\n",
816
+ "0 Distance perception in virtual reality: A meta... \n",
817
+ "\n",
818
+ " prpnt_basic_display_name \\\n",
819
+ "0 Distance perception in virtual reality: A meta... \n",
820
+ "\n",
821
+ " prpnt_basic_publication_year prpnt_basic_publication_date \\\n",
822
+ "0 2022 2022-02-12 \n",
823
+ "\n",
824
+ " prpnt_basic_language prpnt_basic_type prpnt_basic_type_crossref \\\n",
825
+ "0 en preprint posted-content \n",
826
+ "\n",
827
+ " prpnt_source_journal_name ... prpnt_access_oa_status \\\n",
828
+ "0 None ... green \n",
829
+ "\n",
830
+ " prpnt_access_oa_url prpnt_access_pdf_url \\\n",
831
+ "0 https://psyarxiv.com/6fps2/download https://psyarxiv.com/6fps2/download \n",
832
+ "\n",
833
+ " prpnt_access_license prpnt_abstract \\\n",
834
+ "0 None Distances are commonly underperceived in virtu... \n",
835
+ "\n",
836
+ " prpnt_authors_position prpnt_authors_name \\\n",
837
+ "0 first Jonathan W. Kelly \n",
838
+ "\n",
839
+ " prpnt_authors_id prpnt_authors_orcid \\\n",
840
+ "0 https://openalex.org/A5011931977 https://orcid.org/0000-0002-4317-273X \n",
841
+ "\n",
842
+ " prpnt_authors_is_corresponding \n",
843
+ "0 True \n",
844
+ "\n",
845
+ "[1 rows x 33 columns]"
846
+ ]
847
+ },
848
+ "execution_count": 13,
849
+ "metadata": {},
850
+ "output_type": "execute_result"
851
+ }
852
+ ],
853
+ "source": [
854
+ "row = dataset._augmented_data_to_row(filtered_info)\n",
855
+ "row"
856
+ ]
857
+ },
858
+ {
859
+ "cell_type": "code",
860
+ "execution_count": 14,
861
+ "metadata": {},
862
+ "outputs": [
863
+ {
864
+ "name": "stderr",
865
+ "output_type": "stream",
866
+ "text": [
867
+ "100%|██████████| 5/5 [00:04<00:00, 1.02it/s]\n",
868
+ "/var/folders/kp/b80wd80s53l95yjb77jn_l0r0000gn/T/ipykernel_17064/485421214.py:140: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
869
+ " return pd.concat(output_list).reset_index(drop=True)\n"
870
+ ]
871
+ }
872
+ ],
873
+ "source": [
874
+ "df = dataset.get_full_dataset(5, full=False)"
875
+ ]
876
+ },
877
+ {
878
+ "cell_type": "code",
879
+ "execution_count": 15,
880
+ "metadata": {},
881
+ "outputs": [
882
+ {
883
+ "name": "stdout",
884
+ "output_type": "stream",
885
+ "text": [
886
+ "[['10.31234/osf.io/6fps2' '10.1109/tvcg.2022.3196606']\n",
887
+ " ['10.5194/acpd-11-3071-2011' '10.5194/acp-12-3837-2012']\n",
888
+ " ['10.1101/2020.08.07.241687' '10.1021/acscentsci.1c00703']\n",
889
+ " ['10.21203/rs.3.rs-62250/v1' '10.1016/j.vetpar.2021.109373']\n",
890
+ " ['10.21203/rs.3.rs-2640242/v1' '10.1007/s10499-023-01047-1']]\n"
891
+ ]
892
+ },
893
+ {
894
+ "data": {
895
+ "text/html": [
896
+ "<div>\n",
897
+ "<style scoped>\n",
898
+ " .dataframe tbody tr th:only-of-type {\n",
899
+ " vertical-align: middle;\n",
900
+ " }\n",
901
+ "\n",
902
+ " .dataframe tbody tr th {\n",
903
+ " vertical-align: top;\n",
904
+ " }\n",
905
+ "\n",
906
+ " .dataframe thead th {\n",
907
+ " text-align: right;\n",
908
+ " }\n",
909
+ "</style>\n",
910
+ "<table border=\"1\" class=\"dataframe\">\n",
911
+ " <thead>\n",
912
+ " <tr style=\"text-align: right;\">\n",
913
+ " <th></th>\n",
914
+ " <th>prpnt_basic_doi</th>\n",
915
+ " <th>article_basic_doi</th>\n",
916
+ " <th>label</th>\n",
917
+ " </tr>\n",
918
+ " </thead>\n",
919
+ " <tbody>\n",
920
+ " <tr>\n",
921
+ " <th>0</th>\n",
922
+ " <td>https://doi.org/10.31234/osf.io/6fps2</td>\n",
923
+ " <td>https://doi.org/10.1109/tvcg.2022.3196606</td>\n",
924
+ " <td>1</td>\n",
925
+ " </tr>\n",
926
+ " <tr>\n",
927
+ " <th>1</th>\n",
928
+ " <td>https://doi.org/10.31234/osf.io/6fps2</td>\n",
929
+ " <td>https://doi.org/10.1007/s10499-023-01047-1</td>\n",
930
+ " <td>0</td>\n",
931
+ " </tr>\n",
932
+ " <tr>\n",
933
+ " <th>2</th>\n",
934
+ " <td>https://doi.org/10.31234/osf.io/6fps2</td>\n",
935
+ " <td>https://doi.org/10.5194/acp-12-3837-2012</td>\n",
936
+ " <td>0</td>\n",
937
+ " </tr>\n",
938
+ " <tr>\n",
939
+ " <th>3</th>\n",
940
+ " <td>https://doi.org/10.31234/osf.io/6fps2</td>\n",
941
+ " <td>https://doi.org/10.1016/j.vetpar.2021.109373</td>\n",
942
+ " <td>0</td>\n",
943
+ " </tr>\n",
944
+ " <tr>\n",
945
+ " <th>4</th>\n",
946
+ " <td>https://doi.org/10.5194/acpd-11-3071-2011</td>\n",
947
+ " <td>https://doi.org/10.5194/acp-12-3837-2012</td>\n",
948
+ " <td>1</td>\n",
949
+ " </tr>\n",
950
+ " <tr>\n",
951
+ " <th>5</th>\n",
952
+ " <td>https://doi.org/10.5194/acpd-11-3071-2011</td>\n",
953
+ " <td>https://doi.org/10.1016/j.vetpar.2021.109373</td>\n",
954
+ " <td>0</td>\n",
955
+ " </tr>\n",
956
+ " <tr>\n",
957
+ " <th>6</th>\n",
958
+ " <td>https://doi.org/10.5194/acpd-11-3071-2011</td>\n",
959
+ " <td>https://doi.org/10.1109/tvcg.2022.3196606</td>\n",
960
+ " <td>0</td>\n",
961
+ " </tr>\n",
962
+ " <tr>\n",
963
+ " <th>7</th>\n",
964
+ " <td>https://doi.org/10.5194/acpd-11-3071-2011</td>\n",
965
+ " <td>https://doi.org/10.1007/s10499-023-01047-1</td>\n",
966
+ " <td>0</td>\n",
967
+ " </tr>\n",
968
+ " <tr>\n",
969
+ " <th>8</th>\n",
970
+ " <td>https://doi.org/10.1101/2020.08.07.241687</td>\n",
971
+ " <td>https://doi.org/10.1021/acscentsci.1c00703</td>\n",
972
+ " <td>1</td>\n",
973
+ " </tr>\n",
974
+ " <tr>\n",
975
+ " <th>9</th>\n",
976
+ " <td>https://doi.org/10.1101/2020.08.07.241687</td>\n",
977
+ " <td>https://doi.org/10.1007/s10499-023-01047-1</td>\n",
978
+ " <td>0</td>\n",
979
+ " </tr>\n",
980
+ " <tr>\n",
981
+ " <th>10</th>\n",
982
+ " <td>https://doi.org/10.1101/2020.08.07.241687</td>\n",
983
+ " <td>https://doi.org/10.1109/tvcg.2022.3196606</td>\n",
984
+ " <td>0</td>\n",
985
+ " </tr>\n",
986
+ " <tr>\n",
987
+ " <th>11</th>\n",
988
+ " <td>https://doi.org/10.1101/2020.08.07.241687</td>\n",
989
+ " <td>https://doi.org/10.1016/j.vetpar.2021.109373</td>\n",
990
+ " <td>0</td>\n",
991
+ " </tr>\n",
992
+ " <tr>\n",
993
+ " <th>12</th>\n",
994
+ " <td>https://doi.org/10.21203/rs.3.rs-62250/v1</td>\n",
995
+ " <td>https://doi.org/10.1016/j.vetpar.2021.109373</td>\n",
996
+ " <td>1</td>\n",
997
+ " </tr>\n",
998
+ " <tr>\n",
999
+ " <th>13</th>\n",
1000
+ " <td>https://doi.org/10.21203/rs.3.rs-62250/v1</td>\n",
1001
+ " <td>https://doi.org/10.5194/acp-12-3837-2012</td>\n",
1002
+ " <td>0</td>\n",
1003
+ " </tr>\n",
1004
+ " <tr>\n",
1005
+ " <th>14</th>\n",
1006
+ " <td>https://doi.org/10.21203/rs.3.rs-62250/v1</td>\n",
1007
+ " <td>https://doi.org/10.1109/tvcg.2022.3196606</td>\n",
1008
+ " <td>0</td>\n",
1009
+ " </tr>\n",
1010
+ " <tr>\n",
1011
+ " <th>15</th>\n",
1012
+ " <td>https://doi.org/10.21203/rs.3.rs-62250/v1</td>\n",
1013
+ " <td>https://doi.org/10.1021/acscentsci.1c00703</td>\n",
1014
+ " <td>0</td>\n",
1015
+ " </tr>\n",
1016
+ " <tr>\n",
1017
+ " <th>16</th>\n",
1018
+ " <td>https://doi.org/10.21203/rs.3.rs-2640242/v1</td>\n",
1019
+ " <td>https://doi.org/10.1007/s10499-023-01047-1</td>\n",
1020
+ " <td>1</td>\n",
1021
+ " </tr>\n",
1022
+ " <tr>\n",
1023
+ " <th>17</th>\n",
1024
+ " <td>https://doi.org/10.21203/rs.3.rs-2640242/v1</td>\n",
1025
+ " <td>https://doi.org/10.5194/acp-12-3837-2012</td>\n",
1026
+ " <td>0</td>\n",
1027
+ " </tr>\n",
1028
+ " <tr>\n",
1029
+ " <th>18</th>\n",
1030
+ " <td>https://doi.org/10.21203/rs.3.rs-2640242/v1</td>\n",
1031
+ " <td>https://doi.org/10.1016/j.vetpar.2021.109373</td>\n",
1032
+ " <td>0</td>\n",
1033
+ " </tr>\n",
1034
+ " <tr>\n",
1035
+ " <th>19</th>\n",
1036
+ " <td>https://doi.org/10.21203/rs.3.rs-2640242/v1</td>\n",
1037
+ " <td>https://doi.org/10.1021/acscentsci.1c00703</td>\n",
1038
+ " <td>0</td>\n",
1039
+ " </tr>\n",
1040
+ " </tbody>\n",
1041
+ "</table>\n",
1042
+ "</div>"
1043
+ ],
1044
+ "text/plain": [
1045
+ " prpnt_basic_doi \\\n",
1046
+ "0 https://doi.org/10.31234/osf.io/6fps2 \n",
1047
+ "1 https://doi.org/10.31234/osf.io/6fps2 \n",
1048
+ "2 https://doi.org/10.31234/osf.io/6fps2 \n",
1049
+ "3 https://doi.org/10.31234/osf.io/6fps2 \n",
1050
+ "4 https://doi.org/10.5194/acpd-11-3071-2011 \n",
1051
+ "5 https://doi.org/10.5194/acpd-11-3071-2011 \n",
1052
+ "6 https://doi.org/10.5194/acpd-11-3071-2011 \n",
1053
+ "7 https://doi.org/10.5194/acpd-11-3071-2011 \n",
1054
+ "8 https://doi.org/10.1101/2020.08.07.241687 \n",
1055
+ "9 https://doi.org/10.1101/2020.08.07.241687 \n",
1056
+ "10 https://doi.org/10.1101/2020.08.07.241687 \n",
1057
+ "11 https://doi.org/10.1101/2020.08.07.241687 \n",
1058
+ "12 https://doi.org/10.21203/rs.3.rs-62250/v1 \n",
1059
+ "13 https://doi.org/10.21203/rs.3.rs-62250/v1 \n",
1060
+ "14 https://doi.org/10.21203/rs.3.rs-62250/v1 \n",
1061
+ "15 https://doi.org/10.21203/rs.3.rs-62250/v1 \n",
1062
+ "16 https://doi.org/10.21203/rs.3.rs-2640242/v1 \n",
1063
+ "17 https://doi.org/10.21203/rs.3.rs-2640242/v1 \n",
1064
+ "18 https://doi.org/10.21203/rs.3.rs-2640242/v1 \n",
1065
+ "19 https://doi.org/10.21203/rs.3.rs-2640242/v1 \n",
1066
+ "\n",
1067
+ " article_basic_doi label \n",
1068
+ "0 https://doi.org/10.1109/tvcg.2022.3196606 1 \n",
1069
+ "1 https://doi.org/10.1007/s10499-023-01047-1 0 \n",
1070
+ "2 https://doi.org/10.5194/acp-12-3837-2012 0 \n",
1071
+ "3 https://doi.org/10.1016/j.vetpar.2021.109373 0 \n",
1072
+ "4 https://doi.org/10.5194/acp-12-3837-2012 1 \n",
1073
+ "5 https://doi.org/10.1016/j.vetpar.2021.109373 0 \n",
1074
+ "6 https://doi.org/10.1109/tvcg.2022.3196606 0 \n",
1075
+ "7 https://doi.org/10.1007/s10499-023-01047-1 0 \n",
1076
+ "8 https://doi.org/10.1021/acscentsci.1c00703 1 \n",
1077
+ "9 https://doi.org/10.1007/s10499-023-01047-1 0 \n",
1078
+ "10 https://doi.org/10.1109/tvcg.2022.3196606 0 \n",
1079
+ "11 https://doi.org/10.1016/j.vetpar.2021.109373 0 \n",
1080
+ "12 https://doi.org/10.1016/j.vetpar.2021.109373 1 \n",
1081
+ "13 https://doi.org/10.5194/acp-12-3837-2012 0 \n",
1082
+ "14 https://doi.org/10.1109/tvcg.2022.3196606 0 \n",
1083
+ "15 https://doi.org/10.1021/acscentsci.1c00703 0 \n",
1084
+ "16 https://doi.org/10.1007/s10499-023-01047-1 1 \n",
1085
+ "17 https://doi.org/10.5194/acp-12-3837-2012 0 \n",
1086
+ "18 https://doi.org/10.1016/j.vetpar.2021.109373 0 \n",
1087
+ "19 https://doi.org/10.1021/acscentsci.1c00703 0 "
1088
+ ]
1089
+ },
1090
+ "metadata": {},
1091
+ "output_type": "display_data"
1092
+ }
1093
+ ],
1094
+ "source": [
1095
+ "print(dois)\n",
1096
+ "display(df[['prpnt_basic_doi', 'article_basic_doi', 'label']])"
1097
+ ]
1098
+ }
1099
+ ],
1100
+ "metadata": {
1101
+ "kernelspec": {
1102
+ "display_name": "dl",
1103
+ "language": "python",
1104
+ "name": "python3"
1105
+ },
1106
+ "language_info": {
1107
+ "codemirror_mode": {
1108
+ "name": "ipython",
1109
+ "version": 3
1110
+ },
1111
+ "file_extension": ".py",
1112
+ "mimetype": "text/x-python",
1113
+ "name": "python",
1114
+ "nbconvert_exporter": "python",
1115
+ "pygments_lexer": "ipython3",
1116
+ "version": "3.9.6"
1117
+ }
1118
+ },
1119
+ "nbformat": 4,
1120
+ "nbformat_minor": 2
1121
+ }
positive_augmented_dataset.csv ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ prpnt_basic_id,prpnt_basic_doi,prpnt_basic_title,prpnt_basic_display_name,prpnt_basic_publication_year,prpnt_basic_publication_date,prpnt_basic_language,prpnt_basic_type,prpnt_basic_type_crossref,prpnt_source_journal_name,prpnt_source_issn,prpnt_source_issn_l,prpnt_source_publisher,prpnt_source_type,prpnt_metrics_cited_by_count,prpnt_metrics_is_retracted,prpnt_metrics_fwci,prpnt_metrics_referenced_works_count,prpnt_classification_primary_topic_name,prpnt_classification_primary_topic_score,prpnt_classification_primary_topic_field,prpnt_classification_primary_topic_subfield,prpnt_access_is_oa,prpnt_access_oa_status,prpnt_access_oa_url,prpnt_access_pdf_url,prpnt_access_license,prpnt_abstract,prpnt_authors_position,prpnt_authors_name,prpnt_authors_id,prpnt_authors_orcid,prpnt_authors_is_corresponding,article_basic_id,article_basic_doi,article_basic_title,article_basic_display_name,article_basic_publication_year,article_basic_publication_date,article_basic_language,article_basic_type,article_basic_type_crossref,article_source_journal_name,article_source_issn,article_source_issn_l,article_source_publisher,article_source_type,article_metrics_cited_by_count,article_metrics_is_retracted,article_metrics_fwci,article_metrics_referenced_works_count,article_classification_primary_topic_name,article_classification_primary_topic_score,article_classification_primary_topic_field,article_classification_primary_topic_subfield,article_access_is_oa,article_access_oa_status,article_access_oa_url,article_access_pdf_url,article_access_license,article_abstract,article_authors_position,article_authors_name,article_authors_id,article_authors_orcid,article_authors_is_corresponding
2
+ https://openalex.org/W2020559395,https://doi.org/10.5194/acpd-11-3071-2011,A regional chemical transport modeling to identify the influences of biomass burning during 2006 BASE-ASIA,A regional chemical transport modeling to identify the influences of biomass burning during 2006 BASE-ASIA,2011,2011-01-28,en,article,posted-content,,,,,,9,False,1.02,86,Atmospheric Aerosols and their Impacts,1.0,Earth and Planetary Sciences,Atmospheric Science,True,green,https://doi.org/10.5194/acpd-11-3071-2011,,cc-by,"Abstract. To evaluate the impact of biomass burning from Southeast Asia to East Asia, this study conducted numerical simulations during NASA's 2006 Biomass-burning Aerosols in South-East Asia: Smoke Impact Assessment (BASE-ASIA). Two typical episode periods (27–28 March and 13–14 April) were examined. Two emission inventories, FLAMBE and GFED, were used in the simulations. The influences during two episodes in the source region (Southeast Asia) contributed to CO, O3 and PM2.5 concentrations as high as 400 ppbv, 20 ppbv and 80 μg/m3, respectively. The perturbations with and without biomass burning of the above three species were in the range of 10 to 60%, 10 to 20% and 30 to 70%, respectively. The impact due to long-range transport could spread over the southeastern parts of East Asia and could reach about 160 to 360 ppbv, 8 to 18 ppbv and 8 to 64 μg/m3 on CO, O3 and PM2.5, respectively; the percentage impact could reach 20 to 50% on CO, 10 to 30% on O3, and as high as 70% on PM2.5. An impact pattern can be found in April, while the impact becomes slightly broader and goes up to Yangtze River Delta. Two cross-sections at 15° N and 20° N were used to compare the vertical flux of biomass burning. In the source region (Southeast Asia), CO, O3 and PM2.5 concentrations had a strong upward tendency from surface to high altitudes. The eastward transport becomes strong from 2 to 8 km in the free troposphere. 
The subsidence contributed 60 to 70%, 20 to 50%, and 80% on CO, O3 and PM2.5, respectively to surface in the downwind area. The study reveals the significant impact of Southeastern Asia biomass burning on the air quality in both local and downwind areas, particularly during biomass burning episodes. This modeling study might provide constraints of lower limit. An additional study is underway for an active biomass burning year to obtain an upper limit and climate effects.",first,Joshua S. Fu,https://openalex.org/A5036365752,https://orcid.org/0000-0001-5464-9225,False,https://openalex.org/W2042738430,https://doi.org/10.5194/acp-12-3837-2012,Evaluating the influences of biomass burning during 2006 BASE-ASIA: a regional chemical transport modeling,Evaluating the influences of biomass burning during 2006 BASE-ASIA: a regional chemical transport modeling,2012,2012-05-02,en,article,journal-article,Atmospheric chemistry and physics,1680-7316$@$1680-7324,1680-7316,Copernicus Publications,journal,111,False,5.993,68,Atmospheric chemistry and aerosols,1.0,Earth and Planetary Sciences,Atmospheric Science,True,gold,https://acp.copernicus.org/articles/12/3837/2012/acp-12-3837-2012.pdf,https://acp.copernicus.org/articles/12/3837/2012/acp-12-3837-2012.pdf,cc-by,"Abstract. To evaluate the impact of biomass burning from Southeast Asia to East Asia, this study conducted numerical simulations during NASA's 2006 Biomass-burning Aerosols in South-East Asia: Smoke Impact Assessment (BASE-ASIA). Two typical episode periods (27–28 March and 13–14 April) were examined. Two emission inventories, FLAMBE and GFED, were used in the simulations. The influences during two episodes in the source region (Southeast Asia) contributed to the surface CO, O3 and PM2.5 concentrations as high as 400 ppbv, 20 ppbv and 80 μg m−3, respectively. 
The perturbations with and without biomass burning of the above three species during the intense episodes were in the range of 10 to 60%, 10 to 20% and 30 to 70%, respectively. The impact due to long-range transport could spread over the southeastern parts of East Asia and could reach about 160 to 360 ppbv, 8 to 18 ppbv and 8 to 64 μg m−3 on CO, O3 and PM2.5, respectively; the percentage impact could reach 20 to 50% on CO, 10 to 30% on O3, and as high as 70% on PM2.5. In March, the impact of biomass burning mainly concentrated in Southeast Asia and southern China, while in April the impact becomes slightly broader and even could go up to the Yangtze River Delta region. Two cross-sections at 15° N and 20° N were used to compare the vertical flux of biomass burning. In the source region (Southeast Asia), CO, O3 and PM2.5 concentrations had a strong upward transport from surface to high altitudes. The eastward transport becomes strong from 2 to 8 km in the free troposphere. The subsidence process during the long-range transport contributed 60 to 70%, 20 to 50%, and 80% on CO, O3 and PM2.5, respectively to surface in the downwind area. The study reveals the significant impact of Southeastern Asia biomass burning on the air quality in both local and downwind areas, particularly during biomass burning episodes. This modeling study might provide constraints of lower limit. An additional study is underway for an active biomass burning year to obtain an upper limit and climate effects.",first,Joshua S. Fu,https://openalex.org/A5036365752,https://orcid.org/0000-0001-5464-9225,False
3
+ https://openalex.org/W4213260597,https://doi.org/10.31234/osf.io/6fps2,Distance perception in virtual reality: A meta-analysis of the effect of head-mounted display characteristics,Distance perception in virtual reality: A meta-analysis of the effect of head-mounted display characteristics,2022,2022-02-12,en,preprint,posted-content,,,,,,6,False,,89,Virtual Reality Applications and Impacts,0.9982,Computer Science,Human-Computer Interaction,True,green,https://psyarxiv.com/6fps2/download,https://psyarxiv.com/6fps2/download,,"Distances are commonly underperceived in virtual reality (VR), and this finding has been documented repeatedly over more than two decades of research. Yet, there is evidence that perceived distance is more accurate in modern compared to older head-mounted displays (HMDs). This meta-analysis of 131 studies describes egocentric distance perception across 20 HMDs, and also examines the relationship between perceived distance and technical HMD characteristics. Judged distance was positively associated with HMD field of view (FOV), positively associated with HMD resolution, and negatively associated with HMD weight. The effects of FOV and resolution were more pronounced among heavier HMDs. These findings suggest that future improvements in these technical characteristics may be central to resolving the problem of distance underperception in VR.",first,Jonathan W. 
Kelly,https://openalex.org/A5011931977,https://orcid.org/0000-0002-4317-273X,True,https://openalex.org/W4289824348,https://doi.org/10.1109/tvcg.2022.3196606,Distance Perception in Virtual Reality: A Meta-Analysis of the Effect of Head-Mounted Display Characteristics,Distance Perception in Virtual Reality: A Meta-Analysis of the Effect of Head-Mounted Display Characteristics,2022,2022-08-04,en,review,journal-article,IEEE Transactions on Visualization and Computer Graphics,1077-2626$@$1941-0506$@$2160-9306,1077-2626,Institute of Electrical and Electronics Engineers,journal,40,False,2.128,96,Virtual Reality Applications and Impacts,0.9997,Computer Science,Human-Computer Interaction,True,green,https://osf.io/6fps2/download,,,"Distances are commonly underperceived in virtual reality (VR), and this finding has been documented repeatedly over more than two decades of research. Yet, there is evidence that perceived distance is more accurate in modern compared to older head-mounted displays (HMDs). This meta-analysis, based on 137 samples from 61 publications, describes egocentric distance perception across 20 HMDs and examines the relationship between perceived distance and technical HMD characteristics. Judged distance was positively associated with HMD field of view (FOV), positively associated with HMD resolution, and negatively associated with HMD weight. The effects of FOV and resolution were more pronounced among heavier HMDs. These findings suggest that future improvements in these technical characteristics may be central to resolving the problem of distance underperception in VR.",first,Jonathan W. Kelly,https://openalex.org/A5011931977,https://orcid.org/0000-0002-4317-273X,True
4
+ https://openalex.org/W3048189011,https://doi.org/10.1101/2020.08.07.241687,Bioorthogonal red and far-red fluorogenic probes for wash-free live-cell and super-resolution microscopy,Bioorthogonal red and far-red fluorogenic probes for wash-free live-cell and super-resolution microscopy,2020,2020-08-07,en,preprint,posted-content,bioRxiv (Cold Spring Harbor Laboratory),,,Cold Spring Harbor Laboratory,repository,9,False,,46,Click Chemistry in Chemical Biology and Drug Development,0.9999,Chemistry,Organic Chemistry,True,green,https://doi.org/10.1101/2020.08.07.241687,,cc-by-nc-nd,"Abstract Small-molecule fluorophores enable the observation of biomolecules in their native context with fluorescence microscopy. Specific labelling via bioorthogonal tetrazine chemistry confers minimal label size and rapid labelling kinetics. At the same time, fluorogenic tetrazine-dye conjugates exhibit efficient quenching of dyes prior to target binding. However, live-cell compatible long-wavelength fluorophores with strong fluorogenicity have been difficult to realize. Here, we report close proximity tetrazine-dye conjugates with minimal distance between tetrazine and fluorophore. Two synthetic routes give access to a series of cell permeable and impermeable dyes including highly fluorogenic far-red emitting derivatives with electron exchange as dominant excited state quenching mechanism. We demonstrate their potential for live-cell imaging in combination with unnatural amino acids, wash-free multi-colour and super-resolution STED and SOFI imaging. 
These dyes pave the way for advanced fluorescence imaging of biomolecules with minimal label size.",first,Philipp Werther,https://openalex.org/A5087410446,https://orcid.org/0000-0003-1267-5614,False,https://openalex.org/W3193691807,https://doi.org/10.1021/acscentsci.1c00703,Bio-orthogonal Red and Far-Red Fluorogenic Probes for Wash-Free Live-Cell and Super-resolution Microscopy,Bio-orthogonal Red and Far-Red Fluorogenic Probes for Wash-Free Live-Cell and Super-resolution Microscopy,2021,2021-08-20,en,article,journal-article,ACS Central Science,2374-7943$@$2374-7951,2374-7943,American Chemical Society,journal,80,False,7.957,46,Click Chemistry and Applications,0.9999,Chemistry,Organic Chemistry,True,diamond,https://pubs.acs.org/doi/pdf/10.1021/acscentsci.1c00703,https://pubs.acs.org/doi/pdf/10.1021/acscentsci.1c00703,cc-by,"Small-molecule fluorophores enable the observation of biomolecules in their native context with fluorescence microscopy. Specific labeling via bio-orthogonal tetrazine chemistry combines minimal label size with rapid labeling kinetics. At the same time, fluorogenic tetrazine–dye conjugates exhibit efficient quenching of dyes prior to target binding. However, live-cell compatible long-wavelength fluorophores with strong fluorogenicity have been difficult to realize. Here, we report close proximity tetrazine–dye conjugates with minimal distance between tetrazine and the fluorophore. Two synthetic routes give access to a series of cell-permeable and -impermeable dyes including highly fluorogenic far-red emitting derivatives with electron exchange as the dominant excited-state quenching mechanism. We demonstrate their potential for live-cell imaging in combination with unnatural amino acids, wash-free multicolor and super-resolution STED, and SOFI imaging. These dyes pave the way for advanced fluorescence imaging of biomolecules with minimal label size.",first,Philipp Werther,https://openalex.org/A5087410446,https://orcid.org/0000-0003-1267-5614,False
5
+ https://openalex.org/W4244952185,https://doi.org/10.21203/rs.3.rs-62250/v1,Towards Understanding the Liver Fluke Transmission Dynamics on Farms: Detection of Liver Fluke Transmitting Snail and Liver Fluke-Specific Environmental DNA in Water Samples from an Irrigated Dairy Farm in Southeast Australia,Towards Understanding the Liver Fluke Transmission Dynamics on Farms: Detection of Liver Fluke Transmitting Snail and Liver Fluke-Specific Environmental DNA in Water Samples from an Irrigated Dairy Farm in Southeast Australia,2020,2020-08-24,en,preprint,posted-content,Research Square (Research Square),,,Research Square (United States),repository,0,False,,34,Helminth infection and control,0.9875,Veterinary,Small Animals,True,green,https://www.researchsquare.com/article/rs-62250/latest.pdf,https://www.researchsquare.com/article/rs-62250/latest.pdf,cc-by,"Abstract Background Livestock production around the world is impacted by liver fluke ( Fasciola spp.) infection resulting in serious economic losses to the beef, dairy and sheep industries with significant losses of about $90 million per annum in Australia. Liver fluke infection is predominantly controlled by anthelmintic treatment and Triclabendazole (TCBZ) is usually the drug of choice due its superior efficacy against early immature, immature and adult liver fluke stages; however, the widespread emergence of TCBZ resistance in livestock threatens liver fluke control. We are in the urgent need for alternative control measures to lower the exposure of livestock to liver fluke infection which would help to preserve the usefulness of current anthelmintic treatments. Our ability to understand the prevalence of intermediate snail hosts and infective liver fluke stages in the environment is crucial to implement alternative control measures for liver fluke control. However, identification of liver fluke and snails in the environment is hampered by lack of efficient diagnostic methods. 
Environmental DNA (eDNA) based identification of liver fluke and the intermediate snail host in the water bodies is a promising method to identify liver fluke and snail prevalence on farms. Our aim is to provide a proof of concept to use a molecular tool (quantitative PCR) to detect and quantify eDNA of liver fluke and snail in water bodies on Victorian farming properties for potential large-scale analysis of liver fluke and snail ecology in water bodies. Methods To demonstrate the identification of liver fluke and snail in water bodies, we used a multiplex quantitative PCR assay for the independent but simultaneous detection of eDNA released from snail ( Austropeplea tomentosa) a crucial intermediate snail host for liver fluke transmission in South-east Australia and free-living liver fluke stages ( Fasciola hepatica) . We have collected water samples from an irrigation channel over a period of 11 months in 2016 at a dairy farm located at Maffra, Victoria, South-east Australia and used water samples from selected months (February, March, May, September, October, November and December) for eDNA assay. Results The multiplex qPCR assay effectively allows for the detection and quantification of eDNA released from liver fluke life stages and snails and we observed differential levels of liver fluke and snail specific eDNA in water at the time points analysed in this study. This assay was able to detect 14 fg and 50 pg of liver fluke and snail DNA in the presence of potential inhibitors from field collected water samples. 
Conclusion The successful detection of eDNA specific to liver fluke and snails from the field collected water samples provides a proof of concept for the use of this method as a monitoring tool to determine the prevalence of liver fluke and liver fluke-transmitting snails in irrigation regions to allow for understanding the liver fluke transmission zones on farms to implement effective control strategies.",first,Vignesh Rathinasamy,https://openalex.org/A5065524774,https://orcid.org/0000-0002-4032-3424,False,https://openalex.org/W3126899836,https://doi.org/10.1016/j.vetpar.2021.109373,Towards understanding the liver fluke transmission dynamics on farms: Detection of liver fluke transmitting snail and liver fluke-specific environmental DNA in water samples from an irrigated dairy farm in Southeast Australia,Towards understanding the liver fluke transmission dynamics on farms: Detection of liver fluke transmitting snail and liver fluke-specific environmental DNA in water samples from an irrigated dairy farm in Southeast Australia,2021,2021-02-03,en,article,journal-article,Veterinary Parasitology,0304-4017$@$1873-2550,0304-4017,Elsevier BV,journal,16,False,1.892,36,Environmental DNA in Biodiversity Studies,0.997,Environmental Science,Ecology,True,green,https://www.researchsquare.com/article/rs-62250/latest.pdf,,,,first,Vignesh Rathinasamy,https://openalex.org/A5065524774,https://orcid.org/0000-0002-4032-3424,False
6
+ https://openalex.org/W4324046272,https://doi.org/10.21203/rs.3.rs-2640242/v1,Immunophysiology of tambaqui fed with different levels of dietary protein in a biofloc system and a clear water system,Immunophysiology of tambaqui fed with different levels of dietary protein in a biofloc system and a clear water system,2023,2023-03-13,en,preprint,posted-content,Research Square (Research Square),,,Research Square (United States),repository,0,False,,25,Aquaculture disease management and microbiota,0.9985,Immunology and Microbiology,Immunology,True,green,https://www.researchsquare.com/article/rs-2640242/latest.pdf,https://www.researchsquare.com/article/rs-2640242/latest.pdf,cc-by,"Abstract The present study evaluated the immunophysiological response in Colossoma macropomum fed with different levels of dietary protein in a biofloc system (BFS) and in clear water (CW) and under infection with Aeromonas jandaei . Juvenile tambaqui (9.20 ± 0.23 g) were fed isolipid feed with three levels of crude protein (CP) in the two production systems: BFS24, BFS28 and BFS32 and CW24, CW28 and CW32 with 24, 28 and 32% CP respectively, for 60 days. At the end of the experimental period, the physiological conditions (hematology, biochemistry, hormonal and oxidative stress) of the fish were analyzed. The results of erythrogram, cortisol, glycemia and serum biochemistry (p &gt; 0.05) of the fish did not show significant differences between the breeding systems (BFS and CW) and the different protein levels. Tambaqui raised in the BFS showed monocytosis, thrombocytosis and higher respiratory activity of leukocytes, as well as higher glutathione (GSH) and lower malondialdehyde (MDA) values (p &lt; 0.05). In the bacterial challenge, after induction of aeromonosis, caused by Aeromonas jandaei , greater survival of fish raised in the BFS was observed. 
The results suggest that, even at lower protein levels, tambaqui maintain physiological homeostasis and, therefore, it may be possible to use up to 24% CP in the diet in the biofloc system. In addition, after the bacterial infection, this system promoted greater immunological resistance in the fish.",first,Michelle Midori Sena Fugimura,https://openalex.org/A5059890817,https://orcid.org/0000-0002-1354-2277,False,https://openalex.org/W4317396516,https://doi.org/10.1007/s10499-023-01047-1,Dietary protein requirement for tambaqui cultivated in biofloc and clear water systems,Dietary protein requirement for tambaqui cultivated in biofloc and clear water systems,2023,2023-01-18,en,article,journal-article,Aquaculture International,0967-6120$@$1573-143X,0967-6120,Springer Science+Business Media,journal,7,False,4.321,50,Metabolism and Nutrition in Aquaculture Feeds,0.9999,Agricultural and Biological Sciences,Aquatic Science,False,closed,,,,,first,Raphael Brito dos Santos,https://openalex.org/A5051287609,https://orcid.org/0000-0003-2168-8759,False
7
+ https://openalex.org/W2612690603,https://doi.org/10.1101/135053,Attention is required for knowledge-based sequential grouping of syllables into words,Attention is required for knowledge-based sequential grouping of syllables into words,2017,2017-05-08,en,preprint,posted-content,bioRxiv (Cold Spring Harbor Laboratory),,,Cold Spring Harbor Laboratory,repository,0,False,,72,Neural dynamics and brain function,0.9991,Neuroscience,Cognitive Neuroscience,True,green,https://www.biorxiv.org/content/biorxiv/early/2017/05/08/135053.full.pdf,https://www.biorxiv.org/content/biorxiv/early/2017/05/08/135053.full.pdf,cc-by,"Abstract How the brain sequentially groups sensory events into temporal chunks and how this process is modulated by attention are fundamental questions in cognitive neuroscience. Sequential grouping includes bottom-up primitive grouping and top-down knowledge-based grouping. In speech perception, grouping acoustic features into syllables can rely on bottom-up acoustic continuity cues but grouping syllables into words critically relies on the listener’s lexical knowledge. This study investigates whether top-down attention is required to apply lexical knowledge to group syllables into words, by concurrently monitoring neural entrainment to syllables and words using electroencephalography (EEG). When attention is directed to a competing speech stream or cross-modally to a silent movie, neural entrainment to syllables is weakened but neural entrainment to words largely diminishes. 
These results strongly suggest that knowledge-based grouping of syllables into words requires top-down attention and is a bottleneck for the neural processing of unattended speech.",first,Nai Ding,https://openalex.org/A5008847016,https://orcid.org/0000-0003-3428-2723,False,https://openalex.org/W2778438370,https://doi.org/10.1523/jneurosci.2606-17.2017,Attention Is Required for Knowledge-Based Sequential Grouping: Insights from the Integration of Syllables into Words,Attention Is Required for Knowledge-Based Sequential Grouping: Insights from the Integration of Syllables into Words,2017,2017-12-18,en,article,journal-article,Journal of Neuroscience,0270-6474$@$1529-2401,0270-6474,Society for Neuroscience,journal,80,False,3.31,86,EEG and Brain-Computer Interfaces,0.9986,Neuroscience,Cognitive Neuroscience,True,hybrid,https://www.jneurosci.org/content/jneuro/38/5/1178.full.pdf,https://www.jneurosci.org/content/jneuro/38/5/1178.full.pdf,cc-by-nc-sa,"How the brain groups sequential sensory events into chunks is a fundamental question in cognitive neuroscience. This study investigates whether top–down attention or specific tasks are required for the brain to apply lexical knowledge to group syllables into words. Neural responses tracking the syllabic and word rhythms of a rhythmic speech sequence were concurrently monitored using electroencephalography (EEG). The participants performed different tasks, attending to either the rhythmic speech sequence or a distractor, which was another speech stream or a nonlinguistic auditory/visual stimulus. Attention to speech, but not a lexical-meaning-related task, was required for reliable neural tracking of words, even when the distractor was a nonlinguistic stimulus presented cross-modally. Neural tracking of syllables, however, was reliably observed in all tested conditions. 
These results strongly suggest that neural encoding of individual auditory events (i.e., syllables) is automatic, while knowledge-based construction of temporal chunks (i.e., words) crucially relies on top–down attention. SIGNIFICANCE STATEMENT Why we cannot understand speech when not paying attention is an old question in psychology and cognitive neuroscience. Speech processing is a complex process that involves multiple stages, e.g., hearing and analyzing the speech sound, recognizing words, and combining words into phrases and sentences. The current study investigates which speech-processing stage is blocked when we do not listen carefully. We show that the brain can reliably encode syllables, basic units of speech sounds, even when we do not pay attention. Nevertheless, when distracted, the brain cannot group syllables into multisyllabic words, which are basic units for speech meaning. Therefore, the process of converting speech sound into meaning crucially relies on attention.",first,Nai Ding,https://openalex.org/A5008847016,https://orcid.org/0000-0003-3428-2723,False
8
+ https://openalex.org/W2949817526,https://doi.org/10.1101/037101,Read-Based Phasing of Related Individuals,Read-Based Phasing of Related Individuals,2016,2016-01-18,en,preprint,posted-content,bioRxiv (Cold Spring Harbor Laboratory),,,Cold Spring Harbor Laboratory,repository,7,False,,26,Text Readability and Simplification,0.9512,Computer Science,Artificial Intelligence,True,green,https://www.biorxiv.org/content/biorxiv/early/2016/01/18/037101.full.pdf,https://www.biorxiv.org/content/biorxiv/early/2016/01/18/037101.full.pdf,,"Abstract Motivation Read-based phasing deduces the haplotypes of an individual from sequencing reads that cover multiple variants, while genetic phasing takes only genotypes as input and applies the rules of Mendelian inheritance to infer haplotypes within a pedigree of individuals. Combining both into an approach that uses these two independent sources of information - reads and pedigree - has the potential to deliver results better than each individually. Results We provide a theoretical framework combining read-based phasing with genetic haplotyping, and describe a fixed-parameter algorithm and its implementation for finding an optimal solution. We show that leveraging reads of related individuals jointly in this way yields more phased variants and at a higher accuracy than when phased separately, both in simulated and real data. Coverages as low as 2× for each member of a trio yield haplotypes that are as accurate as when analyzed separately at 15× coverage per individual. 
Availability https://bitbucket.org/whatshap/whatshap (branch pedmec) Contact [email protected]",first,Shilpa Garg,https://openalex.org/A5060605357,https://orcid.org/0000-0003-0200-4200,False,https://openalex.org/W2420821447,https://doi.org/10.1093/bioinformatics/btw276,Read-based phasing of related individuals,Read-based phasing of related individuals,2016,2016-06-11,en,article,journal-article,Bioinformatics,1367-4803$@$1367-4811,1367-4803,Oxford University Press,journal,44,False,7.736,28,Text Readability and Simplification,0.9452,Computer Science,Artificial Intelligence,True,hybrid,https://academic.oup.com/bioinformatics/article-pdf/32/12/i234/6695672/btw276.pdf,https://academic.oup.com/bioinformatics/article-pdf/32/12/i234/6695672/btw276.pdf,cc-by-nc,"Read-based phasing deduces the haplotypes of an individual from sequencing reads that cover multiple variants, while genetic phasing takes only genotypes as input and applies the rules of Mendelian inheritance to infer haplotypes within a pedigree of individuals. Combining both into an approach that uses these two independent sources of information-reads and pedigree-has the potential to deliver results better than each individually.We provide a theoretical framework combining read-based phasing with genetic haplotyping, and describe a fixed-parameter algorithm and its implementation for finding an optimal solution. We show that leveraging reads of related individuals jointly in this way yields more phased variants and at a higher accuracy than when phased separately, both in simulated and real data. Coverages as low as 2× for each member of a trio yield haplotypes that are as accurate as when analyzed separately at 15× coverage per individual.https://bitbucket.org/whatshap/[email protected].",first,Shilpa Garg,https://openalex.org/A5060605357,https://orcid.org/0000-0003-0200-4200,False
9
+ https://openalex.org/W3163880707,https://doi.org/10.1101/2020.08.10.231720,Cell-specific imputation of drug connectivity mapping with incomplete data,Cell-specific imputation of drug connectivity mapping with incomplete data,2020,2020-08-10,en,preprint,posted-content,bioRxiv (Cold Spring Harbor Laboratory),,,Cold Spring Harbor Laboratory,repository,4,False,,33,Cell Image Analysis Techniques,0.999,"Biochemistry, Genetics and Molecular Biology",Biophysics,True,green,https://doi.org/10.1101/2020.08.10.231720,,cc-by-nc,"ABSTRACT Motivation Drug repositioning allows expedited discovery of new applications for existing compounds, but re-screening vast compound libraries is often prohibitively expensive. “Connectivity mapping” is a process that links drugs to diseases by identifying compounds whose impact on expression in a collection of cells reverses the disease’s impact on expression in disease-relevant tissues. The high throughput LINCS project has expanded the universe of compounds and cell types for which data are available, but even with this effort, many potentially clinically useful combinations are missing. To evaluate the possibility of repurposing drugs this way despite missing data, we compared collaborative filtering with either neighborhood-based or SVD imputation methods to two naive approaches via cross-validation. Results Methods were evaluated for their ability to predict drug connectivity despite missing data. Predictions improved when cell type was taken into account. Neighborhood-based collaborative filtering was the most successful method, with the best improvements in non-immortalized primary cells. We also explored which classes of compounds are most and least reliant on cell type for accurate imputation, and we identified connections between related compounds even when many were not measured in the relevant cells. 
We conclude that even for cells in which drug responses have not been fully characterized, it is possible to identify unassayed drugs that reverse in those cells the expression signatures observed in disease. Contact [email protected]",first,Diana Sapashnik,https://openalex.org/A5041230053,,False,https://openalex.org/W4321003439,https://doi.org/10.1371/journal.pone.0278289,Cell-specific imputation of drug connectivity mapping with incomplete data,Cell-specific imputation of drug connectivity mapping with incomplete data,2023,2023-02-16,en,article,journal-article,PLoS ONE,1932-6203,1932-6203,Public Library of Science,journal,1,False,0.449,33,Computational Methods in Drug Discovery,0.9992,Computer Science,Computational Theory and Mathematics,True,gold,https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0278289&type=printable,https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0278289&type=printable,public-domain,"Drug repositioning allows expedited discovery of new applications for existing compounds, but re-screening vast compound libraries is often prohibitively expensive. “Connectivity mapping” is a process that links drugs to diseases by identifying compounds whose impact on expression in a collection of cells reverses the disease’s impact on expression in disease-relevant tissues. The LINCS project has expanded the universe of compounds and cells for which data are available, but even with this effort, many clinically useful combinations are missing. To evaluate the possibility of repurposing drugs despite missing data, we compared collaborative filtering using either neighborhood-based or SVD imputation methods to two naive approaches via cross-validation. Methods were evaluated for their ability to predict drug connectivity despite missing data. Predictions improved when cell type was taken into account. 
Neighborhood collaborative filtering was the most successful method, with the best improvements in non-immortalized primary cells. We also explored which classes of compounds are most and least reliant on cell type for accurate imputation. We conclude that even for cells in which drug responses have not been fully characterized, it is possible to identify unassayed drugs that reverse in those cells the expression signatures observed in disease.",first,Diana Sapashnik,https://openalex.org/A5041230053,,True
10
+ https://openalex.org/W4323306896,https://doi.org/10.36227/techrxiv.22184071.v1,Embedded Pressure Sensing Metamaterials using TPU-Graphene Composites and Additive Manufacturing,Embedded Pressure Sensing Metamaterials using TPU-Graphene Composites and Additive Manufacturing,2023,2023-03-06,en,preprint,posted-content,,,,,,0,False,,52,Advanced Sensor and Energy Harvesting Materials,0.9998,Engineering,Biomedical Engineering,True,green,https://www.techrxiv.org/doi/pdf/10.36227/techrxiv.22184071.v1,https://www.techrxiv.org/doi/pdf/10.36227/techrxiv.22184071.v1,cc-by,"&lt;p&gt;Nearly 15% of the global population is affected by disabilities impacting mobility. Monitoring foot pressure distribution during gait is a fundamental aspect of evaluating rehabilitation. Wearable systems provide a portable alternative to stationary equipment monitoring gait without laboratory space limitations. However, wearable sensors in some applications present challenges in the calibration, sensitivity, and human-sensor interface, requiring application-specific sensors. This study aimed to develop wearable sensors where the structural and material properties can characterise the sensitivity and range of measurement during the design phase. We developed wearable piezoresistive sensors using additive manufacturing to create mechanical metamaterials with embedded pressure-sensing capabilities. The sensors were fabricated in TPU using SLS and graphene ink infusion processes. Three structural designs were developed for different measuring ranges (0 – 50 N, 0 – 100 N, and 0 – 150 N) using body-centred cubic lattices constructed via pyramid unit cells. Two graphene infusion processes were evaluated. We tested the sensors' mechanical and piezoresistive behaviour, measuring the compressive force, strain, and electrical resistance across the sensor. We analysed the influence of structural dimensions and the infusion process on the piezoresistive behaviour. 
The measuring range was affected mainly by tuneable structural dimensions. The infusion process influenced the piezoresistive sensitivity and affected the linearity response. The results indicate the characterisation of the sensitivity of piezoresistive sensors based on structural parameters and material properties. Mechanical metamaterials could embed pressure sensing in wearables, allowing for customisation based on design parameters using additive manufacture and graphene inks. &lt;/p&gt;",first,Inigo Sanz Pena,https://openalex.org/A5041979699,,True,https://openalex.org/W4380303556,https://doi.org/10.1109/jsen.2023.3283460,Embedded Pressure Sensing Metamaterials Using TPU-Graphene Composites and Additive Manufacturing,Embedded Pressure Sensing Metamaterials Using TPU-Graphene Composites and Additive Manufacturing,2023,2023-06-12,en,article,journal-article,IEEE Sensors Journal,1530-437X$@$1558-1748,1530-437X,IEEE Sensors Council,journal,4,False,1.025,54,Wearable Nanogenerator Technology,0.9999,Engineering,Biomedical Engineering,True,green,https://www.techrxiv.org/doi/pdf/10.36227/techrxiv.22184071.v1,,,"Disabilities impacting mobility are a global concern requiring gait rehabilitation, where monitoring foot pressure distribution is fundamental. Wearable systems provide an alternative to stationary equipment eliminating space limitations. However, wearable sensors present challenges in the calibration, sensitivity, and human–sensor interface, requiring application-specific sensors. This study aimed to develop wearable sensors, where the structural and material properties can characterize the sensitivity and range of measurement during the design phase. We developed wearable piezoresistive sensors using additive manufacturing (AM) to create mechanical metamaterials with embedded pressure-sensing capabilities. 
Three structural designs were developed for different measuring ranges (0–50 N, 0–100 N, and 0–150 N) using body-centered cubic (BCC) lattices constructed via pyramid unit cells. In addition, two graphene infusion processes were evaluated. We analyzed the influence of structural dimensions and the graphene infusion process on the piezoresistive response of the sensors. The measuring range was affected mainly by tunable structural dimensions, while the infusion process influenced the piezoresistive sensitivity and the linear response. The outcomes in characterizing the piezoresistive sensors based on structural and material properties could allow the development of wearables with embedded pressure sensing with a predictive response solely based on design parameters using AM and graphene inks.",first,Inigo Sanz-Pena,https://openalex.org/A5027816754,https://orcid.org/0000-0002-8282-0648,False
11
+ https://openalex.org/W3212535440,https://doi.org/10.1101/2021.11.16.468842,Both ANT and ATPase are essential for mitochondrial permeability transition but not depolarization,Both ANT and ATPase are essential for mitochondrial permeability transition but not depolarization,2021,2021-11-18,en,preprint,posted-content,bioRxiv (Cold Spring Harbor Laboratory),,,Cold Spring Harbor Laboratory,repository,0,False,,28,Mitochondrial Function and Pathology,1.0,"Biochemistry, Genetics and Molecular Biology",Molecular Biology,True,green,https://www.biorxiv.org/content/biorxiv/early/2022/02/03/2021.11.16.468842.full.pdf,https://www.biorxiv.org/content/biorxiv/early/2022/02/03/2021.11.16.468842.full.pdf,cc-by,"Abstract A sudden increase in permeability of the mitochondrial inner membrane, mitochondrial permeability transition (PT), is the central event responsible for cell death and tissue damage in conditions such as stroke and heart attack. PT is caused by the opening of the Cyclosporin A (CSA) dependent calcium-induced pore, the Permeability Transition Pore (PTP). The molecular details of PTP are incompletely understood. We utilized a combination of holographic and fluorescent microscopy to assess the contribution of the ATP synthase and Adenine Nucleotide Translocator (ANT) towards PTP. In cells lacking either ATP synthase or ANT, we observed CSA-sensitive membrane depolarization, but not high-conductance PTP. Further, we found that in wild-type cells calcium induced CSA-sensitive depolarization precedes opening of the PTP, which occurred until after nearly complete mitochondrial membrane depolarization. 
We propose that both ATP synthase and ANT are required for high conductance PTP but not depolarization, which presumably occurs through activation of the low conductance PT, which has a molecular nature that is different from both complexes.",first,Maria Neginskaya,https://openalex.org/A5011920727,https://orcid.org/0000-0001-8490-5218,True,https://openalex.org/W4307726921,https://doi.org/10.1016/j.isci.2022.105447,Both ANT and ATPase are essential for mitochondrial permeability transition but not depolarization,Both ANT and ATPase are essential for mitochondrial permeability transition but not depolarization,2022,2022-10-28,en,article,journal-article,iScience,2589-0042,2589-0042,Cell Press,journal,24,False,3.22,47,ATP Synthase and ATPases Research,1.0,"Biochemistry, Genetics and Molecular Biology",Molecular Biology,True,gold,https://www.cell.com/article/S2589004222017199/pdf,https://www.cell.com/article/S2589004222017199/pdf,cc-by,"An increase in permeability of the mitochondrial inner membrane, mitochondrial permeability transition (PT), is the central event responsible for cell death and tissue damage in conditions such as stroke and heart attack. PT is caused by the cyclosporin A (CSA)-dependent calcium-induced pore, the permeability transition pore (PTP). The molecular details of PTP are incompletely understood. We utilized holographic and fluorescent microscopy to assess the contribution of ATP synthase and adenine nucleotide translocator (ANT) toward PTP. In cells lacking either ATP synthase or ANT, we observed CSA-sensitive membrane depolarization, but not high-conductance PTP. In wild-type cells, calcium-induced CSA-sensitive depolarization preceded opening of PTP, which occurred only after nearly complete mitochondrial membrane depolarization. 
We propose that both ATP synthase and ANT are required for high-conductance PTP but not depolarization, which presumably occurs through activation of the low-conductance PT, which has a molecular nature that is different from both complexes.",first,Maria Neginskaya,https://openalex.org/A5011920727,https://orcid.org/0000-0001-8490-5218,True
requirements.txt ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.2.1
2
+ annotated-types==0.7.0
3
+ anyio==4.7.0
4
+ certifi==2024.8.30
5
+ charset-normalizer==3.4.0
6
+ click==8.1.7
7
+ ConfigArgParse==1.7
8
+ contourpy==1.3.0
9
+ cycler==0.12.1
10
+ diffdist==0.1
11
+ exceptiongroup==1.2.2
12
+ fastapi==0.115.6
13
+ ffmpy==0.4.0
14
+ filelock==3.16.1
15
+ fonttools==4.55.3
16
+ fsspec==2024.10.0
17
+ gitdb==4.0.11
18
+ GitPython==3.1.43
19
+ gradio==4.44.1
20
+ gradio_client==1.3.0
21
+ h11==0.14.0
22
+ httpcore==1.0.7
23
+ httpx==0.28.1
24
+ huggingface-hub==0.26.5
25
+ idna==3.10
26
+ importlib_resources==6.4.5
27
+ iniconfig==2.0.0
28
+ Jinja2==3.1.4
29
+ joblib==1.4.2
30
+ kiwisolver==1.4.7
31
+ markdown-it-py==3.0.0
32
+ MarkupSafe==2.1.5
33
+ matplotlib==3.9.3
34
+ mdurl==0.1.2
35
+ mpmath==1.3.0
36
+ networkx==3.2.1
37
+ nltk==3.9.1
38
+ numpy==2.0.2
39
+ orjson==3.10.12
40
+ packaging==24.2
41
+ pandas==2.2.3
42
+ pillow==10.4.0
43
+ pluggy==1.5.0
44
+ pyalex==0.15.1
45
+ pydantic==2.10.3
46
+ pydantic_core==2.27.1
47
+ pydub==0.25.1
48
+ Pygments==2.18.0
49
+ pyparsing==3.2.0
50
+ pytest==8.3.4
51
+ python-dateutil==2.9.0.post0
52
+ python-multipart==0.0.19
53
+ pytz==2024.2
54
+ PyYAML==6.0.2
55
+ regex==2024.11.6
56
+ requests==2.32.3
57
+ rich==13.9.4
58
+ ruff==0.8.2
59
+ scikit-learn==1.5.2
60
+ scipy==1.13.1
61
+ semantic-version==2.10.0
62
+ shellingham==1.5.4
63
+ six==1.16.0
64
+ smmap==5.0.1
65
+ sniffio==1.3.1
66
+ starlette==0.41.3
67
+ strconv==0.4.2
68
+ sympy==1.13.1
69
+ tabulate==0.9.0
70
+ tenacity==9.0.0
71
+ threadpoolctl==3.5.0
72
+ tomli==2.2.1
73
+ tomlkit==0.12.0
74
+ torch==2.5.1
75
+ torchaudio==2.5.1
76
+ torchvision==0.20.1
77
+ tqdm==4.67.1
78
+ typer==0.15.1
79
+ typing_extensions==4.12.2
80
+ tzdata==2024.2
81
+ urllib3==2.2.3
82
+ uvicorn==0.32.1
83
+ websockets==12.0
84
+ zipp==3.21.0
run_augmenter.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from src.utils.io_utils import PROJECT_ROOT
3
+ from src.dataset.GoodDataset import AugmentedDataset
4
+ from src.dataset.NegativeSampler import NegativeSampler
5
+ from src.utils.struct_utils import *
6
+ import os
7
+
8
class Config:
    """Settings object handed to ``NegativeSampler.create_negative_samples``.

    All values are class-level constants; an instance is created only so it
    can be passed around as a single argument.
    """

    # Pickle locations, resolved relative to the project root.
    input = os.path.join(PROJECT_ROOT, "data/positive_samples.pkl")
    output = os.path.join(PROJECT_ROOT, "data/negative_samples.pkl")

    # Seed value made available to the sampler.
    seed = 42

    # On/off switches (presumably one per negative-sampling strategy;
    # confirm against NegativeSampler).
    random = True
    fuzz_title = True
    replace_auth = True
    overlap_auth = False
    overlap_topic = False

    # Numeric knobs (their exact meaning is defined by NegativeSampler).
    factor_max = 4
    authors_to_consider = 1
    overlapping_authors = 1
    fuzz_count = 1
24
+
25
def negative_sampler(optional_path=None, factor=None, type_or_difficulty=None) -> pd.DataFrame:
    """Create negative samples from a CSV of preprint/article relationships.

    Args:
        optional_path: CSV file to load. When falsy, the Crossref
            relationships file under ``PROJECT_ROOT/data`` is used.
        factor: Accepted for interface compatibility; currently unused.
        type_or_difficulty: Accepted for interface compatibility; currently
            unused.

    Returns:
        ``dataset.negative_samples`` converted with ``custom_struct_to_df``.
    """
    default_csv = f"{PROJECT_ROOT}/data/crossref-preprint-article-relationships-Aug-2023.csv"
    datapath = optional_path or default_csv

    dataset = AugmentedDataset()
    dataset.load_csv(datapath)

    # The sampler stores its output on the dataset it was built around.
    NegativeSampler(dataset).create_negative_samples(Config())

    return custom_struct_to_df(dataset.negative_samples)
37
+
38
def positive_sampler(optional_path=None, size=10, random=True, seed=42, full=False):
    """Fetch positive samples for preprint/article pairs listed in a CSV.

    Args:
        optional_path: CSV file to load. When falsy, the Crossref
            relationships file under ``PROJECT_ROOT/data`` is used.
        size: Forwarded as ``num_samples``.
        random: Forwarded as ``random``.
        seed: Forwarded as ``seed``.
        full: Forwarded as ``full``.

    Returns:
        ``dataset.positive_samples`` converted with ``custom_struct_to_df``.
    """
    default_csv = f"{PROJECT_ROOT}/data/crossref-preprint-article-relationships-Aug-2023.csv"
    dataset = AugmentedDataset(optional_path or default_csv)

    fetch_options = {
        "num_samples": size,
        "random": random,
        "seed": seed,
        "full": full,
    }
    dataset.fetch_positive_samples_parallel(**fetch_options)

    return custom_struct_to_df(dataset.positive_samples)
scrap.txt ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Template for OpenAlex with all features enabled.
# Each section is a {feature_name: True} mapping, built with dict.fromkeys.
alex_template = OpenAlexKeys(
    basic=dict.fromkeys(
        (
            "id",
            "doi",
            "title",
            "display_name",
            "publication_year",
            "publication_date",
            "language",
            "type",
            "type_crossref",
        ),
        True,
    ),
    source=dict.fromkeys(
        ("journal_name", "issn", "issn_l", "publisher", "type"),
        True,
    ),
    authors=dict.fromkeys(
        ("position", "name", "id", "orcid", "is_corresponding", "affiliations"),
        True,
    ),
    metrics=dict.fromkeys(
        (
            "cited_by_count",
            "cited_by_percentile",
            "is_retracted",
            "fwci",
            "referenced_works_count",
        ),
        True,
    ),
    classification=dict.fromkeys(
        ("primary_topic", "topics", "concepts"),
        True,
    ),
    access=dict.fromkeys(
        ("is_oa", "oa_status", "oa_url", "pdf_url", "license"),
        True,
    ),
    related_works=dict.fromkeys(
        ("references", "referenced_by_count", "related"),
        True,
    ),
    abstract=True,
)

# Template for Elsevier with all features enabled.
elsevier_template = ElsevierKeys(
    basic=dict.fromkeys(
        ("title", "doi", "publication_name", "pub_type", "publication_date"),
        True,
    ),
    biblio=dict.fromkeys(
        ("volume", "issue", "pages", "issn"),
        True,
    ),
    authors=dict.fromkeys(
        ("given_name", "surname", "affiliations", "is_corresponding"),
        True,
    ),
    abstract=True,
    subject_areas=True,
    metrics=dict.fromkeys(
        ("citation_count", "source_citations"),
        True,
    ),
    funding=True,
)

# Full configuration combining both templates.
full_config = ConfigAugmentation(
    alex=alex_template,
    elsevier=elsevier_template,
)
src/.DS_Store ADDED
Binary file (6.15 kB). View file
 
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (177 Bytes). View file
 
src/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (156 Bytes). View file
 
src/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (165 Bytes). View file
 
src/dataset/DataAugmenter.py ADDED
@@ -0,0 +1,344 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from enum import Enum
2
+ from typing import List, Dict, Any
3
+ from dataclasses import dataclass
4
+
5
+ import os
6
+ import yaml
7
+
8
+ import pyalex
9
+ from pyalex import Works
10
+ from src.utils.io_utils import PROJECT_ROOT
11
+
12
+
13
@dataclass
class ConfigAugmentation:
    """Switches selecting which OpenAlex features are kept.

    Each section maps a feature name to a flag; a section left at ``None``
    is treated as disabled by code that tests the attribute for truthiness.
    """

    # id, doi, title, etc.
    basic: Dict[str, bool] = None
    # Journal information.
    source: Dict[str, bool] = None
    # Author details.
    authors: Dict[str, bool] = None
    # Citations, fwci, etc.
    metrics: Dict[str, bool] = None
    # Topics and concepts.
    classification: Dict[str, bool] = None
    # Open-access status.
    access: Dict[str, bool] = None
    # References.
    related_works: Dict[str, bool] = None
    # Single flag rather than a mapping.
    abstract: bool = False
24
+
25
class DatasetType(Enum):
    """Identifiers for the dataset variants: raw or augmented, full or partial."""

    # Raw variants.
    FULL_RAW = "full_raw"
    PARTIAL_RAW = "partial_raw"

    # Augmented variants.
    FULL_AUGMENTED = "full_augmented"
    PARTIAL_AUGMENTED = "partial_augmented"
30
+
31
+
32
@dataclass
class Field:
    """One extractable value: an output name plus the key path leading to it."""

    # Key under which the extracted value is stored.
    name: str
    # Successive keys to follow inside the nested source mapping.
    path: List[str]
    # Fallback value carried with the field definition.
    default: Any = None
38
+
39
class AlexFields:
    """Tables of ``Field`` definitions, grouped by section of an OpenAlex work."""

    # Top-level keys: the output name is also the single path element.
    BASIC = [
        Field(key, [key])
        for key in (
            "id",
            "doi",
            "title",
            "display_name",
            "publication_year",
            "publication_date",
            "language",
            "type",
            "type_crossref",
        )
    ]

    # Values located under primary_location -> source.
    SOURCE = [
        Field(name, ["primary_location", "source", leaf])
        for name, leaf in (
            ("journal_name", "display_name"),
            ("issn", "issn"),
            ("issn_l", "issn_l"),
            ("publisher", "host_organization_name"),
            ("type", "type"),
        )
    ]

    # Top-level metric keys; only cited_by_percentile is renamed.
    METRICS = [
        Field(name, [leaf])
        for name, leaf in (
            ("cited_by_count", "cited_by_count"),
            ("cited_by_percentile", "citation_normalized_percentile"),
            ("is_retracted", "is_retracted"),
            ("fwci", "fwci"),
            ("referenced_works_count", "referenced_works_count"),
        )
    ]

    # Access values are split between open_access and primary_location.
    ACCESS = [
        Field(name, [parent, name])
        for parent, name in (
            ("open_access", "is_oa"),
            ("open_access", "oa_status"),
            ("open_access", "oa_url"),
            ("primary_location", "pdf_url"),
            ("primary_location", "license"),
        )
    ]
77
+
78
def get_nested_value(data: Dict, path: List[str], default: Any = None) -> Any:
    """Walk ``data`` along ``path`` and return the value found at the end.

    Args:
        data: Nested mapping to read from.
        path: Keys to follow, outermost first. An empty path returns
            ``data`` itself.
        default: Returned when a step raises ``KeyError`` or ``TypeError``
            (missing key, or an intermediate value that cannot be indexed).

    Returns:
        The value at the end of the path, or ``default``.
    """
    current = data
    for step in path:
        try:
            current = current[step]
        except (KeyError, TypeError):
            break
    else:
        # Every step succeeded.
        return current
    return default
87
+
88
class DataAugmenter:
    """Fetch OpenAlex metadata for a DOI and trim it to the configured features."""

    def __init__(self):
        """Load the user profile, install the default filters and configure pyalex."""
        self.profile = self._load_profile()
        self.email = self.profile["email"]
        self.filters = ConfigAugmentation(
            basic={
                "id": True,
                "doi": True,
                "title": True,
                "display_name": True,
                "publication_year": True,
                "publication_date": True,
                "language": True,
                "type": True,
                "type_crossref": True
            },
            source={
                "journal_name": True,
                "issn": True,
                "issn_l": True,
                "publisher": True,
                "type": True
            },
            authors={
                "position": True,
                "name": True,
                "id": True,
                "orcid": True,
                "is_corresponding": True,
                "affiliations": False
            },
            metrics={
                "cited_by_count": True,
                "cited_by_percentile": False,
                "is_retracted": True,
                "fwci": True,
                "referenced_works_count": True
            },
            classification={
                "primary_topic": True,
                "topics": False,
                "concepts": False,
            },
            access={
                "is_oa": True,
                "oa_status": True,
                "oa_url": True,
                "pdf_url": True,
                "license": True
            },
            related_works={
                "references": True,
                "referenced_by_count": True,
                "related": True
            },
            abstract=True
        )

        pyalex.config.email = self.email

    def _load_profile(self) -> Dict[str, str]:
        """Read ``user_information/profile.yaml`` and return the e-mail it contains.

        Returns:
            A dict with the single key ``"email"``.

        Raises:
            AssertionError: if the project root folder is not named
                ``MatchingPubs`` or the profile file does not exist.
            KeyError: if the profile has no ``email`` entry.
        """
        profile_path = f"{PROJECT_ROOT}/user_information/profile.yaml"

        # NOTE(review): these asserts are skipped under `python -O`; they are kept
        # so callers that expect AssertionError keep working.
        assert str(PROJECT_ROOT).split("/")[-1] == "MatchingPubs", "Please run this script in the github repo folder "
        assert os.path.exists(profile_path), "create a profile.yaml with your email (email:) and your api key (api_key:). Go here to get one https://dev.elsevier.com/"

        with open(profile_path, "r") as f:
            profile = yaml.safe_load(f)

        return {
            "email": profile["email"]
        }

    @staticmethod
    def _extract_fields(work, fields) -> Dict[str, Any]:
        """Return ``{field.name: value}`` for every ``Field`` in ``fields``."""
        return {
            field.name: get_nested_value(work, field.path, None)
            for field in fields
        }

    @staticmethod
    def _extract_authors(work):
        """Return one dict per authorship (with affiliations), or None on malformed data."""
        try:
            authors = []
            # `or` guards: a key that is present but null must not abort the section.
            for auth in work.get("authorships") or []:
                person = auth.get("author") or {}
                authors.append({
                    "position": auth.get("author_position", None),
                    "name": person.get("display_name", None),
                    "id": person.get("id", None),
                    "orcid": person.get("orcid", None),
                    "is_corresponding": auth.get("is_corresponding", None),
                    "affiliations": [
                        {
                            "name": inst.get("display_name", None),
                            "id": inst.get("id", None),
                            "country": inst.get("country_code", None),
                            "type": inst.get("type", None),
                            "ror": inst.get("ror", None)
                        }
                        for inst in auth.get("institutions") or []
                    ]
                })
            return authors
        except Exception:
            # Best effort: an unexpected shape only costs this section.
            return None

    @staticmethod
    def _extract_classification(work):
        """Return primary topic, topics and concepts, or None on malformed data."""
        try:
            # primary_topic may be present but null; treat that as "no values".
            primary = work.get("primary_topic") or {}
            return {
                "primary_topic": {
                    "name": primary.get("display_name", None),
                    "score": primary.get("score", None),
                    "field": (primary.get("field") or {}).get("display_name", None),
                    "subfield": (primary.get("subfield") or {}).get("display_name", None)
                },
                "topics": [
                    {
                        "name": topic.get("display_name", None),
                        "score": topic.get("score", None),
                        "field": (topic.get("field") or {}).get("display_name", None)
                    }
                    for topic in work.get("topics") or []
                ],
                "concepts": [
                    {
                        "name": concept.get("display_name", None),
                        "level": concept.get("level", None),
                        "score": concept.get("score", None),
                        "wikidata": concept.get("wikidata", None)
                    }
                    for concept in work.get("concepts") or []
                ]
            }
        except Exception:
            # Best effort: an unexpected shape only costs this section.
            return None

    @staticmethod
    def _rebuild_abstract(work):
        """Rebuild the abstract text from ``abstract_inverted_index``.

        The index maps each word to the positions where it occurs; the words
        are written back at those positions and joined with single spaces.
        Returns None when the index is missing, empty or malformed.
        """
        try:
            if "abstract_inverted_index" not in work:
                return None
            inverted = work["abstract_inverted_index"]
            if not inverted:
                return None
            last_position = max(max(positions) for positions in inverted.values())
            words = [""] * (last_position + 1)
            for word, positions in inverted.items():
                for pos in positions:
                    words[pos] = word
            return " ".join(words)
        except Exception:
            return None

    def get_alex_features(self, doi: str) -> Dict:
        """Extract all OpenAlex features for a DOI.

        Args:
            doi: DOI, looked up as ``https://doi.org/{doi}``.

        Returns:
            A dict with the keys ``basic``, ``source``, ``authors``,
            ``classification``, ``metrics``, ``access`` and ``abstract``.
            ``authors``, ``classification`` and ``abstract`` are None when they
            cannot be built. If the lookup itself fails, the error is printed
            and an empty dict is returned.
        """
        try:
            work = Works()[f"https://doi.org/{doi}"]
            # Insertion order matters: it becomes the column order downstream.
            return {
                "basic": self._extract_fields(work, AlexFields.BASIC),
                "source": self._extract_fields(work, AlexFields.SOURCE),
                "authors": self._extract_authors(work),
                "classification": self._extract_classification(work),
                "metrics": self._extract_fields(work, AlexFields.METRICS),
                "access": self._extract_fields(work, AlexFields.ACCESS),
                "abstract": self._rebuild_abstract(work),
            }
        except Exception as e:
            print(f"OpenAlex error for DOI {doi}: {e}")
            return {}

    def filter_augmented_data(self, data: Dict[str, Any], config: ConfigAugmentation = None) -> Dict[str, Any]:
        """Filter data based on configuration

        Args:
            data: Dictionary containing raw data, as produced by
                ``get_alex_features``. Sections that are missing or None are
                treated as empty.
            config: Configuration specifying which features to include;
                defaults to ``self.filters``. ``config.authors["affiliations"]``
                may be a bool (keep or drop affiliations as a whole) or a dict
                of per-key flags applied to every affiliation.

        Returns:
            Filtered dictionary containing only the configured features
        """
        config = config or self.filters

        def filter_section(section_data: Dict[str, Any], section_config: Dict[str, bool]) -> Dict[str, Any]:
            """Keep only the entries whose key is enabled in ``section_config``."""
            return {k: v for k, v in section_data.items() if section_config.get(k)}

        alex_filtered = {}

        # Basic metadata
        if config.basic:
            alex_filtered["basic"] = filter_section(data.get("basic") or {}, config.basic)

        # Source/journal info
        if config.source:
            alex_filtered["source"] = filter_section(data.get("source") or {}, config.source)

        # Authors
        if config.authors:
            affiliations_config = config.authors.get("affiliations", False)
            filtered_authors = []
            # get_alex_features stores None when the authors could not be parsed.
            for author in data.get("authors") or []:
                filtered_author = filter_section(author, config.authors)
                if affiliations_config:
                    affiliations = author.get("affiliations") or []
                    # A plain True keeps each affiliation untouched; only a
                    # dict can select individual affiliation keys.
                    if isinstance(affiliations_config, dict):
                        affiliations = [
                            filter_section(aff, affiliations_config)
                            for aff in affiliations
                        ]
                    filtered_author["affiliations"] = affiliations
                filtered_authors.append(filtered_author)
            alex_filtered["authors"] = filtered_authors

        # Metrics
        if config.metrics:
            alex_filtered["metrics"] = filter_section(data.get("metrics") or {}, config.metrics)

        # Classification (may be None when it could not be parsed)
        if config.classification:
            alex_filtered["classification"] = filter_section(
                data.get("classification") or {}, config.classification
            )

        # Access info
        if config.access:
            alex_filtered["access"] = filter_section(data.get("access") or {}, config.access)

        # Related works
        if config.related_works:
            alex_filtered["related_works"] = filter_section(data.get("related_works") or {}, config.related_works)

        # Abstract
        if config.abstract and "abstract" in data:
            alex_filtered["abstract"] = data["abstract"]

        return alex_filtered
src/dataset/Dataset.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.dataset.DataAugmenter import *
2
+ import pandas as pd
3
+ from tqdm import tqdm
4
+ import numpy as np
5
+
6
class FullAugmentedDataset:
    """Build a labelled preprint/article dataset augmented with OpenAlex features."""

    def __init__(self):
        """Create the augmenter and load the raw relationships CSV."""
        self.augmenter = DataAugmenter()
        self.full_raw_dataset = self._load_the_dataset()

    def _load_the_dataset(self, type: DatasetType = DatasetType.FULL_RAW) -> pd.DataFrame:
        """Load as csv file one of the datasets for training.

        Args:
            type: Dataset variant; only ``DatasetType.FULL_RAW`` is supported.

        Raises:
            AssertionError: if the project root folder is not ``MatchingPubs``.
            NotImplementedError: for any other dataset type (previously this
                returned None silently).
        """
        assert str(PROJECT_ROOT).split("/")[-1] == "MatchingPubs", "Please run this script in the github repo folder "

        if type == DatasetType.FULL_RAW:
            return pd.read_csv(f"{PROJECT_ROOT}/data/crossref-preprint-article-relationships-Aug-2023.csv")
        raise NotImplementedError(f"Loading dataset type {type} is not supported")

    def retrieve_dois_couple(self, len: int = 1, random: bool = False, seed: int = None, full: bool = False):
        """Retrieve (preprint_doi, article_doi) pairs from the raw dataset.

        Args:
            len: Number of pairs to return when ``full`` is False.
            random: Sample the pairs instead of taking the first ``len`` rows.
            seed: Random state used when sampling.
            full: Return every pair; ``len``, ``random`` and ``seed`` are ignored.

        Returns:
            A numpy array with one ``[preprint_doi, article_doi]`` row per pair.
        """
        columns = ["preprint_doi", "article_doi"]
        # `full` wins, so check it first and skip the pointless sampling.
        if full:
            selected = self.full_raw_dataset
        elif random:
            selected = self.full_raw_dataset.sample(n=len, random_state=seed)
        else:
            selected = self.full_raw_dataset.head(len)
        return selected[columns].to_numpy()

    @staticmethod
    def _flatten_list(lst):
        """
        Flattens a nested list into a single list. If the input is not nested, it returns the original list.
        Handles cases where some elements are lists and others are not.

        Raises:
            ValueError: if ``lst`` is not a list.
        """
        if not isinstance(lst, list):  # Ensure the input is a list
            raise ValueError("Input must be a list")

        def _flatten(sublist):
            for item in sublist:
                if isinstance(item, list):  # Check if the item is a list
                    yield from _flatten(item)  # Recursively flatten the list
                else:
                    yield item  # Yield the non-list item

        return list(_flatten(lst))

    def _augmented_data_to_row(self, filtered_data: Dict[str, Any], preprint: bool = True) -> pd.DataFrame:
        """Transform filtered augmented data into a one-row DataFrame.

        Args:
            filtered_data: Dictionary of filtered OpenAlex data. It is not
                modified; a shallow copy is used.
            preprint: If True, use prpnt_ prefix, else use article_ prefix

        Returns:
            pd.DataFrame: the flattened data as a single row. List values are
            flattened and joined with ``$@$``. An input without any leaf
            value gives an empty DataFrame.
        """
        record = dict(filtered_data)
        # Turn the list of author dicts into authors_<key> columns; tolerate
        # a missing "authors" entry.
        authors = record.pop("authors", None)
        author_columns = FullAugmentedDataset.filter_author(authors)
        record.update({f"authors_{k}": v for k, v in author_columns.items()})

        flat = FullAugmentedDataset.flatten_dict(record, preprint=preprint)

        # Wrap every value in a one-element list so the constructor always
        # builds exactly one row, even when every value is a joined string.
        row = {
            key: ["$@$".join(map(str, FullAugmentedDataset._flatten_list(value)))]
            if isinstance(value, list)
            else [value]
            for key, value in flat.items()
        }
        return pd.DataFrame(row)

    @staticmethod
    def filter_author(authors_info: list) -> dict:
        """Turn a list of author dicts into a dict of per-key lists.

        The keys come from the first author; an author lacking a key
        contributes None. Empty or malformed input gives an empty dict.
        """
        if not authors_info:
            return {}
        try:
            relevant_keys = list(authors_info[0].keys())
            return {
                key: [author.get(key) for author in authors_info]
                for key in relevant_keys
            }
        except (AttributeError, TypeError, KeyError, IndexError):
            # Best effort: input that is not a list of dicts yields no columns.
            return {}

    @staticmethod
    def flatten_dict(d: dict, parent_key: str = '', sep: str = '_', preprint = True) -> dict:
        """Flatten a nested dictionary.

        Args:
            d (dict): The dictionary to flatten.
            parent_key (str): The base key string to use for the flattened keys.
            sep (str): The separator to use between parent and child keys.
            preprint: If True every key gets the ``prpnt_`` prefix, otherwise
                ``article_``.

        Returns:
            dict: The flattened dictionary. Empty nested dicts contribute no key.
        """
        addition = "prpnt_" if preprint else "article_"

        def _flatten_dict(d: dict, parent_key: str = '', sep: str = '_') -> dict:
            items = []
            for k, v in d.items():
                new_key = f"{parent_key}{sep}{k}" if parent_key else k
                if isinstance(v, dict):
                    items.extend(_flatten_dict(v, new_key, sep=sep).items())
                else:
                    items.append((new_key, v))
            return dict(items)

        return {f"{addition}{k}": v for k, v in _flatten_dict(d, parent_key, sep).items()}

    def process_pair(self, dois) -> list:
        """Augment every (preprint_doi, article_doi) pair.

        Returns:
            A list with one ``[preprint_row, article_row]`` entry per pair,
            each row being a one-row DataFrame.

        Raises:
            AssertionError: if ``dois`` is empty.
        """
        assert len(dois) > 0
        rows = []
        for preprint_doi, article_doi in tqdm(dois):
            # Get preprint features
            preprint_features = self.augmenter.get_alex_features(preprint_doi)  # augment with all the features
            preprint_filtered = self.augmenter.filter_augmented_data(preprint_features)  # filter the relevant features
            preprint_row = self._augmented_data_to_row(preprint_filtered, True)

            # Get article features
            article_features = self.augmenter.get_alex_features(article_doi)  # augment with all the features
            article_filtered = self.augmenter.filter_augmented_data(article_features)
            article_row = self._augmented_data_to_row(article_filtered, False)

            rows.append([preprint_row, article_row])

        return rows

    @staticmethod
    def transform_array(input_array, factor):
        """Build the labelled dataset: one positive and up to ``factor`` negatives per pair.

        Each pair gives a row labelled 1. Negatives (label 0) combine the
        pair's preprint with the article of other, randomly chosen pairs.

        Args:
            input_array: Sequence of ``[preprint_row, article_row]`` entries.
            factor: Negatives wanted per pair; capped at the number of other
                pairs available.

        Returns:
            The concatenated DataFrame with a ``label`` column; empty when
            ``input_array`` is empty.
        """
        output_list = []

        for i, row in enumerate(input_array):
            other_indices = np.array([j for j in range(len(input_array)) if j != i])
            # Sampling without replacement cannot draw more than what exists.
            sample_size = min(factor, len(other_indices))
            sampled_indices = np.random.choice(other_indices, size=sample_size, replace=False)
            sampled_rows = [input_array[int(j)] for j in sampled_indices]

            output_list.append(pd.concat([row[0], row[1], pd.DataFrame(data=[1], columns=['label'])], axis=1))
            for B in sampled_rows:
                output_list.append(pd.concat([row[0], B[1], pd.DataFrame(data=[0], columns=['label'])], axis=1))

        if not output_list:
            return pd.DataFrame()
        return pd.concat(output_list).reset_index(drop=True)

    def get_full_dataset(self, len: int = 1, random: bool = True, seed: int = 42, full: bool = True, factor: int = 4) -> pd.DataFrame:
        """Process all DOI pairs and return full dataset

        Args:
            len, random, seed, full: forwarded to ``retrieve_dois_couple``.
            factor: Negatives per positive pair (default 4).

        Returns:
            The labelled dataset, also stored on ``self.augmented_df``.
        """
        dois = self.retrieve_dois_couple(len, random, seed, full)
        self.augmented_df = FullAugmentedDataset.transform_array(self.process_pair(dois), factor=factor)
        return self.augmented_df
src/dataset/GoodDataAugmenter.py ADDED
@@ -0,0 +1,361 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from enum import Enum
2
+ from typing import List, Dict, Any
3
+ from dataclasses import dataclass
4
+
5
+ import os
6
+ import yaml
7
+
8
+ import pyalex
9
+ from pyalex import Works
10
+ from src.utils.io_utils import PROJECT_ROOT
11
+
12
+ import time
13
+ from requests.exceptions import RequestException
14
+ from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type, wait_fixed
15
+
16
+
17
+
18
@dataclass
class ConfigAugmentation:
    """Switches selecting which OpenAlex features survive filtering.

    Every dict field maps a feature name to ``True`` (keep) or ``False``
    (drop); a field left at ``None`` disables its whole section.
    """

    # id, doi, title, etc
    basic: Dict[str, bool] = None
    # journal info
    source: Dict[str, bool] = None
    # author details
    authors: Dict[str, bool] = None
    # citations, fwci, etc
    metrics: Dict[str, bool] = None
    # topics, concepts
    classification: Dict[str, bool] = None
    # OA status
    access: Dict[str, bool] = None
    # references
    related_works: Dict[str, bool] = None
    # whether the reconstructed abstract text is kept
    abstract: bool = False
29
+
30
class DatasetType(Enum):
    """Names for the dataset variants: full/partial crossed with raw/augmented."""

    FULL_RAW = "full_raw"
    PARTIAL_RAW = "partial_raw"
    FULL_AUGMENTED = "full_augmented"
    PARTIAL_AUGMENTED = "partial_augmented"
35
+
36
+
37
@dataclass
class Field:
    """Describes one value to pull out of an OpenAlex work record."""

    # key under which the extracted value is stored
    name: str
    # successive keys to follow into the nested record
    path: List[str]
    # fallback declared for the field
    default: Any = None
43
+
44
class AlexFields:
    """OpenAlex field definitions, grouped by the section they populate."""

    # Top-level keys of the work record; the path is just the key itself.
    BASIC = [
        Field(key, [key])
        for key in (
            "id",
            "doi",
            "title",
            "display_name",
            "publication_year",
            "publication_date",
            "language",
            "type",
            "type_crossref",
        )
    ]

    # Everything here lives under primary_location -> source.
    SOURCE = [
        Field(name, ["primary_location", "source", key])
        for name, key in (
            ("journal_name", "display_name"),
            ("issn", "issn"),
            ("issn_l", "issn_l"),
            ("publisher", "host_organization_name"),
            ("type", "type"),
        )
    ]

    # Top-level metrics; the stored name and the record key may differ.
    METRICS = [
        Field(name, [key])
        for name, key in (
            ("cited_by_count", "cited_by_count"),
            ("cited_by_percentile", "citation_normalized_percentile"),
            ("is_retracted", "is_retracted"),
            ("fwci", "fwci"),
            ("referenced_works_count", "referenced_works_count"),
        )
    ]

    # Open-access flags plus the PDF/license of the primary location.
    ACCESS = [
        Field(name, [parent, name])
        for parent, name in (
            ("open_access", "is_oa"),
            ("open_access", "oa_status"),
            ("open_access", "oa_url"),
            ("primary_location", "pdf_url"),
            ("primary_location", "license"),
        )
    ]
82
+
83
def get_nested_value(data: Dict, path: List[str], default: Any = None) -> Any:
    """Walk ``path`` through nested containers and return what is found.

    Args:
        data: Outermost container.
        path: Keys to apply one after another (``data[path[0]][path[1]]...``).
        default: Returned as soon as a lookup raises ``KeyError`` or
            ``TypeError`` (missing key, or a non-subscriptable value such as
            ``None`` along the way).

    Returns:
        The value at the end of the path, ``data`` itself for an empty path,
        or ``default`` when the path cannot be followed.
    """
    # Build the iterator outside the ``try`` so a non-iterable ``path`` still
    # raises instead of being mistaken for a failed lookup.
    steps = iter(path)
    current = data
    try:
        for step in steps:
            current = current[step]
    except (KeyError, TypeError):
        return default
    return current
92
+
93
+
94
+ class DataAugmenter:
95
+ """Class for augmenting data with OpenAlex features"""
96
+
97
def __init__(self):
    """Load the user profile, register its email with pyalex, set default filters.

    ``self.filters`` is the ``ConfigAugmentation`` used by
    ``filter_augmented_data`` when no explicit configuration is passed.
    """
    self.profile = self._load_profile()
    self.email = self.profile["email"]
    pyalex.config.email = self.email

    def all_on(*names):
        # Section in which every listed feature is enabled.
        return dict.fromkeys(names, True)

    self.filters = ConfigAugmentation(
        basic=all_on(
            "id", "doi", "title", "display_name", "publication_year",
            "publication_date", "language", "type", "type_crossref",
        ),
        source=all_on("journal_name", "issn", "issn_l", "publisher", "type"),
        authors={
            **all_on("position", "name", "id", "orcid", "is_corresponding"),
            "affiliations": False,
        },
        metrics={
            "cited_by_count": True,
            "cited_by_percentile": False,
            "is_retracted": True,
            "fwci": True,
            "referenced_works_count": True,
        },
        classification={
            "primary_topic": True,
            "topics": False,
            "concepts": False,
        },
        access=all_on("is_oa", "oa_status", "oa_url", "pdf_url", "license"),
        related_works=all_on("references", "referenced_by_count", "related"),
        abstract=True,
    )
156
+
157
def _load_profile(self) -> Dict[str, str]:
    """Read the contact email from ``user_information/profile.yaml``.

    Returns:
        Dict[str, str]: ``{"email": <value from the profile>}``.

    Raises:
        AssertionError: If ``PROJECT_ROOT`` is not the ``MatchingPubs`` folder
            or the profile file does not exist.
        KeyError: If the profile has no ``email`` entry.
    """
    profile_path = f"{PROJECT_ROOT}/user_information/profile.yaml"

    # basename/normpath instead of split("/") so a trailing separator or a
    # Windows-style path does not defeat the check.
    root_name = os.path.basename(os.path.normpath(str(PROJECT_ROOT)))

    # Explicit raises keep the historical AssertionError type while surviving
    # ``python -O``, which strips ``assert`` statements.
    if root_name != "MatchingPubs":
        raise AssertionError("Please run this script in the github repo folder ")
    if not os.path.exists(profile_path):
        raise AssertionError(
            "create a profile.yaml with your email (email:) and your api key (api_key:). Go here to get one https://dev.elsevier.com/"
        )

    with open(profile_path, "r", encoding="utf-8") as f:
        profile = yaml.safe_load(f)

    # An empty file parses to None and a list-style file to a list; report
    # both the same way as a missing key.
    if not isinstance(profile, dict) or "email" not in profile:
        raise KeyError(f"'email' entry missing from {profile_path}")

    return {"email": profile["email"]}
171
+
172
@retry(
    stop=stop_after_attempt(5),  # Retry up to 5 times
    wait=wait_exponential(multiplier=1, min=1, max=60),  # Exponential backoff
    retry=retry_if_exception_type(RequestException)
)
def get_alex_features(self, doi: str) -> Dict:
    """Fetch one work from OpenAlex and arrange its features by section.

    Args:
        doi: Bare DOI; it is looked up as ``https://doi.org/<doi>``.

    Returns:
        Dict with the keys ``basic``, ``source``, ``authors``,
        ``classification``, ``metrics``, ``access`` and ``abstract`` (in that
        order). ``authors``, ``classification`` and ``abstract`` are
        best-effort: if building one of them fails it is set to ``None`` and
        the remaining sections are still returned.

    Raises:
        Exception: Any error from the lookup itself is logged and re-raised.
            ``RequestException`` additionally triggers the retry policy above.
    """
    def extract(fields):
        # Plain path lookups; missing values become None.
        return {field.name: get_nested_value(work, field.path, None) for field in fields}

    try:
        work = Works()[f"https://doi.org/{doi}"]
        result = {}

        # Basic metadata
        result["basic"] = extract(AlexFields.BASIC)

        # Source/journal info
        result["source"] = extract(AlexFields.SOURCE)

        # Authors with affiliations. ``or {}`` / ``or []`` matter: a key that
        # is present with a null value makes ``.get(key, {})`` return None.
        try:
            authors = []
            for auth in work.get("authorships") or []:
                person = auth.get("author") or {}
                authors.append({
                    "position": auth.get("author_position", None),
                    "name": person.get("display_name", None),
                    "id": person.get("id", None),
                    "orcid": person.get("orcid", None),
                    "is_corresponding": auth.get("is_corresponding", None),
                    "affiliations": [
                        {
                            "name": inst.get("display_name", None),
                            "id": inst.get("id", None),
                            "country": inst.get("country_code", None),
                            "type": inst.get("type", None),
                            "ror": inst.get("ror", None)
                        }
                        for inst in auth.get("institutions") or []
                    ]
                })
            result["authors"] = authors
        except Exception:  # best-effort section; never mask KeyboardInterrupt
            result["authors"] = None

        # Topics and classifications. A null ``primary_topic`` used to raise
        # inside the chained ``.get`` calls and wipe topics/concepts as well.
        try:
            primary = work.get("primary_topic") or {}
            result["classification"] = {
                "primary_topic": {
                    "name": primary.get("display_name", None),
                    "score": primary.get("score", None),
                    "field": (primary.get("field") or {}).get("display_name", None),
                    "subfield": (primary.get("subfield") or {}).get("display_name", None)
                },
                "topics": [
                    {
                        "name": topic.get("display_name", None),
                        "score": topic.get("score", None),
                        "field": (topic.get("field") or {}).get("display_name", None)
                    }
                    for topic in work.get("topics") or []
                ],
                "concepts": [
                    {
                        "name": concept.get("display_name", None),
                        "level": concept.get("level", None),
                        "score": concept.get("score", None),
                        "wikidata": concept.get("wikidata", None)
                    }
                    for concept in work.get("concepts") or []
                ]
            }
        except Exception:
            result["classification"] = None

        # Metrics
        result["metrics"] = extract(AlexFields.METRICS)

        # Access info
        result["access"] = extract(AlexFields.ACCESS)

        # Abstract: rebuild the text from the word -> positions inverted index.
        try:
            inverted_index = work.get("abstract_inverted_index")
            if inverted_index:
                max_pos = max(max(positions) for positions in inverted_index.values())
                words = [""] * (max_pos + 1)
                for word, positions in inverted_index.items():
                    for pos in positions:
                        words[pos] = word
                result["abstract"] = " ".join(words)
            else:
                result["abstract"] = None
        except Exception:
            result["abstract"] = None

        return result

    except Exception:
        print(f"OpenAlex error for DOI {doi}")
        raise
287
+
288
def filter_augmented_data(self, data: Dict[str, Any], config: "ConfigAugmentation" = None) -> Dict[str, Any]:
    """Keep only the features enabled in ``config``.

    Args:
        data: Sectioned record as produced by ``get_alex_features``. Sections
            may be missing or ``None``.
        config: Object whose ``basic``, ``source``, ``authors``, ``metrics``,
            ``classification``, ``access`` and ``related_works`` attributes map
            feature names to booleans, plus a boolean ``abstract``. Defaults to
            ``self.filters``. A falsy section attribute drops that section.

    Returns:
        Dict containing one entry per enabled section. ``authors`` is always a
        list (empty when the record has none), other sections are dicts, and
        ``abstract`` is copied through when enabled and present.
    """
    config = config or self.filters

    def filter_section(section_data: Dict[str, Any], section_config: Dict[str, bool]) -> Dict[str, Any]:
        """Return the entries of ``section_data`` whose key is switched on."""
        if not isinstance(section_data, dict):
            return {}
        return {k: v for k, v in section_data.items() if section_config.get(k)}

    alex_filtered = {}

    # Basic metadata
    if config.basic:
        alex_filtered["basic"] = filter_section(data.get("basic", {}), config.basic)

    # Source/journal info
    if config.source:
        alex_filtered["source"] = filter_section(data.get("source", {}), config.source)

    # Authors
    if config.authors:
        # The switch may be a plain bool (keep/drop affiliations as they are)
        # or a dict of per-field switches; a bool used to be passed on as if
        # it were a dict, which raised TypeError once affiliations were enabled.
        affiliation_switch = config.authors.get("affiliations", False)
        filtered_authors = []
        # ``authors`` is None when extraction failed upstream.
        for author in data.get("authors") or []:
            filtered_author = filter_section(author, config.authors)
            if affiliation_switch:
                affiliations = author.get("affiliations") or []
                if isinstance(affiliation_switch, dict):
                    filtered_author["affiliations"] = [
                        filter_section(aff, affiliation_switch) for aff in affiliations
                    ]
                else:
                    filtered_author["affiliations"] = list(affiliations)
            filtered_authors.append(filtered_author)
        alex_filtered["authors"] = filtered_authors

    # Metrics
    if config.metrics:
        alex_filtered["metrics"] = filter_section(data.get("metrics", {}), config.metrics)

    # Classification
    if config.classification:
        alex_filtered["classification"] = filter_section(
            data.get("classification", {}), config.classification
        )

    # Access info
    if config.access:
        alex_filtered["access"] = filter_section(data.get("access", {}), config.access)

    # Related works
    if config.related_works:
        alex_filtered["related_works"] = filter_section(data.get("related_works", {}), config.related_works)

    # Abstract
    if config.abstract and "abstract" in data:
        alex_filtered["abstract"] = data["abstract"]

    return alex_filtered
359
+
360
+
361
+
src/dataset/GoodDataset.py ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.dataset.GoodDataAugmenter import *
2
+ from src.utils.struct_utils import *
3
+ import pandas as pd
4
+ from tqdm import tqdm
5
+ import numpy as np
6
+ from concurrent.futures import ThreadPoolExecutor, as_completed
7
+ import pickle as pkl
8
+
9
+ class AugmentedDataset:
10
def __init__(self, path: str = None):
    """Create the augmenter and load the raw DOI-pair table.

    Args:
        path: Optional CSV to load instead of the bundled Crossref file;
            passed straight to ``self._load_the_dataset``.
    """
    self.augmenter = DataAugmenter()
    self.full_raw_dataset = self._load_the_dataset(path)
    self.positive_samples = None
    # Filled in later (by ``augment_dataset`` and by the negative sampler
    # respectively); initialised here so reading them early yields None
    # instead of AttributeError.
    self.augmented_df = None
    self.negative_samples = None
18
+
19
def _load_the_dataset(self, path: str = None) -> pd.DataFrame:
    """Load the DOI-pair table from CSV.

    Args:
        path: CSV file to read. When omitted (or empty), the bundled
            ``data/crossref-preprint-article-relationships-Aug-2023.csv``
            under ``PROJECT_ROOT`` is used.

    Returns:
        pd.DataFrame: The loaded table.

    Raises:
        AssertionError: If ``PROJECT_ROOT`` is not the ``MatchingPubs`` folder.
    """
    from pathlib import PurePath

    # ``.name`` copes with trailing separators and platform-specific path
    # syntax, unlike splitting on "/". Raised explicitly so the check is not
    # stripped by ``python -O`` while keeping the AssertionError type.
    if PurePath(str(PROJECT_ROOT)).name != "MatchingPubs":
        raise AssertionError("Please run this script in the project repository folder.")

    if not path:
        return pd.read_csv(f"{PROJECT_ROOT}/data/crossref-preprint-article-relationships-Aug-2023.csv")

    return pd.read_csv(path)
36
+
37
def sample_dois_pairs(
    self,
    num_samples: int = 1,
    random: bool = False,
    seed: int = None,
    full: bool = False
) -> np.ndarray:
    """Select (preprint DOI, article DOI) pairs from the raw table.

    Args:
        num_samples: Number of pairs wanted; clamped to ``[0, len(table)]``.
        random: If True, sample rows randomly; otherwise take the first rows.
        seed: Random state for ``random=True``. ``None`` or a negative value
            means "unseeded". (The default ``None`` used to crash on
            ``None >= 0``.)
        full: If True, return every pair and ignore the other arguments.

    Returns:
        np.ndarray: Array of shape ``(n, 2)`` with the ``preprint_doi`` and
        ``article_doi`` columns.
    """
    doi_columns = ["preprint_doi", "article_doi"]
    random_state = seed if seed is not None and seed >= 0 else None
    # A negative count would silently slice from the end (iloc) or raise
    # (sample); treat it as zero.
    num_samples = max(0, min(num_samples, len(self.full_raw_dataset)))

    if full:
        sampled_data = self.full_raw_dataset[doi_columns]
    elif random:
        sampled_data = self.full_raw_dataset.sample(n=num_samples, random_state=random_state)[doi_columns]
    else:
        sampled_data = self.full_raw_dataset.iloc[:num_samples][doi_columns]

    return sampled_data.to_numpy()
67
+
68
+
69
def _augmented_data_to_row(self, filtered_data: Dict[str, Any], preprint: bool = True) -> pd.DataFrame:
    """Flatten one filtered record into a single-row DataFrame.

    Args:
        filtered_data: Sectioned record from ``filter_augmented_data``. It is
            no longer modified: the ``authors`` entry used to be popped from
            the caller's dict.
        preprint: If True column names get the ``prpnt_`` prefix, otherwise
            ``article_``.

    Returns:
        pd.DataFrame: One row. List values are flattened and joined with
        ``"$@$"``; every other value is stored as-is.
    """
    # Shallow copy: only top-level keys are added/removed below.
    record = dict(filtered_data)

    authors_info = record.pop("authors", None)
    if authors_info:
        # NOTE(review): only the first author's fields are kept here, although
        # the "$@$" joining below suggests multi-author columns were intended
        # - confirm before relying on author-overlap sampling.
        record.update({f"authors_{k}": v for k, v in authors_info[0].items()})

    prefix = "prpnt_" if preprint else "article_"

    row = {}
    for key, value in flatten_dict(record).items():
        if isinstance(value, list):
            value = "$@$".join(map(str, flatten_list(value)))
        # Every cell is wrapped in a one-element list so pandas always builds
        # exactly one row; bare scalars alone raise "you must pass an index".
        row[f"{prefix}{key}"] = [value]

    return pd.DataFrame(row)
92
+
93
def process_pairs(self, dois: np.ndarray) -> List[List[pd.DataFrame]]:
    """Sequentially turn DOI pairs into ``[preprint_row, article_row]`` pairs.

    Args:
        dois (np.ndarray): Non-empty array of ``(preprint_doi, article_doi)``.

    Returns:
        List[List[pd.DataFrame]]: One two-element list per input pair, in
        input order. Any lookup error propagates and aborts the run.
    """
    assert len(dois) > 0, "DOI pairs cannot be empty."

    augmenter = self.augmenter
    collected = []
    for doi_pair in tqdm(dois, desc="Processing DOI pairs"):
        preprint_doi, article_doi = doi_pair
        # Both lookups happen before any filtering, as before.
        raw_preprint, raw_article = (
            augmenter.get_alex_features(preprint_doi),
            augmenter.get_alex_features(article_doi),
        )
        kept_preprint, kept_article = (
            augmenter.filter_augmented_data(raw_preprint),
            augmenter.filter_augmented_data(raw_article),
        )
        collected.append([
            self._augmented_data_to_row(kept_preprint, True),
            self._augmented_data_to_row(kept_article, False),
        ])

    return collected
119
+
120
def fetch_positive_samples(
    self,
    num_samples: int = 1,
    random: bool = True,
    seed: int = 42,
    full: bool = True,
):
    """Select DOI pairs, process them sequentially and cache the result.

    Args:
        num_samples (int): Number of DOI pairs to process.
        random (bool): Whether to sample DOI pairs randomly.
        seed (int): Seed for reproducibility.
        full (bool): If True, process the entire dataset.

    Returns:
        The value returned by ``self.process_pairs``; the same object is
        stored in ``self.positive_samples``.
    """
    selected_pairs = self.sample_dois_pairs(num_samples, random, seed, full)
    processed = self.process_pairs(selected_pairs)
    self.positive_samples = processed
    return self.positive_samples
142
+
143
+
144
def process_pairs_parallel(self, dois: np.ndarray, max_workers: int = 4) -> List[List[pd.DataFrame]]:
    """Turn DOI pairs into row pairs using a thread pool.

    Pairs that fail are reported and skipped, so the result can be shorter
    than the input. Successful pairs are returned in *input* order; earlier
    they came back in thread-completion order, which made later seeded
    sampling irreproducible.

    Args:
        dois (np.ndarray): Non-empty array of ``(preprint_doi, article_doi)``.
        max_workers (int): Number of worker threads.

    Returns:
        List[List[pd.DataFrame]]: ``[preprint_row, article_row]`` for every
        pair that could be processed.

    Raises:
        AssertionError: If ``dois`` is empty.
    """
    # Explicit raise: survives ``python -O`` and keeps the exception type.
    if not len(dois) > 0:
        raise AssertionError("DOI pairs cannot be empty.")

    def process_single_pair(preprint_doi: str, article_doi: str) -> List[pd.DataFrame]:
        """Fetch, filter and flatten the two works of one pair."""
        # Preprint features
        preprint_features = self.augmenter.get_alex_features(preprint_doi)
        preprint_filtered = self.augmenter.filter_augmented_data(preprint_features)
        preprint_row = self._augmented_data_to_row(preprint_filtered, True)

        # Article features
        article_features = self.augmenter.get_alex_features(article_doi)
        article_filtered = self.augmenter.filter_augmented_data(article_features)
        article_row = self._augmented_data_to_row(article_filtered, False)

        return [preprint_row, article_row]

    # One slot per input pair; failed pairs keep their None placeholder.
    results = [None] * len(dois)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(process_single_pair, preprint_doi, article_doi): (position, preprint_doi, article_doi)
            for position, (preprint_doi, article_doi) in enumerate(dois)
        }

        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing DOI pairs in parallel"):
            position, preprint_doi, article_doi = futures[future]
            try:
                results[position] = future.result()
            except Exception as exc:
                # Best-effort by design: report the cause and move on.
                print(f"Error processing pair ({preprint_doi}, {article_doi}): {exc}")

    return [pair for pair in results if pair]
205
+
206
def fetch_positive_samples_parallel(
    self,
    num_samples: int = 1,
    random: bool = True,
    seed: int = 42,
    full: bool = True,
):
    """Select DOI pairs, process them with the thread pool and cache the result.

    Args:
        num_samples (int): Number of DOI pairs to process.
        random (bool): Whether to sample DOI pairs randomly.
        seed (int): Seed for reproducibility.
        full (bool): If True, process the entire dataset.

    Returns:
        The value returned by ``self.process_pairs_parallel``; the same object
        is stored in ``self.positive_samples``.
    """
    selected_pairs = self.sample_dois_pairs(num_samples, random, seed, full)
    processed = self.process_pairs_parallel(selected_pairs)
    self.positive_samples = processed
    return self.positive_samples
228
+
229
def augment_dataset(
    self,
    augmentation_factor: int = 4,
    # possible augmentation parameters
):
    """Build the labelled table from ``self.positive_samples``.

    The result is stored in ``self.augmented_df``: one row labelled ``1`` per
    positive pair plus up to ``augmentation_factor`` rows labelled ``0`` that
    combine the same preprint with articles of other pairs (drawn without
    replacement through the global ``np.random`` state).

    The previous body read ``self.positive_pairs`` and called
    ``self.transform_array``, neither of which this class defines, so it
    always raised ``AttributeError``. A ``transform_array`` supplied by a
    subclass or mixin is still honoured when present.

    Raises:
        ValueError: If no positive samples have been fetched or loaded.
    """
    pairs = getattr(self, "positive_samples", None)
    if not pairs:
        raise ValueError("No positive samples available; fetch or load them before augmenting.")

    transform = getattr(self, "transform_array", None)
    if transform is not None:
        self.augmented_df = transform(pairs, factor=augmentation_factor)
        return

    def labelled(preprint_df, article_df, label):
        # Side-by-side concat of the two single-row frames and the label.
        return pd.concat(
            [preprint_df, article_df, pd.DataFrame(data=[label], columns=['label'])],
            axis=1,
        )

    total = len(pairs)
    rows = []
    for i, (preprint_df, article_df) in enumerate(pairs):
        rows.append(labelled(preprint_df, article_df, 1))
        other_indices = np.delete(np.arange(total), i)
        # Never ask for more negatives than there are other pairs.
        n_negatives = min(max(int(augmentation_factor), 0), len(other_indices))
        if n_negatives == 0:
            continue
        for j in np.random.choice(other_indices, size=n_negatives, replace=False):
            rows.append(labelled(preprint_df, pairs[j][1], 0))

    self.augmented_df = pd.concat(rows).reset_index(drop=True)
235
+
236
def save(self, path: str):
    """Pickle ``self.positive_samples`` into the file at ``path`` (overwriting it)."""
    with open(path, 'wb') as sink:
        pkl.dump(self.positive_samples, sink)
239
+
240
def load(self, path: str):
    """Replace ``self.positive_samples`` with the object pickled at ``path``.

    NOTE(security): unpickling can execute arbitrary code; only load files
    that were written by ``save`` from a trusted source.
    """
    with open(path, 'rb') as source:
        self.positive_samples = pkl.load(source)
243
+
244
def save_csv(self, path: str):
    """Convert ``self.positive_samples`` with ``custom_struct_to_df`` and write it as CSV.

    NOTE(review): ``to_csv`` is called with its defaults, so the frame index
    is written as the first column - confirm ``load_csv`` expects that.
    """
    table = custom_struct_to_df(self.positive_samples)
    table.to_csv(path)
246
+
247
def load_csv(self, path: str):
    """Read the CSV at ``path`` and rebuild ``self.positive_samples`` via ``df_to_custom_struct``."""
    table = pd.read_csv(path)
    self.positive_samples = df_to_custom_struct(table)
src/dataset/NegativeSampler.py ADDED
@@ -0,0 +1,325 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ from typing import List, Dict, Any, Union, Set, Callable
3
+ import copy
4
+
5
+ import pandas as pd
6
+ import numpy as np
7
+
8
+ import nltk
9
+ from nltk.corpus import words
10
+ nltk.download("words")
11
+
12
+ from src.dataset.GoodDataset import *
13
+
14
+
15
def copy_column_value(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    source_col: str,
    target_col: str,
    source_transform: Callable[[Any], Any] = lambda x: x
) -> List[pd.DataFrame]:
    """Copy one (optionally transformed) cell from ``df1`` into a copy of ``df2``.

    Args:
        df1 (pd.DataFrame): Source frame; the value is read from its first row.
        df2 (pd.DataFrame): Target frame; it is deep-copied, never modified.
        source_col (str): Column of ``df1`` to read.
        target_col (str): Column of the copy to overwrite (first row).
        source_transform: Applied to the value before it is written.
            Defaults to the identity.

    Returns:
        List[pd.DataFrame]: ``[df1, modified_copy_of_df2]``.
    """
    target = df2.copy(deep=True)

    # Select the column first: going through ``iloc[0]`` builds a row Series
    # whose common dtype can silently upcast the value (e.g. int -> float).
    value_to_copy = df1[source_col].iloc[0]

    # Address the first row by its actual label. A hard-coded label 0 would
    # append a new row whenever the frame is not indexed from 0.
    first_label = target.index[0]
    target.at[first_label, target_col] = source_transform(value_to_copy)

    return [df1, target]
46
+
47
def keep_on_condition(
    dataset: List[List[pd.DataFrame]],
    column_to_check: str,
    indices_to_ignore: Union[List[int], Set[int], int],
    function_to_compare: Callable[[Any], bool]
) -> List[List[pd.DataFrame]]:
    """Keep the pairs whose article passes a predicate, skipping given positions.

    Args:
        dataset (List[List[pd.DataFrame]]): ``[preprint, article]`` pairs.
        column_to_check (str): Column suffix; the value tested is the first
            row of ``article["article_<column_to_check>"]``. Pairs whose
            article lacks that column are dropped.
        indices_to_ignore (Union[List[int], Set[int], int]): Position(s) in
            ``dataset`` that must never be returned.
        function_to_compare (Callable[[Any], bool]): Predicate on the cell
            value; truthy means "keep".

    Returns:
        List[List[pd.DataFrame]]: The surviving pairs, in input order.
    """
    # Normalise to a set for O(1) membership tests.
    if isinstance(indices_to_ignore, int):
        ignored = {indices_to_ignore}
    else:
        ignored = set(indices_to_ignore)

    column = f"article_{column_to_check}"

    kept = []
    for position, (preprint, article) in enumerate(dataset):
        # Cheap checks first, so the predicate (which may be costly or have
        # side effects) is never called for an ignored position.
        if position in ignored or column not in article.columns:
            continue
        if function_to_compare(article[column].iloc[0]):
            kept.append([preprint, article])
    return kept
83
+
84
+ class NegativeSampler:
85
+ # def __init__(self, positive_samples: List[List[pd.DataFrame]]):
86
def __init__(self, dataset: "AugmentedDataset"):
    """Bind the sampler to a dataset.

    :param dataset: Object exposing ``positive_samples``. The dataset itself
        is kept on ``self.dataset`` and a reference to its positive samples
        on ``self.positive_samples``.
    """
    self.dataset = dataset
    self.positive_samples = self.dataset.positive_samples
93
+
94
+ ### ARGUMENTS for negative sampling here?
95
+
96
def sample_random(
    self,
    preprint_index: int,
    factor_max: int = 4,
    random_state: int = -1,
    custom_samples: List[List[pd.DataFrame]] = None
) -> List[List[pd.DataFrame]]:
    """Draw up to ``factor_max`` random non-matching pairs for one preprint.

    :param preprint_index: Position of the preprint in the sample list. A
        negative value excludes nothing and returns the sampled pairs as-is.
    :param factor_max: Upper bound on the number of samples drawn.
    :param random_state: When >= 0, re-seeds the global ``np.random`` state
        before drawing; negative (or None) leaves the state untouched.
    :param custom_samples: Pairs to draw from instead of
        ``self.positive_samples``. An empty list is honoured.
    :return: ``[preprint, other_article]`` pairs for a valid index, or the
        sampled pairs themselves for a negative one. Empty when there is
        nothing to draw from (this used to abort with an AssertionError).
    """
    if random_state is not None and random_state >= 0:
        np.random.seed(random_state)

    positive_samples = custom_samples if custom_samples is not None else self.positive_samples

    other_indices = np.array(
        [j for j in range(len(positive_samples)) if j != preprint_index], dtype=int
    )
    # Cap by the number of *other* pairs. Capping by the full length asked
    # for one sample too many on small datasets and made choice() raise.
    factor = min(len(other_indices), factor_max)
    if factor < 1:
        return []

    sampled_indices = np.random.choice(other_indices, size=factor, replace=False)
    sampled_rows = [positive_samples[j] for j in sampled_indices]

    if preprint_index < 0:
        return sampled_rows

    # Pair the original preprint with each sampled non-matching article.
    preprint, _ = positive_samples[preprint_index]
    return [
        [preprint, non_matching_article]
        for _, non_matching_article in sampled_rows
    ]
130
+
131
def fuzz_title(
    self,
    fuzz_count: int = -1,
    custom_samples: List[List[pd.DataFrame]] = None
) -> List[List[pd.DataFrame]]:
    """Overwrite each article title with a randomly corrupted preprint title.

    :param fuzz_count: Number of word replacements per title; ``-1`` means
        half the number of spaces in the title.
    :param custom_samples: Pairs to process instead of
        ``self.positive_samples``. An empty list yields an empty result;
        it used to fall back to *all* positive samples because of ``or``.
    :return: ``[preprint, modified_article_copy]`` pairs.
    """
    samples = custom_samples if custom_samples is not None else self.positive_samples
    if not samples:
        return []

    # Load the NLTK vocabulary once per call rather than once per title.
    word_list = words.words()

    def replace_with_random_words(text: str, fuzz_count: int = fuzz_count) -> str:
        """Replace ``fuzz_count`` randomly chosen words of ``text`` with corpus words."""
        # Missing titles (None/NaN) cannot be fuzzed; pass them through.
        if not isinstance(text, str):
            return text

        text_words = text.split()
        # Nothing to replace in an empty or whitespace-only title.
        if not text_words:
            return " ".join(text_words)

        if fuzz_count == -1:
            fuzz_count = text.count(" ") // 2

        for _ in range(fuzz_count):
            # The same position may be picked more than once.
            index_to_replace = random.randint(0, len(text_words) - 1)
            text_words[index_to_replace] = random.choice(word_list)

        return " ".join(text_words)

    return [
        copy_column_value(preprint, article, "prpnt_basic_title", "article_basic_title", replace_with_random_words)
        for preprint, article in samples
    ]
178
+
179
def sample_authors_overlap_random(
    self,
    custom_samples: List[List[pd.DataFrame]] = None
) -> List[List[pd.DataFrame]]:
    """Give each article the author id of its paired preprint.

    :param custom_samples: Pairs to process instead of
        ``self.positive_samples``. An empty list yields an empty result;
        it used to fall back to *all* positive samples because of ``or``.
    :return: ``[preprint, article_copy]`` pairs where ``article_authors_id``
        has been overwritten with the preprint's ``prpnt_authors_id``.
    """
    samples = custom_samples if custom_samples is not None else self.positive_samples
    return [
        copy_column_value(preprint, article, "prpnt_authors_id", "article_authors_id")
        for preprint, article in samples
    ]
192
+
193
def sample_authors_overlap(
    self,
    preprint_index: int,
    factor_max: int = 4,
    random_state: int = -1,
    authors_to_consider: int = 1,
    overlapping_authors: int = 1
) -> List[List[pd.DataFrame]]:
    """Pair a preprint with other articles that share authors with it.

    :param preprint_index: Position of the preprint in ``self.positive_samples``.
    :param factor_max: Upper bound on the number of negatives returned.
    :param random_state: Forwarded to ``sample_random``.
    :param authors_to_consider: How many leading authors of each candidate
        article are compared (``-1`` for all).
    :param overlapping_authors: Minimum number of shared author ids.
    :return: ``[preprint, overlapping_article]`` pairs; empty when the
        preprint has no author column or no other article qualifies.
    """
    def extract_authors(authors_str: str, authors_to_keep: int = -1) -> list:
        """Split a ``$@$``-separated author string, optionally keeping a prefix."""
        # Missing cells (None/NaN) carry no authors.
        if not isinstance(authors_str, str):
            return []
        # Drop empty fragments so two empty cells never count as an overlap.
        authors_list = [author for author in authors_str.split("$@$") if author]
        if authors_to_keep == -1:
            return authors_list
        return authors_list[:authors_to_keep]

    suffix = "authors_id"
    positive_preprint, _ = self.positive_samples[preprint_index]
    preprint_column = f"prpnt_{suffix}"
    if preprint_column not in positive_preprint.columns:
        return []
    preprint_authors = set(extract_authors(positive_preprint[preprint_column].iloc[0]))

    def confirm_overlap(article_authors):
        shared = preprint_authors & set(extract_authors(article_authors, authors_to_consider))
        return len(shared) >= overlapping_authors

    # Candidate pairs: every other pair whose article shares enough authors.
    # The preprint's own pair is excluded here, so nothing needs excluding
    # again during the random draw below (hence preprint_index=-1).
    custom_samples = keep_on_condition(
        self.positive_samples,
        suffix,
        preprint_index,
        confirm_overlap
    )
    if not custom_samples:
        return []

    return [
        [positive_preprint, article]
        for _, article in self.sample_random(-1, factor_max, random_state, custom_samples)
    ]
262
+
263
def sample_similar_topic(
    self,
    preprint_index: int,
    factor_max: int = 4,
    random_state: int = -1
) -> List[List[pd.DataFrame]]:
    """Pair a preprint with other articles from the same primary-topic field.

    :param preprint_index: Position of the preprint in ``self.positive_samples``.
    :param factor_max: Upper bound on the number of negatives returned.
    :param random_state: Forwarded to ``sample_random``.
    :return: ``[preprint, same_topic_article]`` pairs; empty when the matching
        article has no usable topic or no other article shares it.
    """
    suffix = "classification_primary_topic_field"
    positive_preprint, positive_article = self.positive_samples[preprint_index]

    topic_column = f"article_{suffix}"
    if topic_column not in positive_article.columns:
        return []
    # Read the reference topic once instead of on every comparison.
    target_topic = positive_article[topic_column].iloc[0]
    # None/NaN never equals anything, so there can be no candidates.
    if target_topic is None or (isinstance(target_topic, float) and target_topic != target_topic):
        return []

    # Candidate pairs: every other pair whose article has the same topic.
    # The preprint's own pair is excluded here, so nothing needs excluding
    # again during the random draw below (hence preprint_index=-1).
    custom_samples = keep_on_condition(
        self.positive_samples,
        suffix,
        preprint_index,
        lambda x: x == target_topic
    )
    if not custom_samples:
        return []

    return [
        [positive_preprint, article]
        for _, article in self.sample_random(-1, factor_max, random_state, custom_samples)
    ]
295
+
296
def create_negative_samples(self, config):
    """Generate negatives for every positive pair and store them on the dataset.

    ``config`` is read for: ``overlap_auth``, ``overlap_topic``, ``random``
    (strategy selection; author overlap and topic overlap are mutually
    exclusive, random is the fallback), ``factor_max``, ``seed``,
    ``authors_to_consider``, ``overlapping_authors``, ``fuzz_title`` and
    ``replace_auth`` (optional post-processing).

    The result is written to ``self.dataset.negative_samples``.
    """
    # Seed once per run. Passing the same seed into every sampling call
    # re-seeded the generator each time, so every preprint received the same
    # draw. The stdlib generator is seeded too because title fuzzing uses it.
    seed = getattr(config, "seed", None)
    if seed is not None and seed >= 0:
        np.random.seed(seed)
        random.seed(seed)

    negative_samples = []
    for preprint_index in tqdm(range(len(self.positive_samples)), desc="Negative Sampling"):
        if config.overlap_auth and not config.overlap_topic:
            negatives = self.sample_authors_overlap(
                preprint_index, factor_max=config.factor_max,
                random_state=-1,
                authors_to_consider=config.authors_to_consider,
                overlapping_authors=config.overlapping_authors
            )
        elif config.overlap_topic and not config.overlap_auth:
            negatives = self.sample_similar_topic(preprint_index, factor_max=config.factor_max, random_state=-1)
        elif config.random:
            negatives = self.sample_random(preprint_index, factor_max=config.factor_max, random_state=-1)
        else:
            continue

        # Nothing found for this preprint: skip post-processing, which must
        # never be handed an empty list as if no list had been given.
        if not negatives:
            continue

        if config.fuzz_title:
            negatives = self.fuzz_title(custom_samples=negatives)

        if config.replace_auth:
            negatives = self.sample_authors_overlap_random(negatives)

        negative_samples.extend(negatives)

    self.dataset.negative_samples = negative_samples
src/dataset/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .DataAugmenter import *
2
+ from .Dataset import *
src/dataset/__pycache__/DataAugmenter.cpython-311.pyc ADDED
Binary file (17.7 kB). View file
 
src/dataset/__pycache__/DataAugmenter.cpython-312.pyc ADDED
Binary file (14.2 kB). View file
 
src/dataset/__pycache__/DataAugmenter.cpython-313.pyc ADDED
Binary file (14.5 kB). View file
 
src/dataset/__pycache__/Dataset.cpython-312.pyc ADDED
Binary file (9.72 kB). View file
 
src/dataset/__pycache__/Dataset.cpython-313.pyc ADDED
Binary file (9.71 kB). View file
 
src/dataset/__pycache__/GoodDataAugmenter.cpython-313.pyc ADDED
Binary file (14.9 kB). View file
 
src/dataset/__pycache__/GoodDataset.cpython-313.pyc ADDED
Binary file (11.9 kB). View file
 
src/dataset/__pycache__/NegativeSampler.cpython-313.pyc ADDED
Binary file (14.3 kB). View file
 
src/dataset/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (252 Bytes). View file
 
src/dataset/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (221 Bytes). View file
 
src/dataset/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (230 Bytes). View file
 
src/dataset/get_dataset.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.dataset.DataAugmenter import *
2
+ from src.dataset.Dataset import *
3
+
4
+ import argparse
5
+
6
def main(config):
    """
    Build the augmented dataset described by ``config`` and write it to CSV.

    Args:
        config: Namespace-like object providing ``size``, ``random``,
            ``seed``, ``full`` and ``output``. The first four are forwarded
            to ``FullAugmentedDataset.get_full_dataset``; ``output`` is the
            destination path of the CSV file.

    Returns:
        None. The result is written to ``config.output`` and a confirmation
        line is printed.
    """
    # NOTE(review): the result is presumably a pandas DataFrame (it exposes
    # ``to_csv``) -- confirm against FullAugmentedDataset.
    result = FullAugmentedDataset().get_full_dataset(
        len=config.size,
        random=config.random,
        seed=config.seed,
        full=config.full,
    )

    # Persist without the index column, then report where the file went.
    result.to_csv(config.output, index=False)
    print(f"Dataset successfully saved to {config.output}")
26
+
27
def parse_bool(value):
    """
    Convert a command-line token into a real boolean.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, so any
    non-empty value (including ``"False"`` and ``"0"``) becomes ``True``.
    This converter interprets the text instead.

    Args:
        value: A ``bool`` (returned unchanged) or a string such as
            ``"true"``, ``"False"``, ``"1"``, ``"no"`` (case-insensitive,
            surrounding whitespace ignored).

    Returns:
        The corresponding ``bool``.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in {"true", "t", "yes", "y", "1", "on"}:
        return True
    if token in {"false", "f", "no", "n", "0", "off"}:
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def build_parser(default_output):
    """
    Create the command-line parser for the dataset generation script.

    Args:
        default_output: Path used for ``-o/--output`` when none is given.

    Returns:
        A configured ``argparse.ArgumentParser`` exposing ``size``,
        ``random``, ``seed``, ``full`` and ``output``.
    """
    parser = argparse.ArgumentParser(description="Generate and save a dataset based on the given configuration.")

    parser.add_argument("-s", "--size", type=int, default=10, help="Number of samples to generate.")
    # parse_bool (not bool) so that "--random False" really disables random sampling.
    parser.add_argument("-r", "--random", type=parse_bool, default=True, help="Whether to sample randomly.")
    parser.add_argument("--seed", type=int, default=42, help="Random seed for reproducibility.")
    parser.add_argument("--full", action="store_true", help="Boolean flag to indicate full dataset mode.")
    parser.add_argument("-o", "--output", type=str, default=default_output, help="Output file path to save the dataset as a CSV.")
    return parser


if __name__ == "__main__":
    # Imported explicitly: previously ``os`` was only reachable through the
    # wildcard imports at the top of the file.
    import os

    from src.utils.io_utils import PROJECT_ROOT

    # Parse the arguments and pass them to the main function.
    config = build_parser(os.path.join(PROJECT_ROOT, "data/out.csv")).parse_args()
    # NOTE(review): looks like leftover debug output; kept to preserve stdout.
    print(config.full)
    main(config)
src/utils/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .io_utils import *
src/utils/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (216 Bytes). View file
 
src/utils/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (192 Bytes). View file
 
src/utils/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (201 Bytes). View file
 
src/utils/__pycache__/io_utils.cpython-311.pyc ADDED
Binary file (2.55 kB). View file
 
src/utils/__pycache__/io_utils.cpython-312.pyc ADDED
Binary file (2.16 kB). View file
 
src/utils/__pycache__/io_utils.cpython-313.pyc ADDED
Binary file (2.17 kB). View file
 
src/utils/__pycache__/struct_utils.cpython-313.pyc ADDED
Binary file (5.27 kB). View file