diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..461219f1ffbd91aaf2e9fb307532631521bd710d Binary files /dev/null and b/.DS_Store differ diff --git a/README.md b/README.md index 87198b415806a683909bd94878c339cf518c2569..cbab0bd8fe885762ef779c4153a8720d3cd0dccd 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,16 @@ ---- -title: MatchPrePrintArticles -emoji: ๐ŸŒ– -colorFrom: green -colorTo: pink -sdk: gradio -sdk_version: 5.8.0 -app_file: app.py -pinned: false -license: mit -short_description: Dataset Creator for Matching PrePrint and Articles ---- - -Check out the configuration reference at https://huggingface.co./docs/hub/spaces-config-reference +# MatchingPubs + +## Dataset + +The `dataset/` directory contains the following main classes: + +- `DatasetLoader`: Responsible for loading the dataset from various sources. +- `DatasetProcessor`: Handles preprocessing and cleaning of the dataset. +- `DatasetAnalyzer`: Provides methods for analyzing and summarizing the dataset. 
+ +## Getting the Dataset + +To get the dataset, run the following command: + +```bash +PYTHONPATH=$(pwd) python src/dataset/get_dataset.py \ No newline at end of file diff --git a/__pycache__/run_augmenter.cpython-313.pyc b/__pycache__/run_augmenter.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a36f28f04872c18b7f4a905b059446b097dcae21 Binary files /dev/null and b/__pycache__/run_augmenter.cpython-313.pyc differ diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..7642dd5f336762866380029d295890baec28bb2b --- /dev/null +++ b/app.py @@ -0,0 +1,245 @@ +import gradio as gr +import pandas as pd +import pandas as pd +from src.utils.io_utils import PROJECT_ROOT +from run_augmenter import negative_sampler , positive_sampler +from pathlib import Path + +def augment_interface(factor, type_or_difficulty, use_default, csv_file=None): + """Negative Tool Sampler: Wrapper to handle negative dataset augmentation.""" + try: + if use_default: + input_csv_path = f"{PROJECT_ROOT}/data/crossref-preprint-article-relationships-Aug-2023.csv" + if not Path(input_csv_path).exists(): + return "Error: Default CSV file not found!", None, gr.update(visible=False) + elif csv_file is not None: + input_csv_path = csv_file.name + else: + return "Error: Please select default or upload a CSV file.", None, gr.update(visible=False) + + augmented_df = negative_sampler(input_csv_path, factor, type_or_difficulty) + output_csv_path = "augmented_dataset.csv" + augmented_df.to_csv(output_csv_path, index=False) + + return output_csv_path, augmented_df.head(), gr.update(visible=True) + + except Exception as e: + return f"Error during processing: {str(e)}", None, gr.update(visible=False) + + +def positive_sampler_interface(use_default, csv_file=None, size=10, random=True, seed=42, full=False): + """Positive Tool Sampler: Wrapper to handle positive dataset augmentation with additional arguments.""" + try: + if use_default: + 
input_csv_path = f"{PROJECT_ROOT}/data/crossref-preprint-article-relationships-Aug-2023.csv" + if not Path(input_csv_path).exists(): + return "Error: Default CSV file not found!", None, gr.update(visible=False) + elif csv_file is not None: + input_csv_path = csv_file.name + else: + return "Error: Please select default or upload a CSV file.", None, gr.update(visible=False) + + # Call the positive sampler function with additional arguments + augmented_df = positive_sampler( + optional_path=input_csv_path, + size=size, + random=random, + seed=seed, + full=full + ) + output_csv_path = "positive_augmented_dataset.csv" + augmented_df.to_csv(output_csv_path, index=False) + + return output_csv_path, augmented_df.head(), gr.update(visible=True) + + except Exception as e: + return f"Error during processing: {str(e)}", None, gr.update(visible=False) + + +def reset_output(): + """Resets the output fields by returning None and hiding the DataFrame.""" + return None, None, gr.update(visible=False) + +with gr.Blocks(css=f""" + .gradio-container {{ + font-family: Arial, sans-serif; + max-width: 900px; + margin: auto; + }} + h1 {{ + text-align: center; + color: white; + font-size: 60px; + margin-bottom: 0px; + }} + h2 {{ + text-align: center; + color: #ff0000; + font-size: 16px; + font-weight: normal; + margin-top: 0px; + }} + .title {{ + text-align: center; + font-size: 40px; + margin-top: 30px; + margin-bottom: 20px; + }} + .title .positive {{ + color: #ff0000; + }} + .title .negative {{ + color: #ff0000; + }} + .title .tool {{ + color: white; + }} + .title .sampler {{ + color: #ff0000; + }} + .description {{ + text-align: center; + margin-bottom: 20px; + }} + #submit-button {{ + background-color: #ff0000; + color: white; + font-size: 16px; + border: none; + border-radius: 5px; + padding: 10px 20px; + }} + #reset-button {{ + background-color: #d3d3d3; + color: black; + font-size: 16px; + border: none; + border-radius: 5px; + padding: 10px 20px; + }} +""") as app: + # Main Title 
Section + gr.Markdown(""" +

ENTC

+

Entrepreneurship and Technology Commercialization ยท EPFL

+ """) + + # Positive Tool Sampler Section + gr.Markdown(""" +
+ Positive + Tool + Sampler +
+ """) + + gr.Markdown(""" +

+ This tool takes a list of DOIs and augments them using the OpenAlex API. + It is designed to complement the Negative Tool Sampler, enabling the creation of complete datasets. +

+ """) + + with gr.Group(): + with gr.Row(): + pos_use_default_checkbox = gr.Checkbox(label="Use Default Dataset", value=True) + pos_csv_file_input = gr.File(label="Upload CSV (optional)", file_types=[".csv"], visible=False) + + with gr.Row(): + size_input = gr.Number(label="Number of Samples", value=10, info="Specify the number of samples to generate.") + random_input = gr.Checkbox(label="Sample Randomly", value=True, info="Whether to sample randomly.") + seed_input = gr.Number(label="Random Seed", value=42, info="Random seed for reproducibility.") + full_input = gr.Checkbox(label="Full Dataset Mode", value=False, info="Indicate whether to use the full dataset.") + + with gr.Group(): + pos_output_file = gr.File(label="Download Augmented Dataset") + pos_dataset_preview = gr.DataFrame(label="Dataset Preview", interactive=False, visible=False) + with gr.Row(): + pos_submit_button = gr.Button("Submit ๐Ÿš€", elem_id="submit-button") + pos_reset_button = gr.Button("Reset ๐Ÿ”„", elem_id="reset-button") + + # Button Actions + pos_submit_button.click( + positive_sampler_interface, + inputs=[pos_use_default_checkbox, pos_csv_file_input, size_input, random_input, seed_input, full_input], + outputs=[pos_output_file, pos_dataset_preview, pos_dataset_preview] + ) + + pos_reset_button.click( + reset_output, + inputs=[], + outputs=[pos_output_file, pos_dataset_preview, pos_dataset_preview] + ) + + # Toggle File Input + def toggle_pos_csv_input(use_default): + return gr.update(visible=not use_default) + + pos_use_default_checkbox.change( + toggle_pos_csv_input, + inputs=[pos_use_default_checkbox], + outputs=[pos_csv_file_input] + ) + + # Negative Tool Sampler Section + gr.Markdown(""" +
+ Negative + Tool + Sampler +
+ """) + + gr.Markdown(""" +

+ This tool generates datasets by creating negative samples from positive matches between preprints and articles. + Customize the difficulty and the augmentation factor to meet your needs. +

+ """) + + with gr.Group(): + with gr.Row(): + factor_input = gr.Number( + label="Factor (int)", value=1, info="Specify the number of negative samples per positive sample." + ) + type_dropdown = gr.Dropdown( + ["random", "similar topics", "overlapping authors", "random authors", "fuzzed title"], + label="Select Difficulty or Augmentation Type" + ) + with gr.Row(): + use_default_checkbox = gr.Checkbox(label="Use Default Dataset", value=True) + csv_file_input = gr.File(label="Upload CSV (optional)", file_types=[".csv"], visible=False) + + with gr.Group(): + output_file = gr.File(label="Download Augmented Dataset") + dataset_preview = gr.DataFrame(label="Dataset Preview", interactive=False, visible=False) + with gr.Row(): + submit_button = gr.Button("Submit ๐Ÿš€", elem_id="submit-button") + reset_button = gr.Button("Reset ๐Ÿ”„", elem_id="reset-button") + + # Button Actions + submit_button.click( + augment_interface, + inputs=[factor_input, type_dropdown, use_default_checkbox, csv_file_input], + outputs=[output_file, dataset_preview, dataset_preview] + ) + + reset_button.click( + reset_output, + inputs=[], + outputs=[output_file, dataset_preview, dataset_preview] + ) + + # Toggle File Input + def toggle_csv_input(use_default): + return gr.update(visible=not use_default) + + use_default_checkbox.change( + toggle_csv_input, + inputs=[use_default_checkbox], + outputs=[csv_file_input] + ) + +# Launch the app +if __name__ == "__main__": + app.launch(share=True) diff --git a/app/.DS_Store b/app/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..8a784558b10624a4f3f9bfc5695a8352a42200c4 Binary files /dev/null and b/app/.DS_Store differ diff --git a/app/.gradio/certificate.pem b/app/.gradio/certificate.pem new file mode 100644 index 0000000000000000000000000000000000000000..b85c8037f6b60976b2546fdbae88312c5246d9a3 --- /dev/null +++ b/app/.gradio/certificate.pem @@ -0,0 +1,31 @@ +-----BEGIN CERTIFICATE----- 
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw +TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh +cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4 +WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu +ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY +MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc +h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+ +0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U +A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW +T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH +B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC +B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv +KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn +OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn +jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw +qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI +rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV +HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq +hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL +ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ +3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK +NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5 +ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur +TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC +jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc +oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq +4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA +mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d +emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc= +-----END CERTIFICATE----- diff --git a/app/.gradio/flagged/dataset1.csv 
b/app/.gradio/flagged/dataset1.csv new file mode 100644 index 0000000000000000000000000000000000000000..5c5833b1a834add3c7b8ac35b5cdfab7d87bc5df --- /dev/null +++ b/app/.gradio/flagged/dataset1.csv @@ -0,0 +1,3 @@ +Factor (int),Select Augmentation Type or Difficulty,Use Default Dataset,Upload CSV (optional),Download Augmented Dataset,timestamp +,,false,,,2024-12-10 22:00:22.460971 +1,easy,true,,,2024-12-10 22:00:36.882145 diff --git a/app/__pycache__/run_augmenter.cpython-313.pyc b/app/__pycache__/run_augmenter.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9b99bbab4c4d936595f389c29efa526c0970d0f4 Binary files /dev/null and b/app/__pycache__/run_augmenter.cpython-313.pyc differ diff --git a/app/app.ipynb b/app/app.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..209f0c7a388eb8c6ed4a723d0327884a0b459e94 --- /dev/null +++ b/app/app.ipynb @@ -0,0 +1,335 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/giorgosnikolaou/Library/Python/3.9/lib/python/site-packages/urllib3/__init__.py:35: NotOpenSSLWarning: urllib3 v2 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'LibreSSL 2.8.3'. 
See: https://github.com/urllib3/urllib3/issues/3020\n", + " warnings.warn(\n", + "[nltk_data] Downloading package words to\n", + "[nltk_data] /Users/giorgosnikolaou/nltk_data...\n", + "[nltk_data] Package words is already up-to-date!\n" + ] + } + ], + "source": [ + "import gradio as gr\n", + "import pandas as pd\n", + "import pandas as pd\n", + "from src.utils.io_utils import PROJECT_ROOT\n", + "from run_augmenter import negative_sampler , positive_sampler\n", + "from pathlib import Path\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Running on local URL: http://127.0.0.1:7860\n", + "Running on public URL: https://85b886469a8c17104c.gradio.live\n", + "\n", + "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co./spaces)\n" + ] + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "random\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Negative Sampling: 100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 100/100 [00:11<00:00, 8.43it/s]\n" + ] + } + ], + "source": [ + "\n", + "def augment_interface(factor, type_or_difficulty, use_default, csv_file=None):\n", + " \"\"\"Negative Tool Sampler: Wrapper to handle negative dataset augmentation.\"\"\"\n", + " try:\n", + " if use_default:\n", + " input_csv_path = f\"{PROJECT_ROOT}/data/crossref-preprint-article-relationships-Aug-2023.csv\"\n", + " if not Path(input_csv_path).exists():\n", + " return \"Error: Default CSV file not found!\", None, gr.update(visible=False)\n", + " elif csv_file is not None:\n", + " input_csv_path = csv_file.name\n", + " else:\n", + " return \"Error: Please select default or upload a CSV file.\", None, gr.update(visible=False)\n", + "\n", + " augmented_df = negative_sampler(input_csv_path, factor, type_or_difficulty)\n", + " output_csv_path = \"augmented_dataset.csv\"\n", + " augmented_df.to_csv(output_csv_path, index=False)\n", + "\n", + " return output_csv_path, augmented_df.head(), gr.update(visible=True)\n", + "\n", + " except Exception as e:\n", + " return f\"Error during processing: {str(e)}\", None, gr.update(visible=False)\n", + "\n", + "\n", + "def positive_sampler_interface(use_default, csv_file=None, size=10, random=True, seed=42, full=False):\n", + " \"\"\"Positive Tool Sampler: Wrapper to handle positive dataset augmentation with additional arguments.\"\"\"\n", + " try:\n", + " if use_default:\n", + " input_csv_path = f\"{PROJECT_ROOT}/data/crossref-preprint-article-relationships-Aug-2023.csv\"\n", + " if not Path(input_csv_path).exists():\n", + " return \"Error: Default CSV file not found!\", None, gr.update(visible=False)\n", + " elif csv_file is not None:\n", + " input_csv_path = 
csv_file.name\n", + " else:\n", + " return \"Error: Please select default or upload a CSV file.\", None, gr.update(visible=False)\n", + "\n", + " # Call the positive sampler function with additional arguments\n", + " augmented_df = positive_sampler(\n", + " optional_path=input_csv_path, \n", + " size=size, \n", + " random=random, \n", + " seed=seed, \n", + " full=full\n", + " )\n", + " output_csv_path = \"positive_augmented_dataset.csv\"\n", + " augmented_df.to_csv(output_csv_path, index=False)\n", + "\n", + " return output_csv_path, augmented_df.head(), gr.update(visible=True)\n", + "\n", + " except Exception as e:\n", + " return f\"Error during processing: {str(e)}\", None, gr.update(visible=False)\n", + "\n", + "\n", + "def reset_output():\n", + " \"\"\"Resets the output fields by returning None and hiding the DataFrame.\"\"\"\n", + " return None, None, gr.update(visible=False)\n", + "\n", + "with gr.Blocks(css=f\"\"\"\n", + " .gradio-container {{\n", + " font-family: Arial, sans-serif;\n", + " max-width: 900px;\n", + " margin: auto;\n", + " }}\n", + " h1 {{\n", + " text-align: center;\n", + " color: white;\n", + " font-size: 60px;\n", + " margin-bottom: 0px;\n", + " }}\n", + " h2 {{\n", + " text-align: center;\n", + " color: #ff0000;\n", + " font-size: 16px;\n", + " font-weight: normal;\n", + " margin-top: 0px;\n", + " }}\n", + " .title {{\n", + " text-align: center;\n", + " font-size: 40px;\n", + " margin-top: 30px;\n", + " margin-bottom: 20px;\n", + " }}\n", + " .title .positive {{\n", + " color: #ff0000;\n", + " }}\n", + " .title .negative {{\n", + " color: #ff0000;\n", + " }}\n", + " .title .tool {{\n", + " color: white;\n", + " }}\n", + " .title .sampler {{\n", + " color: #ff0000;\n", + " }}\n", + " .description {{\n", + " text-align: center;\n", + " margin-bottom: 20px;\n", + " }}\n", + " #submit-button {{\n", + " background-color: #ff0000;\n", + " color: white;\n", + " font-size: 16px;\n", + " border: none;\n", + " border-radius: 5px;\n", + " padding: 
10px 20px;\n", + " }}\n", + " #reset-button {{\n", + " background-color: #d3d3d3;\n", + " color: black;\n", + " font-size: 16px;\n", + " border: none;\n", + " border-radius: 5px;\n", + " padding: 10px 20px;\n", + " }}\n", + "\"\"\") as app:\n", + " # Main Title Section\n", + " gr.Markdown(\"\"\"\n", + "

ENTC

\n", + "

Entrepreneurship and Technology Commercialization ยท EPFL

\n", + " \"\"\")\n", + "\n", + " # Positive Tool Sampler Section\n", + " gr.Markdown(\"\"\"\n", + "
\n", + " Positive\n", + " Tool\n", + " Sampler\n", + "
\n", + " \"\"\")\n", + "\n", + " gr.Markdown(\"\"\"\n", + "

\n", + " This tool takes a list of DOIs and augments them using the OpenAlex API.\n", + " It is designed to complement the Negative Tool Sampler, enabling the creation of complete datasets.\n", + "

\n", + " \"\"\")\n", + "\n", + " with gr.Group():\n", + " with gr.Row():\n", + " pos_use_default_checkbox = gr.Checkbox(label=\"Use Default Dataset\", value=True)\n", + " pos_csv_file_input = gr.File(label=\"Upload CSV (optional)\", file_types=[\".csv\"], visible=False)\n", + "\n", + " with gr.Row():\n", + " size_input = gr.Number(label=\"Number of Samples\", value=10, info=\"Specify the number of samples to generate.\")\n", + " random_input = gr.Checkbox(label=\"Sample Randomly\", value=True, info=\"Whether to sample randomly.\")\n", + " seed_input = gr.Number(label=\"Random Seed\", value=42, info=\"Random seed for reproducibility.\")\n", + " full_input = gr.Checkbox(label=\"Full Dataset Mode\", value=False, info=\"Indicate whether to use the full dataset.\")\n", + "\n", + " with gr.Group():\n", + " pos_output_file = gr.File(label=\"Download Augmented Dataset\")\n", + " pos_dataset_preview = gr.DataFrame(label=\"Dataset Preview\", interactive=False, visible=False)\n", + " with gr.Row():\n", + " pos_submit_button = gr.Button(\"Submit ๐Ÿš€\", elem_id=\"submit-button\")\n", + " pos_reset_button = gr.Button(\"Reset ๐Ÿ”„\", elem_id=\"reset-button\")\n", + "\n", + " # Button Actions\n", + " pos_submit_button.click(\n", + " positive_sampler_interface,\n", + " inputs=[pos_use_default_checkbox, pos_csv_file_input, size_input, random_input, seed_input, full_input],\n", + " outputs=[pos_output_file, pos_dataset_preview, pos_dataset_preview]\n", + " )\n", + "\n", + " pos_reset_button.click(\n", + " reset_output,\n", + " inputs=[],\n", + " outputs=[pos_output_file, pos_dataset_preview, pos_dataset_preview]\n", + " )\n", + "\n", + " # Toggle File Input\n", + " def toggle_pos_csv_input(use_default):\n", + " return gr.update(visible=not use_default)\n", + "\n", + " pos_use_default_checkbox.change(\n", + " toggle_pos_csv_input,\n", + " inputs=[pos_use_default_checkbox],\n", + " outputs=[pos_csv_file_input]\n", + " )\n", + "\n", + " # Negative Tool Sampler Section\n", + " 
gr.Markdown(\"\"\"\n", + "
\n", + " Negative\n", + " Tool\n", + " Sampler\n", + "
\n", + " \"\"\")\n", + "\n", + " gr.Markdown(\"\"\"\n", + "

\n", + " This tool generates datasets by creating negative samples from positive matches between preprints and articles.\n", + " Customize the difficulty and the augmentation factor to meet your needs.\n", + "

\n", + " \"\"\")\n", + "\n", + " with gr.Group():\n", + " with gr.Row():\n", + " factor_input = gr.Number(\n", + " label=\"Factor (int)\", value=1, info=\"Specify the number of negative samples per positive sample.\"\n", + " )\n", + " type_dropdown = gr.Dropdown(\n", + " [\"random\", \"similar topics\", \"overlapping authors\", \"random authors\", \"fuzzed title\"],\n", + " label=\"Select Difficulty or Augmentation Type\"\n", + " )\n", + " with gr.Row():\n", + " use_default_checkbox = gr.Checkbox(label=\"Use Default Dataset\", value=True)\n", + " csv_file_input = gr.File(label=\"Upload CSV (optional)\", file_types=[\".csv\"], visible=False)\n", + "\n", + " with gr.Group():\n", + " output_file = gr.File(label=\"Download Augmented Dataset\")\n", + " dataset_preview = gr.DataFrame(label=\"Dataset Preview\", interactive=False, visible=False)\n", + " with gr.Row():\n", + " submit_button = gr.Button(\"Submit ๐Ÿš€\", elem_id=\"submit-button\")\n", + " reset_button = gr.Button(\"Reset ๐Ÿ”„\", elem_id=\"reset-button\")\n", + "\n", + " # Button Actions\n", + " submit_button.click(\n", + " augment_interface,\n", + " inputs=[factor_input, type_dropdown, use_default_checkbox, csv_file_input],\n", + " outputs=[output_file, dataset_preview, dataset_preview]\n", + " )\n", + "\n", + " reset_button.click(\n", + " reset_output,\n", + " inputs=[],\n", + " outputs=[output_file, dataset_preview, dataset_preview]\n", + " )\n", + "\n", + " # Toggle File Input\n", + " def toggle_csv_input(use_default):\n", + " return gr.update(visible=not use_default)\n", + "\n", + " use_default_checkbox.change(\n", + " toggle_csv_input,\n", + " inputs=[use_default_checkbox],\n", + " outputs=[csv_file_input]\n", + " )\n", + "\n", + "# Launch the app\n", + "if __name__ == \"__main__\":\n", + " app.launch(share=True)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "marple", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + 
"version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/create_negative_samples.py b/create_negative_samples.py new file mode 100644 index 0000000000000000000000000000000000000000..67cc9e2cc95b7d5d1200ebc431b43c6e3975d13f --- /dev/null +++ b/create_negative_samples.py @@ -0,0 +1,54 @@ +from src.dataset.GoodDataset import * +from src.dataset.NegativeSampler import * +import argparse +import os + +def main(config): + """ + Main function to process the dataset and save it as a CSV file. + Args: + config: Namespace object containing the script arguments. + """ + dataset = AugmentedDataset() + dataset.load(config.input) + + sampler = NegativeSampler(dataset) + sampler.create_negative_samples(config) + + + print(custom_struct_to_df(dataset.negative_samples).head()) + custom_struct_to_df(dataset.positive_samples).to_csv('./data/pos.csv', index=False) + custom_struct_to_df(dataset.negative_samples).to_csv('./data/neg.csv', index=False) + print(len(dataset.positive_samples)) + print(len(dataset.negative_samples)) + + +if __name__ == "__main__": + # Parse command-line arguments + from src.utils.io_utils import PROJECT_ROOT + parser = argparse.ArgumentParser(description="Generate and save a dataset based on the given configuration.") + + + parser.add_argument("-i", "--input", type=str, default=os.path.join(PROJECT_ROOT, "data/positive_samples.pkl"), help="Input file path to load the positive samples.") + parser.add_argument("-o", "--output", type=str, default=os.path.join(PROJECT_ROOT, "data/negative_samples.pkl"), help="Output file path to save the negative samples.") + + parser.add_argument("-s", "--seed", type=int, default=42, help="Random seed for reproducibility.") + + parser.add_argument("-r", "--random", action='store_true', help="Utilization of `sample_random`") + 
parser.add_argument("-f", "--fuzz_title", action='store_true', help="Utilization of `fuzz_title`") + parser.add_argument("-ra", "--replace_auth", action='store_true', help="Utilization of `sample_authors_overlap_random`") + parser.add_argument("-oa", "--overlap_auth", action='store_true', help="Utilization of `sample_authors_overlap`") + parser.add_argument("-ot", "--overlap_topic", action='store_true', help="Utilization of `sample_similar_topic`") + + parser.add_argument("--factor_max", type=int, default=4, help="Maximum number of negative samples to generate per positive sample.") + parser.add_argument("--authors_to_consider", type=int, default=1, help="Number of authors to consider when overlapping authors.") + parser.add_argument("--overlapping_authors", type=int, default=1, help="Minimum number of overlapping authors required.") + parser.add_argument("--fuzz_count", type=int, default=-1, help="Number of words to replace when fuzzing titles.") + + # Parse the arguments and pass to the main function + config = parser.parse_args() + if config.overlap_auth and config.overlap_topic: + parser.error("Only one of --overlap_auth and --overlap_topic can be used.") + if not (config.overlap_auth or config.overlap_topic or config.random): + parser.error("At least one of --overlap_auth, --overlap_topic, or --random must be specified.") + main(config) diff --git a/data/crossref-preprint-article-relationships-Aug-2023.csv b/data/crossref-preprint-article-relationships-Aug-2023.csv new file mode 100644 index 0000000000000000000000000000000000000000..30a55f1c612cc313f0c232812490e9f148d9fd28 --- /dev/null +++ b/data/crossref-preprint-article-relationships-Aug-2023.csv @@ -0,0 +1,1001 @@ +preprint_doi,article_doi,deposited_by_article_publisher,deposited_by_preprint_publisher,matching_confidence_score +10.5194/wcd-2021-52,10.5194/wcd-2-1245-2021,True,True,0.9919484702093396 +10.5194/tc-2020-96,10.5194/tc-15-1277-2021,True,True,1.0 
+10.1101/001586,10.1016/j.bica.2014.02.003,False,True,1.0 +10.2196/preprints.23492,10.2196/23492,False,True,1.0 +10.5194/acpd-9-11659-2009,10.5194/acp-9-9349-2009,True,True,1.0 +10.31235/osf.io/nj43g,10.31014/aior.1991.04.01.262,False,True,0.9456161616161616 +10.2196/preprints.16461,10.2196/16461,False,True,1.0 +10.5194/acpd-10-523-2010,10.5194/acp-10-4699-2010,True,True,1.0 +10.20944/preprints201910.0177.v1,10.3390/ma12223739,False,True, +10.20944/preprints201903.0234.v1,10.35513/21658005.2019.1.5,False,True,1.0 +10.31234/osf.io/6ythf,10.1017/s0140525x19002206,False,True,1.0 +10.2196/preprints.11905,10.2196/11905,False,True,1.0 +10.1101/665364,10.1016/j.dyepig.2019.107863,False,True,0.936810016689432 +10.1101/2020.06.10.20067116,10.1098/rspa.2019.0790,False,True,1.0 +10.2196/preprints.19048,10.2196/19048,False,True,1.0 +10.2196/preprints.29042,10.2196/29042,False,True,1.0 +10.21203/rs.2.14007/v2,10.1186/s12885-019-6361-2,False,True,1.0 +10.21203/rs.2.14007/v3,10.1186/s12885-019-6361-2,False,True,1.0 +10.21203/rs.2.14007/v1,10.1186/s12885-019-6361-2,False,True, +10.2196/preprints.27257,10.2196/27257,False,True,1.0 +10.5194/hessd-7-7121-2010,10.5194/hess-15-197-2011,True,True,1.0 +10.20944/preprints201801.0038.v1,10.3390/e20030160,False,True,1.0 +10.20944/preprints201810.0314.v1,10.3390/v10110603,False,True,0.9962962962962963 +10.1101/2020.03.21.001347,10.21914/anziamj.v61i0.15040,False,True,0.9722222222222224 +10.1101/853283,10.1016/j.brainres.2019.146627,False,True,1.0 +10.2196/preprints.19021,10.2196/19021,False,True,1.0 +10.5194/gmd-2019-113,10.5194/gmd-13-4845-2020,True,True,1.0 +10.20944/preprints201711.0016.v1,10.3390/sym9120292,False,True,0.9916666666666668 +10.5194/osd-12-1567-2015,10.5194/os-12-39-2016,True,True,1.0 +10.20944/preprints201807.0566.v1,10.3390/sym10100451,False,True, +10.5194/hessd-5-1371-2008,10.5194/hess-14-2243-2010,True,True,0.8916666666666666 +10.31219/osf.io/s93tx,10.1080/19419899.2021.1875595,False,True,1.0 
+10.31730/osf.io/cxety,10.35409/ijbmer.2019.2421,False,True,0.9907407407407408 +10.1101/675090,10.1371/journal.pone.0219107,False,True,1.0 +10.20944/preprints202101.0270.v1,10.3390/s21051871,False,True,0.9498525073746312 +10.5194/acp-2020-1010,10.5194/acp-21-9585-2021,True,True,0.998531571218796 +10.31219/osf.io/j28d4,10.31014/aior.1992.03.04.310,False,True, +10.31222/osf.io/c5xu8,10.31014/aior.1992.03.04.310,False,True,0.8857142857142857 +10.5194/os-2016-41,10.5194/os-12-1279-2016,True,True,1.0 +10.1101/790352,10.1016/j.neuroscience.2020.02.016,False,True,0.9743589743589745 +10.2196/preprints.10722,10.2196/10722,False,True,1.0 +10.20944/preprints201710.0189.v1,10.3390/soc8010005,False,True,1.0 +10.2196/preprints.17323,10.2196/17323,False,True,1.0 +10.20944/preprints202005.0002.v1,10.3390/app10113953,False,True,1.0 +10.2196/preprints.6578,10.2196/pediatrics.6578,False,True,1.0 +10.21034/dp.44,10.1016/s0927-5398(01)00040-8,False,True, +10.20944/preprints201608.0072.v1,10.3390/fermentation2030016,False,True,1.0 +10.5194/esurf-2021-105,10.5194/esurf-10-875-2022,True,True,0.9392265193370166 +10.20944/preprints201809.0407.v1,10.3390/sym10100514,False,True,1.0 +10.20944/preprints201807.0609.v1,10.3390/en11082171,False,True,0.9858156028368796 +10.20944/preprints201806.0018.v1,10.3390/su10072312,False,True,0.978494623655914 +10.5194/bg-2021-170,10.5194/bg-18-6167-2021,True,True,0.9986824769433466 +10.21203/rs.2.454/v1,10.1186/s13063-019-3712-x,False,True,0.9910714285714284 +10.21203/rs.2.454/v2,10.1186/s13063-019-3712-x,False,True,0.9910714285714284 +10.1101/2020.02.04.934588,10.1016/j.cnsns.2020.105373,False,True,0.9941520467836256 +10.31235/osf.io/9uw6j,10.1017/aap.2019.4,False,True,1.0 +10.20944/preprints201802.0123.v1,10.3390/su10040947,False,True,0.9989350372736956 +10.31219/osf.io/nvz85,10.13189/ujer.2020.082273,False,True,0.975438596491228 +10.21203/rs.3.rs-68784/v2,10.1186/s13018-020-02188-2,False,True,1.0 
+10.21203/rs.3.rs-68784/v1,10.1186/s13018-020-02188-2,False,True,1.0 +10.5194/angeo-2021-38,10.5194/angeo-39-1005-2021,True,True,0.996376811594203 +10.1101/213397,10.1093/molbev/msy059,False,True,0.9847494553376906 +10.21203/rs.3.rs-122948/v1,10.1186/s12879-021-05787-4,False,True,1.0 +10.1101/468959,10.1080/1062936x.2012.742136,False,True, +10.2196/preprints.10213,10.2196/10213,False,True,0.9743589743589745 +10.20944/preprints201810.0228.v1,10.3390/children5110151,False,True,1.0 +10.21203/rs.3.rs-47855/v2,10.1186/s12879-021-05889-z,False,True,1.0 +10.21203/rs.3.rs-47855/v1,10.1186/s12879-021-05889-z,False,True,0.9910824834496332 +10.1002/essoar.10511860.1,10.1007/s11356-022-22561-4,False,True,1.0 +10.1101/139642,10.1371/journal.pone.0192081,False,True,0.9770609318996416 +10.2196/preprints.39166,10.2196/39166,False,True,1.0 +10.20944/preprints202212.0232.v1,10.3390/v15020406,False,True,1.0 +10.5194/amt-2022-196,10.5194/amt-16-707-2023,True,True,1.0 +10.2196/preprints.19397,10.2196/19397,False,True,1.0 +10.1101/496752,10.1016/j.optom.2019.10.001,False,True,0.9983660130718954 +10.1101/261370,10.1167/18.6.9,False,True,1.0 +10.5194/amt-2017-92,10.5194/amt-11-17-2018,True,True,1.0 +10.20944/preprints202102.0387.v1,10.3390/foods10030678,False,True,1.0 +10.20944/preprints201910.0017.v1,10.3390/sym11111390,False,True,1.0 +10.20944/preprints201707.0061.v1,10.3390/rel8080155,False,True,0.9885057471264368 +10.5194/wes-2018-35,10.5194/wes-3-615-2018,True,True,0.9987029831387808 +10.31235/osf.io/t25hr,10.18408/ahuri-7115001,False,True,1.0 +10.5194/se-2019-99,10.5194/se-11-241-2020,True,True,0.9732770745428972 +10.5194/amt-2021-113,10.5194/amt-14-6379-2021,True,True,0.9866666666666668 +10.2196/preprints.17782,10.2196/17782,False,True,1.0 +10.21203/rs.2.12126/v3,10.1186/s12898-019-0270-8,False,True,0.9933333333333332 +10.21203/rs.2.12126/v1,10.1186/s12898-019-0270-8,False,True,0.9909178743961352 +10.21203/rs.2.12126/v2,10.1186/s12898-019-0270-8,False,True,0.9909178743961352 
+10.31235/osf.io/8k7sp,10.4303/jdar/235992,False,True,1.0 +10.2196/preprints.23357,10.2196/23357,False,True,1.0 +10.5194/os-2022-15,10.5194/os-18-1163-2022,True,True,0.999250936329588 +10.5194/npgd-1-1133-2014,10.5194/npg-22-53-2015,True,False,1.0 +10.2196/preprints.46339,10.2196/46339,False,True,1.0 +10.2196/preprints.38176,10.2196/38176,False,True,1.0 +10.2196/preprints.44602,10.2196/44602,False,True,1.0 +10.2196/preprints.20571,10.2196/20571,False,True,1.0 +10.2196/preprints.12603,10.2196/12603,False,True,1.0 +10.20944/preprints201809.0144.v1,10.3390/resources7040076,False,True,0.9727626459143968 +10.5194/cp-2017-68,10.5194/cp-15-377-2019,True,True,1.0 +10.1101/402750,10.1098/rsif.2018.0792,False,True,1.0 +10.31234/osf.io/5bm8r,10.1163/22105832-00902006,False,True,1.0 +10.21203/rs.3.rs-2156656/v1,10.1038/s41388-023-02692-9,False,True,1.0 +10.5194/os-2020-51,10.5194/os-17-131-2021,True,True,0.9956140350877192 +10.20944/preprints202102.0336.v1,10.3390/app11062801,False,True,0.9324894514767932 +10.2196/preprints.16294,10.2196/16294,False,True,0.9784172661870504 +10.20944/preprints201809.0477.v1,10.3390/medicina54060099,False,True,0.9989417989417988 +10.2196/preprints.23400,10.2196/23400,False,True,1.0 +10.5194/hgss-2023-1,10.5194/hgss-14-61-2023,True,True,1.0 +10.26434/chemrxiv-2022-k7k0h-v6,10.1021/acs.jpcb.2c03638,False,True,0.9807692307692308 +10.5194/amt-2022-263,10.5194/amt-16-355-2023,True,True,0.9877675840978594 +10.1101/446310,10.1371/journal.pntd.0006927,False,True,0.9273689273689274 +10.1101/537035,10.1021/acs.jctc.0c00476,False,True,1.0 +10.21203/rs.3.rs-39716/v1,10.1186/s12913-021-06123-x,False,True,0.9987029831387808 +10.21203/rs.3.rs-39716/v2,10.1186/s12913-021-06123-x,False,True,1.0 +10.5194/cp-2021-15,10.5194/cp-17-2327-2021,True,True,0.8992248062015503 +10.21203/rs.3.rs-114221/v1,10.1186/s12889-021-10594-2,False,True,1.0 +10.21203/rs.3.rs-2014302/v1,10.1038/s41372-023-01642-3,False,True,1.0 
+10.1101/2021.10.21.465319,10.1002/oby.23441,False,True,0.959660297239915 +10.5194/acpd-14-14637-2014,10.5194/acp-14-12683-2014,True,True,1.0 +10.32942/osf.io/mxg6q,10.1111/1440-1703.12294,False,True,1.0 +10.5194/acpd-11-813-2011,10.5194/acp-11-8017-2011,True,True,0.938818565400844 +10.5194/acp-2016-365,10.5194/acp-17-575-2017,True,True,0.9300395256916996 +10.5194/amtd-8-8385-2015,10.5194/amt-9-359-2016,True,True,0.9983660130718954 +10.5194/acpd-8-15101-2008,10.5194/acp-9-1639-2009,True,True,1.0 +10.5194/acp-2018-739,10.5194/acp-18-17225-2018,True,True,0.9977324263038548 +10.5194/amt-2016-398,10.5194/amt-10-1911-2017,True,True,1.0 +10.5194/cpd-9-1703-2013,10.5194/cp-9-1749-2013,True,True,1.0 +10.5194/bg-2017-74,10.5194/bg-14-3883-2017,True,True,1.0 +10.5194/acp-2016-178,10.5194/acp-16-11617-2016,True,True,0.9722222222222224 +10.5194/bgd-12-3211-2015,10.5194/bg-12-3225-2015,True,True,1.0 +10.5194/tc-2016-111,10.5194/tc-10-2317-2016,True,True,1.0 +10.5194/acpd-10-21931-2010,10.5194/acp-11-5603-2011,True,True,1.0 +10.1101/2020.02.29.970913,10.3389/fmicb.2020.01037,False,True,1.0 +10.5194/tc-2017-2,10.5194/tc-11-2265-2017,True,True,0.943502824858757 +10.21203/rs.3.rs-35627/v2,10.1186/s12889-020-09979-6,False,True,0.9965277777777776 +10.21203/rs.3.rs-35627/v3,10.1186/s12889-020-09979-6,False,True,1.0 +10.21203/rs.3.rs-35627/v4,10.1186/s12889-020-09979-6,False,True,1.0 +10.21203/rs.3.rs-35627/v5,10.1186/s12889-020-09979-6,False,True,0.9965277777777776 +10.21203/rs.3.rs-35627/v1,10.1186/s12889-020-09979-6,False,True,0.9910873440285204 +10.5194/acpd-15-27501-2015,10.5194/acp-16-2477-2016,True,True,0.9437328918048964 +10.5194/acpd-11-11809-2011,10.5194/acp-12-3627-2012,True,False,1.0 +10.5194/acpd-14-9801-2014,10.5194/acp-14-9917-2014,True,True,0.9572649572649572 +10.5194/bgd-11-14269-2014,10.5194/bg-12-1131-2015,True,False,0.9866666666666668 +10.5194/amt-2016-87,10.5194/amt-9-3769-2016,True,True,0.9743589743589745 
+10.5194/acpd-15-34361-2015,10.5194/acp-16-10501-2016,True,True,0.954940867939686 +10.5194/acpd-13-2125-2013,10.5194/acp-13-11089-2013,True,True,1.0 +10.5194/acp-2016-806,10.5194/acp-18-2243-2018,True,True,0.9950980392156864 +10.5194/cp-2017-57,10.5194/cp-13-1539-2017,True,True,1.0 +10.5194/tc-2016-29,10.5194/tc-10-2241-2016,True,True,0.9985693848354792 +10.5194/acpd-9-15747-2009,10.5194/acp-9-8857-2009,True,True,1.0 +10.26434/chemrxiv.7464803.v1,10.1021/acs.langmuir.9b02574,False,True,0.954861111111111 +10.5194/nhessd-2-4685-2014,10.5194/nhess-15-109-2015,True,True,0.9206349206349206 +10.31234/osf.io/5v4wt,10.1016/j.beproc.2017.04.017,False,True, +10.5194/hessd-12-9003-2015,10.5194/hess-20-605-2016,True,False,0.9679291983488598 +10.5194/acp-2016-692,10.5194/acp-17-3279-2017,True,True,0.9863013698630138 +10.5194/acpd-10-7469-2010,10.5194/acp-10-9017-2010,True,True,1.0 +10.1101/115253,10.1093/cercor/bhx259,False,True,1.0 +10.5194/acpd-11-25709-2011,10.5194/acp-11-12959-2011,True,True,1.0 +10.5194/acpd-12-28765-2012,10.5194/acp-13-2857-2013,True,True,1.0 +10.5194/gmdd-8-5315-2015,10.5194/gmd-9-17-2016,True,True,1.0 +10.5194/gmd-2016-63,10.5194/gmd-9-3199-2016,True,True,1.0 +10.5194/acp-2017-319,10.5194/acp-17-13699-2017,True,True,0.8735930735930736 +10.5194/bgd-10-2415-2013,10.5194/bg-10-5171-2013,True,True,0.9637681159420288 +10.5194/bgd-10-9315-2013,10.5194/bg-10-7347-2013,True,True,0.9975669099756692 +10.5194/soild-2-29-2015,10.5194/soil-1-475-2015,True,True,0.9957446808510638 +10.5194/se-2017-18,10.5194/se-8-955-2017,True,True,1.0 +10.1101/012195,10.1007/s10827-015-0574-4,False,True,0.988095238095238 +10.5194/gmdd-6-2491-2013,10.5194/gmd-7-1183-2014,True,True,0.993127147766323 +10.5194/gmd-2017-293,10.5194/gmd-11-1971-2018,True,True,0.998272884283247 +10.5194/gmdd-5-1381-2012,10.5194/gmd-6-57-2013,True,True,1.0 +10.1101/2021.04.28.441869,10.1016/j.ymthe.2022.01.030,False,True,1.0 +10.1101/568790,10.3389/fimmu.2019.01066,False,True,0.9709639953542392 
+10.1101/560144,10.26508/lsa.201900358,False,True,1.0 +10.1101/2020.06.22.164814,10.1152/jn.00110.2021,False,True,1.0 +10.5194/amtd-7-11345-2014,10.5194/amt-8-2491-2015,True,True,1.0 +10.20944/preprints201803.0185.v1,10.3390/ijms19051364,False,True,0.9803921568627452 +10.5194/acpd-8-3895-2008,10.5194/acp-8-4655-2008,True,True,1.0 +10.5194/hess-2016-400,10.5194/hess-20-5049-2016,True,True,1.0 +10.5194/soild-2-647-2015,10.5194/soil-2-1-2016,True,True,1.0 +10.5194/nhess-2016-210,10.5194/nhess-16-2347-2016,True,True,0.9388560157790928 +10.5194/gmdd-6-1085-2013,10.5194/gmd-6-1641-2013,True,True,1.0 +10.5194/gmd-2016-114,10.5194/gmd-9-3605-2016,True,True,0.933719101675758 +10.5194/acpd-14-19837-2014,10.5194/acp-15-913-2015,True,True,0.9904761904761904 +10.5194/amtd-8-10755-2015,10.5194/amt-9-1613-2016,True,True,0.9866666666666668 +10.5194/acpd-15-931-2015,10.5194/acp-15-6535-2015,True,True,0.9878542510121456 +10.5194/acpd-14-2277-2014,10.5194/acp-14-6557-2014,True,True,0.998003992015968 +10.5194/acpd-12-16647-2012,10.5194/acp-12-11795-2012,True,True,1.0 +10.20944/preprints201805.0070.v1,10.3390/diagnostics8020041,False,True,0.9658260233918128 +10.20944/preprints202105.0096.v1,10.3390/nu13061875,False,True,0.993992722253592 +10.5194/acp-2018-37,10.5194/acp-18-6761-2018,True,True,1.0 +10.5194/amtd-5-2111-2012,10.5194/amt-5-1719-2012,True,True,1.0 +10.26434/chemrxiv.8220599.v1,10.1021/acs.chemrestox.9b00255,False,True,0.996031746031746 +10.5194/osd-11-1719-2014,10.5194/os-11-269-2015,True,True,1.0 +10.5194/osd-11-693-2014,10.5194/os-10-587-2014,True,True,1.0 +10.5194/cp-2017-49,10.5194/cp-14-157-2018,True,True,0.987468671679198 +10.5194/acpd-11-4533-2011,10.5194/acp-11-6721-2011,True,False,1.0 +10.5194/sed-6-2567-2014,10.5194/se-5-1169-2014,True,True,0.9886264216972878 +10.5194/cpd-8-1523-2012,10.5194/cp-8-1801-2012,True,True,0.9982547993019196 +10.5194/gmd-2016-37,10.5194/gmd-9-3111-2016,True,True,0.9759036144578314 
+10.5194/soil-2016-63,10.5194/soil-3-67-2017,True,True,0.9228395061728396 +10.5194/acp-2016-7,10.5194/acp-16-7653-2016,True,True,0.978593272171254 +10.1101/165357,10.1371/journal.pcbi.1005868,False,True,1.0 +10.5194/nhess-2016-66,10.5194/nhess-16-2247-2016,True,True,1.0 +10.2196/preprints.9154,10.2196/resprot.9154,False,True,1.0 +10.1101/568212,10.1098/rspb.2019.1818,False,True,1.0 +10.5194/gmdd-7-931-2014,10.5194/gmd-7-2411-2014,True,True,0.9975669099756692 +10.1101/2021.06.19.449118,10.1158/2767-9764.crc-22-0003,False,True, +10.2196/preprints.20509,10.2196/20509,False,True,1.0 +10.5194/osd-10-691-2013,10.5194/os-9-885-2013,True,True,0.9607843137254902 +10.20944/preprints202007.0409.v1,10.3390/en13174331,False,True,1.0 +10.5194/se-2016-55,10.5194/se-7-1085-2016,True,True,0.9526748971193416 +10.5194/nhessd-1-3891-2013,10.5194/nhess-14-1257-2014,True,True,0.930117899249732 +10.5194/acpd-13-32291-2013,10.5194/acp-14-7485-2014,True,True,1.0 +10.5194/acpd-6-3099-2006,10.5194/acp-6-3243-2006,True,True,0.9629629629629628 +10.26434/chemrxiv-2022-dnl9p,10.1021/acs.inorgchem.2c01171,False,True,1.0 +10.5194/bgd-10-19311-2013,10.5194/bg-11-4015-2014,True,True,1.0 +10.5194/bgd-10-7013-2013,10.5194/bg-10-6807-2013,True,True, +10.5194/bgd-10-2305-2013,10.5194/bg-10-7263-2013,True,True,1.0 +10.1101/2022.01.11.475674,10.1016/j.nbd.2022.105879,False,True,1.0 +10.1101/517243,10.1016/j.neuroimage.2019.116175,False,True,1.0 +10.20944/preprints201808.0196.v1,10.3390/e20110840,False,True,0.949874686716792 +10.5194/acpd-14-24573-2014,10.5194/acp-15-6047-2015,True,True,0.9977324263038548 +10.5194/hessd-9-2717-2012,10.5194/hess-16-3817-2012,True,True,0.8919753086419754 +10.2196/preprints.42403,10.2196/42403,False,True,1.0 +10.2196/preprints.15105,10.2196/15105,False,True,0.9085858585858584 +10.5194/acpd-12-14115-2012,10.5194/acp-12-11037-2012,True,True,1.0 +10.5194/acpd-12-6593-2012,10.5194/acp-12-10331-2012,True,True,0.9982547993019196 
+10.5194/acpd-12-20007-2012,10.5194/acp-13-3849-2013,True,True,1.0 +10.2196/preprints.9966,10.2196/jmir.9966,False,True,1.0 +10.5194/osd-3-939-2006,10.5194/os-3-129-2007,True,True,1.0 +10.1101/2020.02.16.951954,10.15252/embj.2020104708,False,True,0.9681704260651628 +10.5194/bgd-8-941-2011,10.5194/bg-8-2523-2011,True,True,0.9915764139590856 +10.5194/bgd-8-7165-2011,10.5194/bg-8-3609-2011,True,True,1.0 +10.1101/791319,10.1523/jneurosci.2416-19.2020,False,True,0.990138067061144 +10.5194/tcd-8-5361-2014,10.5194/tc-9-103-2015,True,True,1.0 +10.5194/tcd-6-5119-2012,10.5194/tc-7-1139-2013,True,True,0.9551724137931036 +10.1101/2021.10.23.465582,10.1523/jneurosci.2145-21.2022,False,True,1.0 +10.5194/tcd-6-2265-2012,10.5194/tc-7-1-2013,True,True,0.989384288747346 +10.2196/preprints.13662,10.2196/13662,False,True,1.0 +10.31231/osf.io/3pxzd,10.1037/ccp0000092,False,True,0.9423740510697032 +10.5194/hessd-7-9173-2010,10.5194/hess-15-1339-2011,True,True,1.0 +10.5194/bgd-9-14291-2012,10.5194/bg-10-5079-2013,True,True,1.0 +10.5194/acpd-11-4631-2011,10.5194/acp-11-7629-2011,True,True,1.0 +10.2196/preprints.17542,10.2196/17542,False,True,1.0 +10.5194/hessd-7-621-2010,10.5194/hess-14-719-2010,True,True,1.0 +10.5194/acpd-12-1451-2012,10.5194/acp-12-5755-2012,True,True,0.998015873015873 +10.5194/gmdd-6-3655-2013,10.5194/gmd-6-2153-2013,True,True,0.9890611279972982 +10.1101/2021.07.14.452404,10.1111/2041-210x.13821,False,True,0.9930555555555556 +10.5194/bgd-10-17071-2013,10.5194/bg-11-3131-2014,True,True,0.99457111834962 +10.1101/600312,10.1111/tpj.14670,False,True,0.9085648148148148 +10.5194/cpd-7-775-2011,10.5194/cp-7-917-2011,True,True,1.0 +10.5194/acpd-13-27779-2013,10.5194/acp-14-2383-2014,True,True,0.9473684210526316 +10.5194/esdd-5-779-2014,10.5194/esd-5-423-2014,True,False,1.0 +10.21203/rs.2.11587/v2,10.1186/s12884-019-2590-2,False,True,1.0 +10.21203/rs.2.11587/v1,10.1186/s12884-019-2590-2,False,True,1.0 +10.21203/rs.2.11587/v4,10.1186/s12884-019-2590-2,False,True,1.0 
+10.21203/rs.2.11587/v3,10.1186/s12884-019-2590-2,False,True,1.0 +10.21203/rs.2.11587/v5,10.1186/s12884-019-2590-2,False,True,1.0 +10.2196/preprints.12347,10.2196/12347,False,True,1.0 +10.5194/se-2016-11,10.5194/se-7-599-2016,True,True,0.9022946859903382 +10.5194/bg-2015-647,10.5194/bg-13-5511-2016,True,True,1.0 +10.1101/087577,10.1371/journal.pgen.1006793,False,True,0.912630579297246 +10.5194/acpd-9-13327-2009,10.5194/acp-9-8651-2009,True,True,1.0 +10.5194/hessd-6-4307-2009,10.5194/hess-13-2151-2009,True,True,1.0 +10.5194/acpd-11-4807-2011,10.5194/acp-11-6297-2011,True,True,0.9947916666666666 +10.5194/acpd-11-8337-2011,10.5194/acp-11-8415-2011,True,True,1.0 +10.1101/233924,10.1002/ece3.3872,False,True,0.9987129987129988 +10.21203/rs.3.rs-997649/v1,10.1007/s10637-022-01218-6,False,True,0.9977477477477475 +10.1101/359018,10.1152/jn.00601.2018,False,True,1.0 +10.1101/247189,10.1111/evo.13573,False,True,1.0 +10.5194/bgd-10-14093-2013,10.5194/bg-10-8223-2013,True,True,0.9901960784313726 +10.20944/preprints202009.0582.v1,10.3390/jmse8100756,False,True,1.0 +10.1101/370155,10.1371/journal.pone.0201329,False,True,0.9853249475890984 +10.2196/preprints.18338,10.2196/18338,False,True,1.0 +10.5194/acpd-11-13867-2011,10.5194/acp-11-10911-2011,True,True,1.0 +10.21203/rs.3.rs-61529/v2,10.1186/s13049-020-00818-6,False,True,0.98989898989899 +10.21203/rs.3.rs-61529/v1,10.1186/s13049-020-00818-6,False,True,1.0 +10.1101/868307,10.1093/cercor/bhaa146,False,True,1.0 +10.5194/acpd-11-9887-2011,10.5194/acp-11-11867-2011,True,True,0.9621952608794714 +10.5194/acpd-11-11649-2011,10.5194/acp-11-12751-2011,True,True,1.0 +10.5194/bgd-12-15495-2015,10.5194/bg-13-3619-2016,True,True,0.903925364758698 +10.5194/bgd-12-7705-2015,10.5194/bg-12-5277-2015,True,True,1.0 +10.1101/2020.04.14.041145,10.1158/1078-0432.ccr-20-1762,False,True,0.9487179487179488 +10.5194/acpd-10-24245-2010,10.5194/acp-11-767-2011,True,True,0.9993238674780256 +10.31234/osf.io/k4anx,10.1186/s12887-019-1818-7,False,True,1.0 
+10.5194/bgd-10-19005-2013,10.5194/bg-11-2793-2014,True,True,1.0 +10.20944/preprints202004.0309.v1,10.3390/rs12111748,False,True,1.0 +10.5194/acpd-9-16715-2009,10.5194/acp-10-1269-2010,True,True,1.0 +10.5194/hessd-12-12615-2015,10.5194/hess-20-2691-2016,True,True, +10.5194/amtd-7-1917-2014,10.5194/amt-7-2097-2014,True,True,1.0 +10.5194/amtd-6-1771-2013,10.5194/amt-6-1903-2013,True,True,1.0 +10.5194/hessd-10-2373-2013,10.5194/hess-18-595-2014,True,True, +10.5194/acpd-13-2913-2013,10.5194/acp-13-6473-2013,True,True,0.9876543209876544 +10.5194/bgd-10-17043-2013,10.5194/bg-11-2519-2014,True,True, +10.5194/acpd-15-6125-2015,10.5194/acp-15-9003-2015,True,True,1.0 +10.5194/acpd-9-6397-2009,10.5194/acp-9-5093-2009,True,True,0.9914529914529916 +10.5194/acpd-10-23657-2010,10.5194/acp-11-1621-2011,True,True,1.0 +10.5194/bgd-11-10917-2014,10.5194/bg-11-7025-2014,True,True,0.9966329966329966 +10.5194/acpd-13-18951-2013,10.5194/acp-13-11169-2013,True,True,1.0 +10.5194/hessd-4-3087-2007,10.5194/hess-12-405-2008,True,True,1.0 +10.5194/bgd-9-19121-2012,10.5194/bg-10-2315-2013,True,False,0.9976359338061466 +10.2196/preprints.11364,10.2196/11364,False,True,1.0 +10.5194/acpd-11-19011-2011,10.5194/acp-12-11085-2012,True,True, +10.5194/sed-5-257-2013,10.5194/se-4-255-2013,True,True,1.0 +10.31234/osf.io/84uqz,10.1111/psyp.14242,False,True,0.9692307692307692 +10.5194/esurfd-1-745-2013,10.5194/esurf-2-363-2014,True,True,1.0 +10.21203/rs.3.rs-474980/v1,10.1038/s42003-021-02885-6,False,True,0.998148148148148 +10.1101/2021.10.15.464543,10.1021/acs.jcim.1c01269,False,True,1.0 +10.5194/hessd-10-9847-2013,10.5194/hess-17-5213-2013,True,True,1.0 +10.5194/acpd-11-30757-2011,10.5194/acp-12-4885-2012,True,True,1.0 +10.5194/tcd-8-3367-2014,10.5194/tc-9-65-2015,True,True,1.0 +10.5194/acpd-14-19515-2014,10.5194/acp-15-99-2015,True,False,1.0 +10.5194/bgd-8-5849-2011,10.5194/bg-9-593-2012,True,True,1.0 +10.5194/bgd-10-17549-2013,10.5194/bg-11-4459-2014,True,True,1.0 
+10.5194/acpd-14-4189-2014,10.5194/acp-14-7075-2014,True,True,1.0 +10.5194/hessd-9-5531-2012,10.5194/hess-16-3749-2012,True,False,0.9987325728770596 +10.5194/bgd-9-2153-2012,10.5194/bg-9-2301-2012,True,True,0.9831649831649832 +10.5194/acpd-9-5809-2009,10.5194/acp-9-6479-2009,True,False,1.0 +10.5194/cpd-9-1735-2013,10.5194/cp-9-1773-2013,True,True,0.9965277777777776 +10.5194/osd-7-995-2010,10.5194/os-7-175-2011,True,True,0.9966666666666668 +10.1101/116426,10.1088/1478-3975/aa6b67,False,True,1.0 +10.5194/tc-2016-199,10.5194/tc-11-47-2017,True,True,1.0 +10.5194/tcd-8-4823-2014,10.5194/tc-9-53-2015,True,True,1.0 +10.5194/cp-2016-131,10.5194/cp-13-1153-2017,True,True,1.0 +10.26434/chemrxiv.11514189.v1,10.1002/anie.201915493,False,True,0.9611046776853706 +10.26434/chemrxiv.11514189.v2,10.1002/anie.201915493,False,True,0.9611046776853706 +10.1101/2020.12.18.423427,10.1038/s41422-021-00495-9,False,True,0.9445194182036288 +10.5194/cpd-11-3277-2015,10.5194/cp-12-455-2016,True,True, +10.21203/rs.2.14334/v2,10.1186/s12909-019-1876-4,False,True,1.0 +10.21203/rs.2.14334/v3,10.1186/s12909-019-1876-4,False,True,1.0 +10.21203/rs.2.14334/v1,10.1186/s12909-019-1876-4,False,True,1.0 +10.1101/2021.04.26.441285,10.1021/acsnano.1c06488,False,True,1.0 +10.5194/esd-2020-80,10.5194/esd-12-367-2021,True,True,0.9983249581239532 +10.20944/preprints201810.0612.v1,10.3390/electronics7120347,False,True,0.925925925925926 +10.1101/209718,10.1038/s41592-018-0002-6,False,True,0.9055876685934487 +10.5194/cpd-11-3143-2015,10.5194/cp-12-819-2016,True,True,0.9324444444444444 +10.5194/cpd-9-5837-2013,10.5194/cp-10-759-2014,True,True,0.9959349593495936 +10.2196/preprints.9498,10.2196/jmir.9498,False,True,1.0 +10.2196/preprints.9498.a,10.2196/jmir.9498,False,True,1.0 +10.5194/cp-2018-60,10.5194/cp-15-1063-2019,True,True,0.8722741433021807 +10.5194/acp-2022-387,10.5194/acp-22-13897-2022,True,True,0.9963099630996308 +10.5194/acp-2021-870,10.5194/acp-22-12961-2022,True,True,0.9591397849462364 
+10.5194/bg-2019-145,10.5194/bg-16-3377-2019,True,True,1.0 +10.5194/tc-2018-131,10.5194/tc-13-219-2019,True,True,0.9820193637621024 +10.5194/gmdd-8-7063-2015,10.5194/gmd-9-1293-2016,True,True,0.91999806765503 +10.1101/223248,10.1093/jnci/djy081,False,True,0.9974747474747474 +10.5194/hess-2021-2,10.5194/hess-25-5749-2021,True,True,0.9780786589297228 +10.5194/hessd-10-14705-2013,10.5194/hess-19-389-2015,True,True,0.9164912280701756 +10.5194/hessd-9-10563-2012,10.5194/hess-17-817-2013,True,False,1.0 +10.5194/hessd-11-9183-2014,10.5194/hess-19-1247-2015,True,False,1.0 +10.1101/542282,10.1093/jxb/erz182,False,True,0.971326164874552 +10.5194/essd-2022-239,10.5194/essd-15-1675-2023,True,True,0.9861239592969472 +10.1101/2020.08.04.237156,10.1002/jev2.12079,False,True,0.9973544973544972 +10.1101/198671,10.1038/s41593-019-0359-6,False,True,1.0 +10.1101/2020.07.13.200360,10.1371/journal.pone.0236612,False,True,0.9743589743589745 +10.1101/462861,10.1371/journal.pone.0207555,False,True,0.9920496894409938 +10.20944/preprints201608.0123.v1,10.3390/s16081290,False,True,0.9979423868312756 +10.5194/acpd-15-14889-2015,10.5194/acp-15-11165-2015,True,True,1.0 +10.26434/chemrxiv-2021-70pvw,10.1021/acs.chemmater.1c04167,False,True,1.0 +10.5194/hessd-8-5319-2011,10.5194/hess-15-2839-2011,True,True,1.0 +10.5194/hessd-12-1809-2015,10.5194/hess-20-3873-2016,True,True,0.9321789321789322 +10.5194/hess-2022-60,10.5194/hess-26-6399-2022,True,True,0.9314420803782508 +10.5194/hessd-2-2427-2005,10.5194/hess-10-535-2006,True,True,1.0 +10.5194/hessd-11-6881-2014,10.5194/hess-19-1225-2015,True,False,1.0 +10.5194/hess-2019-461,10.5194/hess-24-3015-2020,True,True,0.9784172661870504 +10.1101/2021.06.16.448617,10.1084/jem.20211112,False,True,0.96 +10.1101/2020.02.16.942904,10.3390/cancers12051171,False,True,0.9784172661870504 +10.5194/gmd-2022-173,10.5194/gmd-16-1617-2023,True,True,0.9973958333333334 +10.5194/hessd-8-9961-2011,10.5194/hess-16-1445-2012,True,False,1.0 
+10.21034/sr.410,10.1086/666589,False,True,1.0 +10.5194/acpd-11-32601-2011,10.5194/acp-12-3273-2012,True,True,1.0 +10.5194/hess-2018-334,10.5194/hess-22-5987-2018,True,True,0.9670781893004116 +10.1101/2022.07.01.498411,10.7554/elife.81184,True,False,0.8990378213475783 +10.1101/374660,10.1016/j.celrep.2018.10.079,False,True,0.9166666666666666 +10.21203/rs.3.rs-87483/v1,10.1186/s40658-020-00350-7,False,True,1.0 +10.1101/2020.08.27.269647,10.1186/s40478-020-01068-4,False,True,0.9691358024691358 +10.1101/028886,10.1038/nature17661,False,True,0.992156862745098 +10.1101/2021.08.11.455980,10.7554/elife.83652,True,False,0.9791666666666666 +10.1101/596569,10.1016/j.celrep.2019.10.056,False,True,1.0 +10.5194/mr-2020-13,10.5194/mr-1-209-2020,True,True,0.9714285714285714 +10.26434/chemrxiv-2021-t1b6t,10.1021/jacs.2c03024,False,True, +10.5194/mr-2021-9,10.5194/mr-2-375-2021,True,True,1.0 +10.5194/mr-2020-5,10.5194/mr-1-59-2020,True,True,1.0 +10.5194/acpd-15-10899-2015,10.5194/acp-15-8751-2015,True,True,1.0 +10.1101/529156,10.1371/journal.pgen.1008458,False,True,0.9487922705314008 +10.1101/2021.09.14.460327,10.1523/eneuro.0373-21.2022,False,True,0.875 +10.5194/bg-2018-512,10.5194/bg-16-2635-2019,True,True,1.0 +10.5194/acp-2020-17,10.5194/acp-20-9281-2020,True,True,0.9971014492753624 +10.1101/2021.11.03.467174,10.7554/elife.75272,True,True,1.0 +10.26434/chemrxiv.7990910.v2,10.1021/acs.jcim.9b00325,False,True,0.98635477582846 +10.26434/chemrxiv.7990910.v1,10.1021/acs.jcim.9b00325,False,True,0.98635477582846 +10.21203/rs.3.rs-677091/v1,10.1038/s41467-021-26199-7,False,True,0.993103448275862 +10.1101/2020.04.02.022541,10.1021/acschembio.0c00348,False,True,1.0 +10.5194/wes-2021-156,10.5194/wes-7-2307-2022,True,True,1.0 +10.1101/050237,10.1371/journal.pone.0170622,False,True,0.9890453834115808 +10.5194/osd-12-135-2015,10.5194/os-11-629-2015,True,True,1.0 +10.1101/072470,10.1038/s41586-018-0124-0,False,True, +10.31234/osf.io/dbkj6,10.1111/bjso.12399,False,True,0.9777777777777776 
+10.21203/rs.3.rs-136113/v1,10.1186/s13011-021-00358-x,False,True,1.0 +10.26434/chemrxiv.9756785.v1,10.1021/acsmedchemlett.9b00399,False,True,0.9969135802469136 +10.1101/2021.04.23.441115,10.7554/elife.69223,True,True,0.9528769841269842 +10.1101/328211,10.1007/s00339-019-2480-5,False,True,0.9629629629629628 +10.1101/2022.10.06.511106,10.7554/elife.83761,True,False,1.0 +10.1101/146852,10.1038/s41589-018-0013-8,False,True,0.9523809523809524 +10.31234/osf.io/vxa86,10.1037/met0000179,False,True,0.9985569985569984 +10.21203/rs.2.20459/v1,10.1186/s12870-020-2311-z,False,True,0.9845288326300984 +10.21203/rs.2.20459/v2,10.1186/s12870-020-2311-z,False,True,1.0 +10.21203/rs.2.20459/v3,10.1186/s12870-020-2311-z,False,True,1.0 +10.1101/122044,10.1371/journal.pcbi.1005890,False,True,1.0 +10.2196/preprints.14675,10.2196/14675,False,True,1.0 +10.32942/osf.io/s5dnr,10.1111/jeb.13728,False,True,0.942857142857143 +10.1101/2020.10.28.358846,10.1071/fp21337,False,True,0.93 +10.1101/2021.06.26.449853,10.1002/glia.24190,False,True,1.0 +10.1101/2021.04.13.439588,10.7554/elife.69377,True,True,0.9671445639187576 +10.5194/acp-2020-1041,10.5194/acp-21-9909-2021,True,True,0.9969418960244648 +10.5194/sed-4-1069-2012,10.5194/se-3-355-2012,True,True,1.0 +10.5194/hess-2021-41,10.5194/hess-25-4917-2021,True,True,0.9851387437594336 +10.21203/rs.3.rs-35889/v1,10.1186/s12883-020-01958-z,False,True,0.9662234998203376 +10.21203/rs.3.rs-35889/v2,10.1186/s12883-020-01958-z,False,True,1.0 +10.1101/2022.05.10.491316,10.7554/elife.78810,True,False,1.0 +10.2196/preprints.17997,10.2196/17997,False,True,1.0 +10.1101/317552,10.1038/s41396-018-0240-8,False,True, +10.1101/719922,10.7554/elife.88350,True,False, +10.2196/preprints.12957,10.2196/12957,False,True,1.0 +10.1101/322388,10.1016/j.celrep.2019.05.006,False,True,0.9878183831672204 +10.1101/431718,10.1016/j.neuroimage.2019.03.019,False,True,1.0 +10.21203/rs.3.rs-97961/v2,10.1186/s12933-021-01222-9,False,True,1.0 
+10.21203/rs.3.rs-97961/v1,10.1186/s12933-021-01222-9,False,True,1.0 +10.5194/gmd-2019-295,10.5194/gmd-13-873-2020,True,True,0.988835725677831 +10.1101/294587,10.1016/j.eclinm.2019.06.003,False,True,0.9437857708706062 +10.1101/426957,10.1186/s12864-018-5299-0,False,True,1.0 +10.21203/rs.2.16448/v1,10.1186/s12864-020-6471-x,False,True,1.0 +10.21203/rs.2.16448/v2,10.1186/s12864-020-6471-x,False,True,1.0 +10.1101/2021.02.09.430442,10.1523/jneurosci.0556-21.2021,False,True,0.9092592592592592 +10.5194/essd-2020-16,10.5194/essd-12-1789-2020,True,True,0.9985693848354792 +10.21203/rs.3.rs-65516/v1,10.1186/s12864-021-07431-6,False,True,1.0 +10.21203/rs.3.rs-65516/v2,10.1186/s12864-021-07431-6,False,True,1.0 +10.1101/370874,10.1186/s41073-019-0069-3,False,True, +10.20944/preprints201908.0008.v1,10.3390/ma12182960,False,True,0.9965635738831616 +10.1101/2020.04.24.059840,10.15252/embj.2019104136,False,True,0.9988344988344988 +10.5194/hessd-5-2791-2008,10.5194/hess-13-467-2009,True,True,1.0 +10.21034/wp.274,10.2307/1391384,False,True,0.9977324263038548 +10.21034/sr.498,10.1086/707735,False,True, +10.5194/hessd-12-8091-2015,10.5194/hess-20-175-2016,True,True,0.9803921568627452 +10.1101/2020.03.02.972521,10.1016/j.foreco.2020.118344,False,True,1.0 +10.5194/acp-2015-1028,10.5194/acp-16-6041-2016,True,True,0.9331369079944484 +10.20944/preprints202011.0348.v1,10.3390/en14030635,False,True,1.0 +10.20944/preprints201609.0106.v2,10.3390/mca22010017,False,True,0.9957446808510638 +10.20944/preprints201609.0106.v1,10.3390/mca22010017,False,True,0.9957446808510638 +10.21203/rs.3.rs-38976/v2,10.1186/s12960-020-00532-5,False,True,0.9913644214162348 +10.21203/rs.3.rs-38976/v1,10.1186/s12960-020-00532-5,False,True,0.9913644214162348 +10.21203/rs.3.rs-38976/v3,10.1186/s12960-020-00532-5,False,True,1.0 +10.5194/acpd-4-399-2004,10.5194/acp-4-801-2004,True,True,0.9197530864197532 +10.5194/acp-2021-58,10.5194/acp-21-13483-2021,True,True,1.0 
+10.5194/cpd-7-4173-2011,10.5194/cp-8-855-2012,True,True,1.0 +10.5194/hess-2022-117,10.5194/hess-26-4953-2022,True,True,1.0 +10.1101/059329,10.1093/nar/gkw627,False,True,1.0 +10.1101/054247,10.1016/j.neuron.2016.08.007,False,True,1.0 +10.5194/bg-2017-53,10.5194/bg-15-13-2018,True,True,1.0 +10.5194/acpd-2-1735-2002,10.5194/acp-3-303-2003,True,True,1.0 +10.1101/2021.06.09.447533,10.7554/elife.71569,True,False,0.979381443298969 +10.26434/chemrxiv.14541432.v1,10.1021/acscentsci.1c00592,False,True,1.0 +10.5194/esurfd-1-1-2013,10.5194/esurf-1-1-2013,True,True,1.0 +10.21203/rs.3.rs-995821/v1,10.1007/s10533-022-00915-x,False,True,1.0 +10.26434/chemrxiv.11985357.v1,10.1021/acschemneuro.0c00479,False,True,0.9789397240377632 +10.26434/chemrxiv.11985357,10.1021/acschemneuro.0c00479,False,True,0.9984567901234568 +10.1101/185520,10.1038/npp.2017.250,False,True,1.0 +10.21203/rs.3.rs-39782/v1,10.1186/s12885-021-07994-3,False,True, +10.21203/rs.3.rs-39782/v2,10.1186/s12885-021-07994-3,False,True,1.0 +10.21034/sr.516,10.1257/aer.20151260,False,True,1.0 +10.2196/preprints.16665,10.2196/16665,False,True,1.0 +10.20944/preprints201912.0205.v1,10.3390/ijerph17020616,False,True,1.0 +10.20944/preprints201912.0205.v2,10.3390/ijerph17020616,False,True,1.0 +10.1101/465096,10.1371/journal.ppat.1007460,False,True,0.9405399726862064 +10.1101/031260,10.1103/physrevlett.116.248101,False,True,1.0 +10.31234/osf.io/p5gns,10.1017/s0033291721001306,False,True,0.9601748959617086 +10.1101/2020.07.16.207662,10.1002/advs.202001572,False,True,1.0 +10.1101/135814,10.1093/nar/gkx607,False,True,0.9864208543958768 +10.1101/2021.05.03.442388,10.1523/jneurosci.0933-21.2021,False,True,1.0 +10.1101/542381,10.15252/msb.20209880,False,True,0.9792843691148776 +10.1101/2020.09.28.316653,10.1016/j.molcel.2020.10.031,False,True,1.0 +10.1101/370775,10.1136/bmjopen-2018-026211,False,True, +10.20944/preprints202009.0192.v1,10.3390/cancers12102798,False,True,0.9914529914529916 
+10.1101/430124,10.26508/lsa.201800162,False,True,1.0 +10.1101/2022.09.05.506603,10.7554/elife.83153,True,False,0.9729729729729728 +10.5194/acp-2020-543,10.5194/acp-21-3395-2021,True,True,0.963226571767497 +10.5194/essd-2019-118,10.5194/essd-12-789-2020,True,True,1.0 +10.1101/2020.05.04.077040,10.1016/j.nicl.2020.102353,False,True,1.0 +10.5194/acp-2020-909,10.5194/acp-21-8915-2021,True,True,1.0 +10.1101/2020.05.06.081356,10.1002/bit.27473,False,True,0.9810874704491724 +10.1101/2020.10.29.355859,10.1038/s41388-021-01876-5,False,True,0.9841269841269842 +10.5194/acp-2020-674,10.5194/acp-21-2305-2021,True,True,0.9862258953168044 +10.26434/chemrxiv.9684470.v1,10.1021/acsmacrolett.9b00717,False,True,0.9807852965747702 +10.5194/acp-2019-580,10.5194/acp-20-753-2020,True,True,0.9407407407407408 +10.5194/tc-2022-217,10.5194/tc-17-3593-2023,True,True,1.0 +10.5194/amt-2018-397,10.5194/amt-12-2819-2019,True,True,0.9985569985569984 +10.5194/acpd-11-17879-2011,10.5194/acp-11-9237-2011,True,True,1.0 +10.1101/2020.03.13.990887,10.1038/s41594-020-0465-x,False,True,0.9803921568627452 +10.5194/acpd-14-7141-2014,10.5194/acp-14-10411-2014,True,True,0.948073701842546 +10.26434/chemrxiv.12743720,10.1021/acssensors.0c02264,False,True,0.9817042606516292 +10.26434/chemrxiv.12743720.v1,10.1021/acssensors.0c02264,False,True,0.986466165413534 +10.21203/rs.3.rs-2145653/v1,10.1038/s41388-022-02585-3,False,True,0.914092014536055 +10.1101/2020.05.20.106575,10.15252/embj.2020106230,False,True,0.9282787454386976 +10.31234/osf.io/j2bzc,10.1016/j.jad.2022.12.162,False,True,1.0 +10.2196/preprints.20457,10.2196/20457,False,True,1.0 +10.5194/acp-2018-761,10.5194/acp-19-233-2019,True,True,0.980213089802131 +10.5194/amt-2018-258,10.5194/amt-12-955-2019,True,True,0.99860529986053 +10.20944/preprints202010.0084.v2,10.3390/cancers12113327,False,True,1.0 +10.20944/preprints202010.0084.v1,10.3390/cancers12113327,False,True,1.0 +10.21203/rs.3.rs-32295/v1,10.1186/s13287-020-02000-2,False,True,1.0 
+10.21203/rs.3.rs-32295/v2,10.1186/s13287-020-02000-2,False,True,1.0 +10.1101/760777,10.1007/s13205-020-2084-y,False,True,0.9902370990237098 +10.1101/591065,10.1371/journal.pgen.1008501,False,True,1.0 +10.2196/preprints.25469,10.2196/25469,False,True,1.0 +10.1101/276618,10.1186/s12885-018-4757-z,False,True,1.0 +10.1101/055863,10.1038/nmeth.4108,False,True,1.0 +10.1101/2022.02.11.479825,10.1172/jci159402,False,True,0.9957805907172996 +10.1101/549873,10.1111/oik.07213,False,True,0.9989615784008308 +10.5194/tc-2019-30,10.5194/tc-13-1709-2019,True,True,1.0 +10.20944/preprints202103.0467.v1,10.3390/rs13081581,False,True,1.0 +10.5194/amtd-7-5491-2014,10.5194/amt-8-1701-2015,True,True,0.9948717948717948 +10.1101/2020.04.28.066605,10.3389/fcell.2020.00617,False,True,1.0 +10.1101/2021.12.22.473713,10.1002/advs.202200315,False,True,0.9963369963369964 +10.21203/rs.3.rs-2240657/v1,10.1038/s41467-023-35915-4,False,True,1.0 +10.5194/acp-2021-182,10.5194/acp-21-9329-2021,True,True,1.0 +10.1101/416305,10.1021/jacs.8b10840,False,True,1.0 +10.1101/127761,10.1186/s13059-017-1218-y,False,True,1.0 +10.5194/bg-2022-101,10.5194/bg-19-4655-2022,True,True,0.996376811594203 +10.1101/2020.01.13.905471,10.1523/jneurosci.2809-19.2020,False,True,0.9659090909090908 +10.1101/857987,10.1523/jneurosci.1468-19.2020,False,True,0.89788748538998 +10.21203/rs.3.rs-38299/v1,10.1186/s13018-020-02039-0,False,True, +10.21203/rs.3.rs-38299/v2,10.1186/s13018-020-02039-0,False,True,1.0 +10.20944/preprints202007.0501.v1,10.3390/en13174422,False,True,1.0 +10.21203/rs.3.rs-1523403/v1,10.1038/s41591-022-02202-6,False,True,0.9626833586851126 +10.26434/chemrxiv.9994940.v1,10.1021/acs.jpclett.0c00121,False,True,0.992248062015504 +10.21203/rs.3.rs-32573/v1,10.1186/s12985-020-01417-8,False,True,1.0 +10.21203/rs.3.rs-32573/v2,10.1186/s12985-020-01417-8,False,True,1.0 +10.21203/rs.3.rs-17623/v1,10.1007/s40145-020-0410-9,False,True,0.9696969696969696 
+10.21203/rs.3.rs-17623/v2,10.1007/s40145-020-0410-9,False,True,0.9696969696969696 +10.5194/tcd-9-2597-2015,10.5194/tc-9-2201-2015,True,True,1.0 +10.5194/hessd-10-15771-2013,10.5194/hess-18-2287-2014,True,True,0.9458128078817736 +10.21203/rs.2.16987/v2,10.1186/s12881-020-01156-1,False,True,0.9550997150997153 +10.21203/rs.2.16987/v1,10.1186/s12881-020-01156-1,False,True,0.9550997150997153 +10.21203/rs.2.16987/v4,10.1186/s12881-020-01156-1,False,True,0.9550997150997153 +10.21203/rs.2.16987/v3,10.1186/s12881-020-01156-1,False,True,0.9743589743589745 +10.5194/acp-2021-173,10.5194/acp-21-15023-2021,True,True,0.9869061137513844 +10.20944/preprints201808.0402.v1,10.3390/s18113670,False,True,1.0 +10.26434/chemrxiv.13055873.v2,10.1021/acsomega.0c04691,False,True,1.0 +10.26434/chemrxiv.13055873,10.1021/acsomega.0c04691,False,True,1.0 +10.26434/chemrxiv.13055873.v1,10.1021/acsomega.0c04691,False,True,1.0 +10.5194/acpd-11-163-2011,10.5194/acp-11-9683-2011,True,True,0.994017094017094 +10.5194/acpd-13-2795-2013,10.5194/acp-13-8607-2013,True,False,1.0 +10.2196/preprints.16513,10.2196/16513,False,True,1.0 +10.5194/bgd-6-11035-2009,10.5194/bg-7-1443-2010,True,True,0.9955357142857144 +10.1101/2021.03.16.435577,10.1002/glia.24106,False,True,0.9912609238451936 +10.21203/rs.3.rs-65568/v2,10.1186/s13643-021-01612-w,False,True,1.0 +10.21203/rs.3.rs-65568/v1,10.1186/s13643-021-01612-w,False,True,1.0 +10.21034/wp.742,10.1257/mac.20170367,False,True,1.0 +10.26434/chemrxiv-2021-k4v9r,10.1021/jacs.1c09321,False,True,1.0 +10.1101/2021.06.10.447962,10.1002/smll.202103552,False,True,0.971118761485915 +10.21203/rs.2.13144/v4,10.1186/s12879-019-4618-7,False,True,1.0 +10.21203/rs.2.13144/v3,10.1186/s12879-019-4618-7,False,True,1.0 +10.21203/rs.2.13144/v2,10.1186/s12879-019-4618-7,False,True,1.0 +10.21203/rs.2.13144/v1,10.1186/s12879-019-4618-7,False,True,1.0 +10.26434/chemrxiv.12369758.v1,10.1021/acschembio.0c00426,False,True,0.903858024691358 
+10.26434/chemrxiv.12369758,10.1021/acschembio.0c00426,False,True,0.903858024691358 +10.5194/bgd-5-3157-2008,10.5194/bg-6-405-2009,True,True,1.0 +10.1101/2020.05.03.20089383,10.1016/j.bbi.2020.08.021,False,True,1.0 +10.21203/rs.3.rs-1293101/v1,10.1038/s41562-023-01540-w,False,True,0.8857142857142857 +10.5194/gmd-2018-20,10.5194/gmd-11-2813-2018,True,True,1.0 +10.1101/2020.08.12.248005,10.7554/elife.57436,True,True,0.9803921568627452 +10.1101/173146,10.1099/mgen.0.000166,False,True,1.0 +10.2196/preprints.23254,10.2196/23254,False,True,1.0 +10.5194/gmd-2017-263,10.5194/gmd-11-3187-2018,True,True,1.0 +10.2196/preprints.24006,10.2196/24006,False,True,1.0 +10.1101/2021.02.08.21251234,10.1093/ajcn/nqab276,False,True,1.0 +10.5194/acp-2016-770,10.5194/acp-17-7067-2017,True,True,1.0 +10.5194/cpd-11-3187-2015,10.5194/cp-12-91-2016,True,True,1.0 +10.2196/preprints.14369,10.2196/14369,False,True,1.0 +10.5194/mr-2023-2,10.5194/mr-4-153-2023,True,True,0.9080459770114944 +10.2196/preprints.19018,10.2196/19018,False,True,1.0 +10.5194/gmd-2017-206,10.5194/gmd-11-2975-2018,True,True,0.9471620227038184 +10.1101/481507,10.1096/fj.201902811rr,False,True,0.9894179894179894 +10.21203/rs.3.rs-57499/v1,10.1186/s10020-020-00230-x,False,True,0.9711286089238844 +10.21203/rs.3.rs-57499/v2,10.1186/s10020-020-00230-x,False,True,1.0 +10.1101/151522,10.1016/j.jneumeth.2017.08.033,False,True,1.0 +10.5194/acpd-7-6767-2007,10.5194/acp-7-4553-2007,True,True,1.0 +10.26434/chemrxiv.7322330.v1,10.1021/jacs.8b13127,False,True,0.92018779342723 +10.1101/2020.02.19.955609,10.1177/2331216520964068,False,True,1.0 +10.5194/amt-2018-209,10.5194/amt-12-977-2019,True,True,1.0 +10.1101/2020.03.09.20033423,10.1002/jia2.25546,False,True,0.9957624290957624 +10.1101/342592,10.1186/s12864-018-5032-z,False,True,0.985685071574642 +10.31234/osf.io/qp4ev,10.1016/j.beth.2019.09.005,False,True,1.0 +10.21203/rs.2.12994/v1,10.1186/s12879-020-4873-7,False,True,0.9984567901234568 
+10.21203/rs.2.12994/v2,10.1186/s12879-020-4873-7,False,True,0.9984567901234568 +10.21203/rs.2.12994/v3,10.1186/s12879-020-4873-7,False,True,0.9984567901234568 +10.21203/rs.2.12994/v4,10.1186/s12879-020-4873-7,False,True,0.9984567901234568 +10.20944/preprints202011.0543.v1,10.3390/pathogens9121037,False,True,0.9727626459143968 +10.21203/rs.3.rs-76084/v1,10.1186/s12960-021-00558-3,False,True,1.0 +10.1101/776237,10.7554/elife.70469,True,True,1.0 +10.1101/2021.10.03.462935,10.7554/elife.74183,True,True,0.9227481919789612 +10.5194/tc-2016-250,10.5194/tc-11-949-2017,True,True,1.0 +10.1101/192245,10.1162/jocn_a_01200,False,True,1.0 +10.2196/preprints.15960,10.2196/15960,False,True,1.0 +10.2196/preprints.22795,10.2196/22795,False,True,1.0 +10.5194/hessd-8-8291-2011,10.5194/hess-16-167-2012,True,True,1.0 +10.5194/gmd-2016-87,10.5194/gmd-9-3655-2016,True,True,0.9993412384716732 +10.1101/2020.02.17.952895,10.3389/fcimb.2020.00405,False,True, +10.1101/2021.02.15.21251449,10.1093/ajcn/nqab279,False,True,1.0 +10.5194/essd-2020-280,10.5194/essd-13-2995-2021,True,True,0.98989898989899 +10.21203/rs.3.rs-23615/v3,10.1186/s12985-020-01451-6,False,True,1.0 +10.21203/rs.3.rs-23615/v2,10.1186/s12985-020-01451-6,False,True,1.0 +10.21203/rs.3.rs-23615/v1,10.1186/s12985-020-01451-6,False,True,1.0 +10.5194/gmd-2018-123,10.5194/gmd-11-4843-2018,True,True,0.9679545950665794 +10.2196/preprints.43101,10.2196/43101,False,True,1.0 +10.5194/angeo-2019-65,10.5194/angeo-37-689-2019,True,True,1.0 +10.5194/os-2020-66,10.5194/os-17-59-2021,True,True,1.0 +10.5194/acp-2022-410,10.5194/acp-23-1963-2023,True,True,0.9946524064171124 +10.21203/rs.3.rs-42553/v2,10.1186/s13756-020-00864-w,False,True,1.0 +10.21203/rs.3.rs-42553/v1,10.1186/s13756-020-00864-w,False,True,0.9852941176470588 +10.20944/preprints201905.0040.v1,10.3390/ijms20112780,False,True,1.0 +10.1101/2020.03.17.20037515,10.1093/cid/ciaa443,False,True,0.9732868757259 
+10.20944/preprints202002.0288.v1,10.3390/brainsci10030143,False,True,0.9984848484848484 +10.20944/preprints202007.0130.v1,10.3390/biomedicines8080275,False,True,1.0 +10.1101/128645,10.1002/hbm.23843,False,True,0.987962962962963 +10.1101/419994,10.1111/jfb.13989,False,True, +10.1101/2020.03.20.000000,10.1096/fj.202001281rr,False,True,0.9926091269841272 +10.26434/chemrxiv-2022-c1ctc-v2,10.1039/d2sc05997e,False,True,1.0 +10.2196/preprints.12664,10.2196/12664,False,True,1.0 +10.5194/bg-2018-477,10.5194/bg-16-2147-2019,True,True,1.0 +10.21034/wp.75,10.2307/1991332,False,True,1.0 +10.1101/2021.12.13.472383,10.7554/elife.78092,True,True,1.0 +10.1101/105874,10.1016/j.neuroimage.2017.04.063,False,True,1.0 +10.1101/2022.10.25.513707,10.7554/elife.83908,True,False,0.9799631120053656 +10.26434/chemrxiv.12587537,10.1021/acs.jctc.0c00715,False,True,1.0 +10.26434/chemrxiv.12587537.v1,10.1021/acs.jctc.0c00715,False,True,1.0 +10.1101/378497,10.1093/nar/gkz169,False,True,1.0 +10.31223/osf.io/3mjc2,10.1016/j.precamres.2020.105849,False,True,0.9765684051398336 +10.1101/439687,10.1002/2211-5463.12744,False,True,0.987987987987988 +10.26434/chemrxiv-2023-6tgkh,10.1021/acs.jcim.3c00732,False,True,1.0 +10.5194/gmd-2020-179,10.5194/gmd-13-6077-2020,True,True,0.9718076285240466 +10.2196/preprints.22564,10.2196/22564,False,True,1.0 +10.1101/617019,10.3390/genes10060468,False,True,0.8909691867586605 +10.1101/068346,10.1371/journal.pcbi.1005260,False,True,0.8888888888888888 +10.1101/2020.08.15.252494,10.1161/atvbaha.120.315556,False,True,0.915073340051506 +10.1101/612010,10.3390/su11102787,False,True,0.9824561403508772 +10.1101/2020.03.28.013672,10.1016/j.devcel.2020.05.012,False,True,0.9743589743589745 +10.1101/122945,10.1162/netn_a_00031,False,True,1.0 +10.1101/2020.12.16.423042,10.1007/s10334-022-01033-3,False,True,1.0 +10.1101/2021.07.20.453033,10.7554/elife.73348,True,True,1.0 +10.1101/2021.03.22.21254119,10.1021/acs.jproteome.1c00326,False,True,1.0 
+10.1101/208223,10.1038/s41380-018-0023-7,False,True,1.0 +10.1101/354829,10.1002/hbm.24788,False,True,0.9956140350877192 +10.1101/501221,10.1523/jneurosci.0601-21.2021,False,True,1.0 +10.31219/osf.io/y6mkh,10.1111/nous.12265,False,True, +10.5194/bgd-11-7615-2014,10.5194/bg-11-6323-2014,True,True,1.0 +10.1101/641159,10.1038/s41593-021-00821-9,False,True,1.0 +10.5194/hessd-11-1343-2014,10.5194/hess-18-3259-2014,True,True,0.9955555555555556 +10.26434/chemrxiv-2021-rg4wj-v2,10.1016/j.jcis.2022.07.164,False,True,0.983974358974359 +10.21203/rs.3.rs-515297/v1,10.1016/j.bbamem.2021.183794,False,True,0.9827586206896552 +10.1101/2020.05.06.081562,10.1021/acs.jpcb.0c04139,False,True,1.0 +10.1101/2020.02.17.952457,10.1371/journal.pbio.3000687,False,True,0.9978213507625272 +10.21203/rs.3.rs-31943/v4,10.1186/s12876-020-01553-z,False,True,1.0 +10.21203/rs.3.rs-31943/v3,10.1186/s12876-020-01553-z,False,True,1.0 +10.21203/rs.3.rs-31943/v2,10.1186/s12876-020-01553-z,False,True,1.0 +10.21203/rs.3.rs-31943/v1,10.1186/s12876-020-01553-z,False,True,0.9862258953168044 +10.1101/067876,10.15252/embj.201696038,False,True,0.8719135802469135 +10.1101/2020.06.24.169334,10.1021/acscentsci.1c01293,False,True,0.9753872555660932 +10.1101/138834,10.3389/fncel.2017.00214,False,True,1.0 +10.21203/rs.3.rs-70874/v1,10.1016/j.wasman.2022.01.022,False,True,0.9464007899260638 +10.5194/essd-2017-134,10.5194/essd-10-985-2018,True,True,1.0 +10.5194/hess-2021-68,10.5194/hess-25-6495-2021,True,True,0.9129097148266476 +10.20944/preprints201612.0138.v1,10.3390/ma10030297,False,True,1.0 +10.1101/505032,10.1038/s41592-020-01023-0,False,True,0.996078431372549 +10.1101/671230,10.1038/s41556-020-0485-0,False,True,1.0 +10.21203/rs.3.rs-66113/v1,10.1186/s13287-020-02056-0,False,True,0.9678362573099416 +10.21203/rs.3.rs-66113/v2,10.1186/s13287-020-02056-0,False,True,1.0 +10.21203/rs.3.rs-66113/v3,10.1186/s13287-020-02056-0,False,True,1.0 +10.1101/038117,10.1038/nmeth.3991,False,True,0.9095238095238096 
+10.20944/preprints201808.0322.v1,10.3390/molecules23102549,False,True,1.0 +10.1101/2020.07.30.228924,10.1038/s41592-021-01136-0,False,True,1.0 +10.21203/rs.3.rs-904665/v1,10.1038/s41556-022-00953-5,False,True,0.9195027195027196 +10.2196/preprints.22488,10.2196/22488,False,True,1.0 +10.1101/2022.03.04.483005,10.7554/elife.78385,True,False,0.9462465245597776 +10.1101/848846,10.1182/blood.2020004801,False,True,0.9846216768916156 +10.2196/preprints.10755,10.2196/10755,False,True,1.0 +10.1101/057976,10.1093/bioinformatics/btw390,False,True,0.9118723052546582 +10.5194/tc-2018-175,10.5194/tc-13-895-2019,True,True,0.9646464646464646 +10.5194/esurf-2020-59,10.5194/esurf-9-1153-2021,True,True,0.9936507936507936 +10.5194/cp-2017-151,10.5194/cp-14-1079-2018,True,True,1.0 +10.1101/2020.04.07.029140,10.1371/journal.ppat.1008530,False,True,1.0 +10.2196/preprints.26309,10.2196/26309,False,True,1.0 +10.1101/2021.10.14.464354,10.7554/elife.74565,True,True,1.0 +10.1101/860874,10.1088/1741-2552/ab9dba,False,True,1.0 +10.1101/207076,10.1167/18.6.10,False,True,1.0 +10.1101/376863,10.1152/jn.00680.2018,False,True,0.9885129490392648 +10.5194/bg-2019-482,10.5194/bg-17-4247-2020,True,True,0.9792843691148776 +10.2196/preprints.13802,10.2196/13802,False,True,1.0 +10.2196/preprints.17740,10.2196/17740,False,True,1.0 +10.2196/preprints.11334,10.2196/11334,False,True,1.0 +10.31235/osf.io/hfr96,10.1038/nclimate3271,False,True,0.9826839826839828 +10.1101/2021.04.30.442171,10.1523/jneurosci.1575-21.2022,False,True,1.0 +10.1101/228668,10.1099/mgen.0.000165,False,True,0.9875222816399286 +10.5194/nhess-2017-152,10.5194/nhess-17-2199-2017,True,True,0.9588652482269504 +10.5194/amt-2019-282,10.5194/amt-13-323-2020,True,True,0.9693251533742332 +10.5194/acpd-12-24847-2012,10.5194/acp-13-3345-2013,True,True,1.0 +10.5194/hessd-9-9809-2012,10.5194/hess-17-3127-2013,True,True,0.9061032863849764 +10.5194/bgd-11-14699-2014,10.5194/bg-12-863-2015,True,True,0.9968253968253968 
+10.2196/preprints.24851,10.2196/24851,False,True,1.0 +10.1101/263939,10.1038/nbt.4266,False,True, +10.1101/2020.05.05.078196,10.1002/brb3.1786,False,True,0.9953703703703703 +10.1101/191809,10.1002/pld3.47,False,True,0.96640826873385 +10.26434/chemrxiv.10003412.v1,10.1021/acs.jctc.9b01066,False,True,1.0 +10.20944/preprints201706.0002.v1,10.3390/e19060286,False,True,1.0 +10.31235/osf.io/fw4er,10.1093/aje/kwy218,False,True,1.0 +10.1101/191494,10.1111/ejn.13816,False,True,1.0 +10.5194/acp-2021-784,10.5194/acp-22-1951-2022,True,True,0.9819277108433736 +10.1101/2020.06.29.20143180,10.1371/journal.pone.0242758,False,True,1.0 +10.2196/preprints.25456,10.2196/25456,False,True,1.0 +10.5194/cpd-9-3239-2013,10.5194/cp-10-487-2014,True,True,0.9954954954954954 +10.1002/essoar.10510350.1,10.1029/2022gl098158,False,True,0.9988505747126436 +10.1101/844712,10.1371/journal.pone.0237189,False,True,1.0 +10.5194/se-2017-35,10.5194/se-8-789-2017,True,True,0.9942726231386024 +10.1101/111070,10.1037/xlm0000518,False,True,0.9186480453521352 +10.5194/se-2021-6,10.5194/se-12-2523-2021,True,True,0.9875311720698252 +10.1101/158113,10.1371/journal.pone.0214311,False,True,1.0 +10.1101/2021.05.06.21256789,10.1213/ane.0000000000005730,False,True,0.9975490196078431 +10.1101/867168,10.1016/j.jneumeth.2020.108756,False,True,0.875 +10.1101/661207,10.3389/fmicb.2019.02558,False,True,1.0 +10.1101/391243,10.1534/g3.118.200662,False,True,0.9444444444444444 +10.2196/preprints.44548,10.2196/44548,False,True,1.0 +10.1101/101535,10.1098/rsos.171308,False,True,0.9969418960244648 +10.5194/bg-2018-430,10.5194/bg-16-1225-2019,True,True,1.0 +10.1101/534206,10.1093/nar/gkz306,False,True,0.9427618157089428 +10.2196/preprints.18662,10.2196/18662,False,True,1.0 +10.5194/nhess-2021-31,10.5194/nhess-21-1759-2021,True,True,1.0 +10.21203/rs.3.rs-60829/v1,10.1186/s13046-020-01796-4,False,True,0.9891156462585036 +10.21203/rs.3.rs-60829/v2,10.1186/s13046-020-01796-4,False,True,0.9986772486772488 
+10.1101/2020.06.12.20127944,10.1001/jama.2020.15580,False,True,0.978494623655914 +10.20944/preprints201801.0107.v1,10.3390/nu10020238,False,True,1.0 +10.5194/hess-2016-351,10.5194/hess-21-1741-2017,True,True,0.9983579638752051 +10.5194/essd-2020-303,10.5194/essd-13-3337-2021,True,True,1.0 +10.1101/2021.10.07.463355,10.1021/acssensors.1c02201,False,True,1.0 +10.5194/hess-2021-506,10.5194/hess-26-2899-2022,True,True,0.912 +10.1101/219113,10.1016/j.sbi.2018.01.009,False,True,1.0 +10.1101/632810,10.3390/cancers12061568,False,True,0.9866666666666668 +10.2196/preprints.41446,10.2196/41446,False,True,1.0 +10.21203/rs.3.rs-2209582/v1,10.1007/s13146-023-00880-y,False,True,1.0 +10.21203/rs.3.rs-2440941/v1,10.1007/s13146-023-00882-w,False,True,1.0 +10.21203/rs.3.rs-2597108/v1,10.1007/s10238-023-01049-6,False,True,0.9743589743589745 +10.1101/2020.05.27.119438,10.1371/journal.pcbi.1008625,False,True,0.9843400447427294 +10.20944/preprints201902.0019.v1,10.3390/a12030060,False,True,1.0 +10.1101/816694,10.1182/bloodadvances.2019001393,False,True,0.9716981132075472 +10.5194/acp-2016-430,10.5194/acp-17-11041-2017,True,True,0.9962546816479402 +10.31234/osf.io/hv28a,10.1037/pspa0000098,False,True,1.0 +10.20944/preprints201612.0042.v1,10.3390/ijms18020347,False,True,0.9297052154195012 +10.1101/2019.12.15.876847,10.1093/sleep/zsaa111,False,True,1.0 +10.5194/acpd-14-25533-2014,10.5194/acp-15-4179-2015,True,True,1.0 +10.5194/acp-2016-308,10.5194/acp-16-12397-2016,True,True,1.0 +10.1101/235176,10.1038/s41592-018-0171-3,False,True,0.9691282491742363 +10.21034/wp.730,10.1257/aer.20121524,False,True,1.0 +10.1101/636803,10.1002/ece3.6313,False,True,0.9895833333333334 +10.5194/cpd-5-1367-2009,10.5194/cp-5-585-2009,True,True,0.989010989010989 +10.36227/techrxiv.21758660,10.1109/tim.2023.3256468,False,True,1.0 +10.36227/techrxiv.21758660.v1,10.1109/tim.2023.3256468,False,True,1.0 +10.31234/osf.io/y27vc,10.1080/13548506.2017.1385818,False,True,1.0 
+10.5194/cp-2017-26,10.5194/cp-13-1007-2017,True,True,1.0 +10.5194/acpd-15-12007-2015,10.5194/acp-15-11861-2015,True,True,1.0 +10.5194/wes-2018-49,10.5194/wes-3-845-2018,True,True,1.0 +10.31219/osf.io/cv2bn,10.3758/s13428-018-1035-6,False,True,1.0 +10.1101/803346,10.1016/j.ajhg.2020.06.010,False,True,0.9377207977207976 +10.36227/techrxiv.21674759.v1,10.1109/ojcoms.2023.3282814,False,True,1.0 +10.36227/techrxiv.21674759,10.1109/ojcoms.2023.3282814,False,True,1.0 +10.31219/osf.io/bwm4k,10.3390/ma14051106,False,True,1.0 +10.1101/084418,10.15252/msb.20188497,False,True,1.0 +10.5194/egusphere-2022-180,10.5194/se-13-1755-2022,True,True,0.9826224328593997 +10.1101/537001,10.1016/j.cell.2019.07.038,False,True, +10.21203/rs.3.rs-136528/v1,10.1186/s13287-021-02223-x,False,True,0.9743589743589745 +10.1101/640557,10.1111/oik.06957,False,True,0.9883190883190884 +10.21034/sr.361,10.1257/mac.1.1.146,False,True, +10.26434/chemrxiv.8289812.v1,10.1021/acsnano.9b06019,False,True,0.9314194577352471 +10.5194/amtd-2-489-2009,10.5194/amt-2-379-2009,True,True,1.0 +10.1101/2020.08.27.267880,10.1186/s13229-022-00511-8,False,True,0.9561904761904764 +10.5194/acp-2019-1026,10.5194/acp-20-8727-2020,True,True,0.983606557377049 +10.5194/se-2019-49,10.5194/se-10-987-2019,True,True,1.0 +10.5194/acp-2020-263,10.5194/acp-21-1697-2021,True,True,0.9936073059360732 +10.21034/sr.186,10.1007/bf01213946,False,True,0.974910394265233 +10.5194/acpd-8-8009-2008,10.5194/acp-8-6169-2008,True,True,1.0 +10.5194/acpd-7-10799-2007,10.5194/acp-8-901-2008,True,True,1.0 +10.5194/acpd-11-8665-2011,10.5194/acp-11-6207-2011,True,True,0.9209742194584792 +10.5194/acpd-6-9003-2006,10.5194/acp-7-685-2007,True,True,1.0 +10.5194/acpd-14-19791-2014,10.5194/acp-15-253-2015,True,True,1.0 +10.5194/acpd-4-4545-2004,10.5194/acp-4-2227-2004,True,True,1.0 +10.1101/863621,10.21105/joss.01994,False,True,0.927811176648518 +10.5194/acpd-4-2569-2004,10.5194/acp-4-1895-2004,True,True,1.0 
+10.2196/preprints.9633,10.2196/resprot.9633,False,True,1.0 +10.2196/preprints.19601,10.2196/19601,False,True,1.0 +10.5194/bg-2019-237,10.5194/bg-17-215-2020,True,True,0.9548387096774192 +10.20944/preprints202103.0379.v1,10.3390/genes12040544,False,True,1.0 +10.21203/rs.3.rs-72276/v1,10.1186/s13643-021-01652-2,False,True,0.978593272171254 +10.5194/acpd-13-20677-2013,10.5194/acp-14-1423-2014,True,True,1.0 +10.5194/acpd-10-10219-2010,10.5194/acp-10-7169-2010,True,True,1.0 +10.1101/2021.09.06.21263001,10.1111/nmo.14331,False,True,0.942927545452176 +10.1101/230938,10.1016/j.yjmcc.2018.06.007,False,True,0.9944444444444444 +10.5194/gchron-2020-11,10.5194/gchron-3-181-2021,True,True,1.0 +10.5194/hessd-8-4459-2011,10.5194/hess-15-2581-2011,True,True,1.0 +10.1101/2020.04.05.026005,10.1016/j.molliq.2020.113612,False,True,0.9987029831387808 +10.1101/2022.12.07.519455,10.7554/elife.85069,True,False, +10.20944/preprints202003.0433.v1,10.1016/j.micpath.2020.104236,False,True,1.0 +10.5194/acp-2016-332,10.5194/acp-16-13185-2016,True,True,1.0 +10.5194/amt-2020-257,10.5194/amt-14-945-2021,True,True,0.9913644214162348 +10.20944/preprints202010.0453.v1,10.3390/ani10122196,False,True,0.9775910364145658 +10.20944/preprints202010.0453.v2,10.3390/ani10122196,False,True,0.9716981132075472 +10.21034/wp.741,10.1257/aer.20181499,False,True,1.0 +10.1101/2020.04.25.20079996,10.3389/fpsyg.2020.551004,False,True,0.9986504723346828 +10.1101/443127,10.1007/s00415-019-09340-x,False,True,1.0 +10.5194/essd-2022-16,10.5194/essd-14-3743-2022,True,True,1.0 +10.5194/soil-2017-28,10.5194/soil-4-37-2018,True,True,0.9715242881072026 +10.5194/hess-2016-323,10.5194/hess-21-1149-2017,True,True,0.9643605870020964 +10.1101/511683,10.1186/s40168-019-0665-y,False,True,0.9893444246385422 +10.1101/088666,10.7717/peerj.3889,False,True, +10.1101/2021.03.28.21254404,10.1021/acs.estlett.1c00375,False,True,0.959078814570144 +10.5194/gmd-2017-103,10.5194/gmd-11-257-2018,True,True,0.9954415954415956 
+10.5194/acpd-8-21229-2008,10.5194/acp-9-5905-2009,True,True,0.9964912280701754 +10.5194/acpd-4-3699-2004,10.5194/acp-4-2337-2004,True,True,0.9767441860465116 +10.20944/preprints202010.0447.v1,10.3390/cancers12123524,False,True,1.0 +10.5194/bgd-10-19509-2013,10.5194/bg-11-2069-2014,True,True,0.9797979797979798 +10.5194/acpd-13-18345-2013,10.5194/acp-13-12271-2013,True,True,0.9987515605493132 +10.1101/296061,10.1016/j.dcn.2018.09.003,False,True,1.0 +10.21034/sr.249,10.1080/07474939908800428,False,True,0.988155668358714 +10.21203/rs.3.rs-93388/v1,10.1186/s12872-020-01827-0,False,True,1.0 +10.2196/preprints.8954,10.2196/jmir.8954,False,True,1.0 +10.5194/bgd-7-3335-2010,10.5194/bg-7-2613-2010,True,True,0.9688888888888888 +10.1002/essoar.10506462.1,10.1029/2021jc017734,False,True,0.9967320261437908 +10.5194/acpd-2-2209-2002,10.5194/acp-3-417-2003,True,True,0.9904761904761904 +10.5194/acpd-8-18727-2008,10.5194/acp-9-5489-2009,True,True,1.0 +10.5194/tc-2020-164,10.5194/tc-15-1097-2021,True,True,1.0 +10.5194/acpd-13-10621-2013,10.5194/acp-14-765-2014,True,True,0.9565217391304348 +10.21203/rs.3.rs-507826/v1,10.1007/s10924-021-02297-x,False,True,0.9696969696969696 +10.5194/acp-2018-209,10.5194/acp-18-12207-2018,True,True,0.9986504723346828 +10.1101/2021.04.19.440546,10.1038/s42003-021-02874-9,False,True,1.0 +10.5194/acpd-5-509-2005,10.5194/acp-5-1557-2005,True,True,1.0 +10.26434/chemrxiv.13513731.v2,10.1021/acs.jpca.1c02872,False,True, +10.5194/acpd-6-3135-2006,10.5194/acp-6-3377-2006,True,True,1.0 +10.5194/bgd-11-7991-2014,10.5194/bg-11-6173-2014,True,True,0.9916161616161616 +10.5194/gchron-2019-3,10.5194/gchron-1-17-2019,True,True,0.9760765550239232 +10.1101/430447,10.15252/embj.2019103667,False,True, +10.5194/acpd-9-16549-2009,10.5194/acp-10-431-2010,True,True,1.0 +10.5194/bg-2016-101,10.5194/bg-13-4491-2016,True,True,0.9313034188034188 +10.1101/2020.05.17.100255,10.1016/j.cortex.2020.09.004,False,True,1.0 +10.5194/acpd-3-5139-2003,10.5194/acp-4-391-2004,True,True,1.0 
+10.5194/bg-2016-357,10.5194/bg-14-2781-2017,True,True,1.0 +10.5194/acp-2020-91,10.5194/acp-20-8641-2020,True,True,0.9826224328593997 +10.5194/hess-2019-600,10.5194/hess-24-4413-2020,True,True, +10.1101/476960,10.15252/embj.2018101153,False,True,0.973765903307888 +10.5194/cpd-10-3327-2014,10.5194/cp-11-327-2015,True,True,1.0 +10.5194/acpd-10-12713-2010,10.5194/acp-10-9039-2010,True,True,0.9947916666666666 +10.5194/acpd-9-24587-2009,10.5194/acp-10-5573-2010,True,True,0.9382716049382716 +10.5194/tc-2016-161,10.5194/tc-10-2981-2016,True,True,1.0 +10.26434/chemrxiv-2022-kgxfk-v2,10.1016/j.eml.2022.101929,False,True,1.0 +10.26434/chemrxiv.8061650.v1,10.1021/acs.chemmater.9b03267,False,True,0.9822281959378736 +10.5194/tc-2019-293,10.5194/tc-14-2775-2020,True,True,1.0 +10.5194/tcd-2-111-2008,10.5194/tc-2-95-2008,True,True,0.9919678714859438 +10.26434/chemrxiv.7322183.v1,10.1021/acs.jctc.8b01041,False,True,1.0 +10.21203/rs.3.rs-2241246/v1,10.1016/j.resconrec.2023.106873,False,True,1.0 +10.26434/chemrxiv.7851587.v1,10.1021/acs.jpcc.8b11092,False,True,1.0 +10.20944/preprints201609.0095.v1,10.1007/s11356-016-8321-6,False,True,0.9662618083670717 +10.5194/bg-2016-172,10.5194/bg-14-597-2017,True,True,1.0 +10.20944/preprints202006.0275.v1,10.3855/jidc.13692,False,True,0.9977324263038548 +10.5194/tc-2021-382,10.5194/tc-16-3313-2022,True,True,1.0 +10.5194/sed-5-789-2013,10.5194/se-4-373-2013,True,True,0.9851380042462846 +10.2196/preprints.19159,10.2196/19159,False,True,1.0 +10.5194/amt-2021-90,10.5194/amt-14-5625-2021,True,True,1.0 +10.21203/rs.3.rs-206773/v1,10.1186/s43058-021-00128-7,False,True,1.0 +10.2196/preprints.10078,10.2196/10078,False,True,1.0 +10.1101/2020.07.06.190314,10.3390/metabo10120488,False,True,1.0 +10.5194/essd-2018-3,10.5194/essd-10-1427-2018,True,True,1.0 +10.7287/peerj.preprints.2795v1,10.7717/peerj.3500,False,True,1.0 +10.31227/osf.io/kxdf6,10.22216/jen.v2i3.2357,False,True,1.0 +10.2196/preprints.40038,10.2196/40038,False,True,1.0 
+10.1101/392761,10.1371/journal.pone.0223183,False,True,1.0 +10.2196/preprints.39264,10.2196/39264,False,True,1.0 +10.2196/preprints.33793,10.2196/33793,False,True,1.0 +10.21203/rs.2.12491/v2,10.1186/s13063-019-3833-2,False,True,0.9719974309569684 +10.21203/rs.2.12491/v1,10.1186/s13063-019-3833-2,False,True,0.9645951035781544 +10.21203/rs.3.rs-61509/v1,10.1186/s12944-020-01428-y,False,True,1.0 +10.21203/rs.3.rs-61509/v4,10.1186/s12944-020-01428-y,False,True,1.0 +10.21203/rs.3.rs-61509/v5,10.1186/s12944-020-01428-y,False,True,1.0 +10.21203/rs.3.rs-61509/v2,10.1186/s12944-020-01428-y,False,True,1.0 +10.21203/rs.3.rs-61509/v3,10.1186/s12944-020-01428-y,False,True,1.0 +10.5194/acpd-11-24813-2011,10.5194/acp-12-5429-2012,True,True,0.9028871391076116 +10.5194/amtd-4-3055-2011,10.5194/amt-4-1593-2011,True,True,1.0 +10.21203/rs.3.rs-58058/v2,10.1186/s12909-021-02570-6,False,True,0.9968253968253968 +10.21203/rs.3.rs-58058/v1,10.1186/s12909-021-02570-6,False,True,1.0 +10.20944/preprints201802.0069.v1,10.3390/f9030100,False,True,0.9824561403508772 +10.21203/rs.2.15987/v1,10.1186/s12887-019-1863-2,False,True,1.0 +10.21203/rs.2.15987/v2,10.1186/s12887-019-1863-2,False,True,1.0 +10.21203/rs.2.15987/v3,10.1186/s12887-019-1863-2,False,True,1.0 +10.2196/preprints.12797,10.2196/12797,False,True,1.0 +10.20944/preprints201907.0118.v1,10.3390/ijerph16162815,False,True,1.0 +10.2196/preprints.11824,10.2196/11824,False,True,1.0 +10.5194/hess-2020-46,10.5194/hess-24-5015-2020,True,True,1.0 +10.31219/osf.io/w9unj,10.32520/jtp.v8i2.941,False,True, +10.20944/preprints201908.0123.v1,10.15517/rbt.v68i1.38555,False,True,0.925004016451385 +10.5194/amtd-5-8579-2012,10.5194/amt-6-1359-2013,True,True,0.9941520467836256 +10.5194/acpd-15-19045-2015,10.5194/acp-16-7681-2016,True,True,0.9204142368936375 +10.20944/preprints201804.0244.v1,10.3390/min8050192,False,True,0.9482758620689654 +10.31235/osf.io/7t6w3,10.18523/kmlpj153255.2018-4.99-118,False,True,1.0 
+10.5194/essd-2021-239,10.5194/essd-14-3915-2022,True,True,0.996078431372549 +10.5194/acp-2019-639,10.5194/acp-20-4445-2020,True,True,1.0 +10.2196/preprints.14501,10.2196/14501,False,True,1.0 +10.21203/rs.3.rs-132353/v1,10.1186/s13018-020-02191-7,False,True,0.992248062015504 +10.2196/preprints.10665,10.2196/10665,False,True,1.0 +10.5194/gmd-2021-395,10.5194/gmd-15-7557-2022,True,True,0.9958333333333332 +10.5194/acpd-10-10969-2010,10.5194/acp-10-8669-2010,True,False,1.0 +10.5194/amt-2019-481,10.5194/amt-13-3661-2020,True,True,1.0 +10.5194/wes-2020-51,10.5194/wes-5-855-2020,True,True,1.0 +10.5194/npg-2020-4,10.5194/npg-27-391-2020,True,True,1.0 +10.1101/2020.01.09.900050,10.1371/journal.pone.0228121,False,True,1.0 +10.5194/nhess-2016-46,10.5194/nhess-16-1807-2016,True,True,1.0 +10.31219/osf.io/j67kq,10.31014/aior.1991.03.04.241,False,True, +10.31235/osf.io/wsh64,10.31014/aior.1991.03.04.241,False,True, +10.5194/acp-2021-207,10.5194/acp-21-13119-2021,True,True,1.0 +10.21034/wp.415,10.1080/07350015.1990.10509768,False,True, +10.1101/670257,10.1016/j.nlm.2020.107225,False,True,1.0 +10.2196/preprints.18258,10.2196/18258,False,True,1.0 +10.5194/amtd-6-3545-2013,10.5194/amt-6-1981-2013,True,True,1.0 +10.31220/osf.io/pg3v9,10.29255/aksara.v31i2.364.251-268,False,True,1.0 +10.33767/osf.io/y4s3w,10.7560/vlt8102,False,True, +10.20944/preprints202010.0346.v1,10.3390/biom10111564,False,True,1.0 +10.1101/056044,10.1186/s12918-016-0380-2,False,True,1.0 +10.5194/angeo-2019-119,10.5194/angeo-38-467-2020,True,True,1.0 +10.31219/osf.io/vmu6q,10.24269/ars.v6i1.780,False,True,0.8888888888888888 +10.20944/preprints201805.0072.v1,10.3390/electronics7060079,False,True,1.0 +10.31224/osf.io/8s59e,10.1504/ijvp.2017.081276,False,True,0.9213085764809904 +10.20944/preprints201807.0061.v1,10.3390/mti2030044,False,True,1.0 +10.5194/gmd-2016-315,10.5194/gmd-10-1927-2017,True,True,1.0 +10.5194/amt-2019-252,10.5194/amt-13-1735-2020,True,True,1.0 
+10.21203/rs.3.rs-41396/v2,10.1186/s12882-020-02158-0,False,True,1.0 +10.21203/rs.3.rs-41396/v1,10.1186/s12882-020-02158-0,False,True,1.0 +10.21203/rs.3.rs-41396/v3,10.1186/s12882-020-02158-0,False,True,1.0 +10.5194/gc-2021-26,10.5194/gc-5-101-2022,True,True,0.9716312056737588 +10.21203/rs.3.rs-104730/v1,10.1186/s12957-021-02152-2,False,True,0.9976359338061466 +10.21203/rs.3.rs-104730/v2,10.1186/s12957-021-02152-2,False,True,0.9976359338061466 +10.1101/634006,10.1093/nargab/lqaa022,False,True,1.0 +10.1101/481952,10.2174/1568026619666181220111059,False,True,0.9923664122137404 +10.21203/rs.3.rs-127854/v1,10.1186/s13019-021-01444-8,False,True,1.0 +10.2196/preprints.17064,10.2196/17064,False,True,1.0 +10.20944/preprints202102.0539.v1,10.3390/molecules26061667,False,True,0.9855072463768116 +10.20944/preprints201701.0068.v1,10.3390/su9010122,False,True,1.0 +10.31219/osf.io/byjhc,10.25046/aj0505120,False,True, +10.20944/preprints202005.0163.v1,10.1186/s41205-020-00086-1,False,True,0.9867724867724867 +10.2196/preprints.12968,10.2196/12968,False,True,1.0 +10.1101/2020.06.26.169458,10.3390/ijms21217980,False,True,0.9946236559139784 +10.2196/preprints.11219,10.2196/11219,False,True,1.0 +10.5194/egusphere-2022-682,10.5194/bg-19-5617-2022,True,True,1.0 +10.21203/rs.2.11941/v2,10.1186/s13104-019-4593-5,False,True,1.0 +10.21203/rs.2.11941/v1,10.1186/s13104-019-4593-5,False,True,1.0 +10.21203/rs.2.11941/v3,10.1186/s13104-019-4593-5,False,True,1.0 +10.1101/460337,10.1093/cercor/bhaa081,False,True,0.9984917043740572 +10.1101/199687,10.1016/j.neuropsychologia.2018.06.010,False,True,1.0 +10.1101/19009589,10.1371/journal.pone.0230274,False,True,1.0 +10.5194/acp-2017-666,10.5194/acp-18-3779-2018,True,True,0.9942857142857144 +10.1101/149716,10.1016/j.ymben.2017.11.011,False,True,0.977777777777778 +10.1101/2020.04.16.044842,10.1523/jneurosci.0875-20.2020,False,True,1.0 +10.5194/acp-2016-998,10.5194/acp-17-4419-2017,True,True,1.0 
+10.5194/bg-2019-165,10.5194/bg-16-4097-2019,True,True,0.9865591397849464 +10.5194/bg-2017-173,10.5194/bg-15-953-2018,True,True,1.0 +10.5194/acp-2020-875,10.5194/acp-21-10337-2021,True,True,1.0 +10.31224/osf.io/5atbz,10.1016/j.flowmeasinst.2018.07.003,False,True, +10.1101/251843,10.1523/eneuro.0381-18.2018,False,True,0.9583333333333334 +10.21203/rs.3.rs-253126/v1,10.1002/adpr.202100285,False,True,0.989010989010989 +10.20944/preprints201710.0032.v1,10.3390/environments4040088,False,True,0.9069781480140046 +10.20944/preprints201704.0135.v1,10.3390/ijms18050923,False,True,1.0 +10.1101/2022.01.14.476419,10.1111/mec.16469,False,True,1.0 +10.5194/bgd-4-3343-2007,10.5194/bg-5-371-2008,True,True,0.9743589743589745 +10.5194/amt-2017-408,10.5194/amt-11-3251-2018,True,True,1.0 +10.5194/osd-11-1543-2014,10.5194/os-11-187-2015,True,True,1.0 +10.20944/preprints201906.0228.v1,10.3390/cancers11070942,False,True,0.9832134292565948 +10.5194/cp-2016-46,10.5194/cp-12-1829-2016,True,True,1.0 +10.5194/amt-2017-287,10.5194/amt-11-4465-2018,True,True,1.0 +10.1101/2021.01.28.428594,10.1172/jci.insight.147700,False,True,0.9786096256684492 +10.1101/852434,10.1093/nar/gkaa032,False,True,0.9975308641975308 +10.1101/2021.06.21.449154,10.7554/elife.73153,True,True,1.0 +10.5194/angeo-2018-21,10.5194/angeo-36-891-2018,True,True,0.9968847352024922 +10.31230/osf.io/3b2c9,10.3354/meps12774,False,True, +10.1101/2022.05.17.492323,10.7554/elife.78877,True,False,1.0 +10.5194/osd-11-1213-2014,10.5194/os-10-881-2014,True,True,0.9857295482295484 +10.5194/acp-2020-1095,10.5194/acp-21-5289-2021,True,True,1.0 +10.5194/egusphere-2022-481,10.5194/os-18-1665-2022,True,True,1.0 +10.5194/amt-2020-348,10.5194/amt-14-5349-2021,True,True,0.9696969696969696 +10.31223/osf.io/5wakg,10.1111/j.1365-246x.2006.03017.x,False,True, +10.5194/amt-2020-28,10.5194/amt-13-6559-2020,True,True,1.0 +10.5194/tc-2019-28,10.5194/tc-13-3337-2019,True,True,1.0 +10.1101/2020.12.17.423361,10.7554/elife.66194,True,True,1.0 
"""Command-line entry point that fetches positive samples and saves the dataset."""

import argparse
import os

# Tokens accepted by -r/--random. The empty string is kept as "false" because
# the previous ``type=bool`` converter also mapped "" to False.
_TRUE_TOKENS = frozenset({"1", "true", "t", "yes", "y", "on"})
_FALSE_TOKENS = frozenset({"", "0", "false", "f", "no", "n", "off"})


def _str_to_bool(value):
    """Convert a command-line token into a real boolean.

    argparse passes every option value to ``type`` as a string, and
    ``bool("False")`` is ``True`` because any non-empty string is truthy.
    This converter interprets the text instead.

    Args:
        value: Raw string taken from the command line.

    Returns:
        True or False according to the (case-insensitive) token.

    Raises:
        argparse.ArgumentTypeError: If the token is not a recognised boolean,
            which argparse reports as a normal usage error.
    """
    token = value.strip().lower()
    if token in _TRUE_TOKENS:
        return True
    if token in _FALSE_TOKENS:
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def build_parser(default_output):
    """Build the argument parser for this script.

    Args:
        default_output: Path used for ``-o/--output`` when the option is omitted.

    Returns:
        A configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(description="Generate and save a dataset based on the given configuration.")

    parser.add_argument("-s", "--size", type=int, default=10, help="Number of samples to generate.")
    # type=bool would turn "-r False" into True; parse the text explicitly.
    parser.add_argument("-r", "--random", type=_str_to_bool, default=True, help="Whether to sample randomly.")
    parser.add_argument("--seed", type=int, default=42, help="Random seed for reproducibility.")
    parser.add_argument("--full", action="store_true", help="Boolean flag to indicate full dataset mode.")
    # NOTE(review): the help text says CSV but the script's default path ends
    # in .pkl - confirm which format AugmentedDataset.save actually writes.
    parser.add_argument("-o", "--output", type=str, default=default_output, help="Output file path to save the dataset as a CSV.")
    return parser


def main(config):
    """Fetch positive samples into a new dataset and save it to ``config.output``.

    Args:
        config: Namespace-like object providing ``size``, ``random``, ``seed``,
            ``full`` and ``output`` attributes.
    """
    # Explicit import instead of ``import *``; done here so that importing this
    # module or building the parser does not load src.dataset.GoodDataset.
    from src.dataset.GoodDataset import AugmentedDataset

    # Initialize the dataset
    dataset = AugmentedDataset()

    # The return value is not used by this script; presumably the fetched
    # samples are also kept on the dataset instance, which is what save()
    # persists - TODO confirm against AugmentedDataset.
    dataset.fetch_positive_samples_parallel(
        num_samples=config.size,
        random=config.random,
        seed=config.seed,
        full=config.full,
    )

    dataset.save(config.output)


if __name__ == "__main__":
    from src.utils.io_utils import PROJECT_ROOT

    # Parse the arguments and pass them to the main function
    config = build_parser(os.path.join(PROJECT_ROOT, "data/dataset.pkl")).parse_args()
    main(config)
b/logo.png new file mode 100644 index 0000000000000000000000000000000000000000..80ac9df99ad087ad97a6ea81c5acf9edb0775c0a Binary files /dev/null and b/logo.png differ diff --git a/notebooks/.DS_Store b/notebooks/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 Binary files /dev/null and b/notebooks/.DS_Store differ diff --git a/notebooks/1-0-dataset_development.ipynb b/notebooks/1-0-dataset_development.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..f3920020187300295a4967584a4b6f80580929a5 --- /dev/null +++ b/notebooks/1-0-dataset_development.ipynb @@ -0,0 +1,1121 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "from enum import Enum\n", + "from typing import List, Dict, Any\n", + "from dataclasses import dataclass\n", + "from tqdm import tqdm\n", + "\n", + "import os\n", + "import yaml\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "import pyalex\n", + "from pyalex import Works\n", + "from src.utils.io_utils import PROJECT_ROOT\n", + "from src.dataset.Dataset import *" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Configurations" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "@dataclass\n", + "class ConfigAugmentation:\n", + " \"\"\"Configuration for OpenAlex features\"\"\"\n", + " basic: Dict[str, bool] = None # id, doi, title, etc\n", + " source: Dict[str, bool] = None # journal info\n", + " authors: Dict[str, bool] = None # author details\n", + " metrics: Dict[str, bool] = None # citations, fwci, etc\n", + " classification: Dict[str, bool] = None # topics, concepts\n", + " access: Dict[str, bool] = None # OA status\n", + " related_works: Dict[str, bool] = None # references\n", + " abstract: bool = False" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 
Dataset Loading " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# load the dataset \n", + "\n", + "class DatasetType(Enum):\n", + " FULL_RAW = \"full_raw\"\n", + " PARTIAL_RAW = \"partial_raw\"\n", + " FULL_AUGMENTED = \"full_augmented\"\n", + " PARTIAL_AUGMENTED = \"partial_augmented\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "@dataclass\n", + "class Field:\n", + " \"\"\"Field configuration for data extraction\"\"\"\n", + " name: str\n", + " path: List[str]\n", + " default: Any = None\n", + "\n", + "class AlexFields:\n", + " \"\"\"OpenAlex field definitions\"\"\"\n", + " \n", + " BASIC = [\n", + " Field(\"id\", [\"id\"]),\n", + " Field(\"doi\", [\"doi\"]),\n", + " Field(\"title\", [\"title\"]),\n", + " Field(\"display_name\", [\"display_name\"]),\n", + " Field(\"publication_year\", [\"publication_year\"]),\n", + " Field(\"publication_date\", [\"publication_date\"]),\n", + " Field(\"language\", [\"language\"]),\n", + " Field(\"type\", [\"type\"]),\n", + " Field(\"type_crossref\", [\"type_crossref\"])\n", + " ]\n", + " \n", + " SOURCE = [\n", + " Field(\"journal_name\", [\"primary_location\", \"source\", \"display_name\"]),\n", + " Field(\"issn\", [\"primary_location\", \"source\", \"issn\"]),\n", + " Field(\"issn_l\", [\"primary_location\", \"source\", \"issn_l\"]),\n", + " Field(\"publisher\", [\"primary_location\", \"source\", \"host_organization_name\"]),\n", + " Field(\"type\", [\"primary_location\", \"source\", \"type\"])\n", + " ]\n", + "\n", + " METRICS = [\n", + " Field(\"cited_by_count\", [\"cited_by_count\"]),\n", + " Field(\"cited_by_percentile\", [\"citation_normalized_percentile\"]),\n", + " Field(\"is_retracted\", [\"is_retracted\"]),\n", + " Field(\"fwci\", [\"fwci\"]),\n", + " Field(\"referenced_works_count\", [\"referenced_works_count\"])\n", + " ]\n", + "\n", + " ACCESS = [\n", + " Field(\"is_oa\", 
[\"open_access\", \"is_oa\"]),\n", + " Field(\"oa_status\", [\"open_access\", \"oa_status\"]),\n", + " Field(\"oa_url\", [\"open_access\", \"oa_url\"]),\n", + " Field(\"pdf_url\", [\"primary_location\", \"pdf_url\"]),\n", + " Field(\"license\", [\"primary_location\", \"license\"]) \n", + " ]\n", + "\n", + "def get_nested_value(data: Dict, path: List[str], default: Any = None) -> Any:\n", + " \"\"\"Extract nested value from dictionary using path\"\"\"\n", + " value = data\n", + " for key in path:\n", + " try:\n", + " value = value[key]\n", + " except (KeyError, TypeError):\n", + " return default\n", + " return value\n", + "\n", + "class DataAugmenter:\n", + " \"\"\"Class for augmenting data with OpenAlex features\"\"\"\n", + "\n", + " def __init__(self):\n", + " \"\"\"Initialize augmenter with API credentials\"\"\"\n", + " self.profile = self._load_profile()\n", + " self.email = self.profile[\"email\"]\n", + " self.filters = ConfigAugmentation(\n", + " basic={\n", + " \"id\": True,\n", + " \"doi\": True,\n", + " \"title\": True,\n", + " \"display_name\": True,\n", + " \"publication_year\": True,\n", + " \"publication_date\": True,\n", + " \"language\": True,\n", + " \"type\": True,\n", + " \"type_crossref\": True\n", + " },\n", + " source={\n", + " \"journal_name\": True,\n", + " \"issn\": True,\n", + " \"issn_l\": True,\n", + " \"publisher\": True,\n", + " \"type\": True\n", + " },\n", + " authors={\n", + " \"position\": True,\n", + " \"name\": True,\n", + " \"id\": True,\n", + " \"orcid\": True,\n", + " \"is_corresponding\": True,\n", + " \"affiliations\": False\n", + " },\n", + " metrics={\n", + " \"cited_by_count\": True,\n", + " \"cited_by_percentile\": False,\n", + " \"is_retracted\": True,\n", + " \"fwci\": True,\n", + " \"referenced_works_count\": True\n", + " },\n", + " classification={\n", + " \"primary_topic\": True,\n", + " \"topics\": False,\n", + " \"concepts\": False,\n", + " },\n", + " access={\n", + " \"is_oa\": True,\n", + " \"oa_status\": 
True,\n", + " \"oa_url\": True,\n", + " \"pdf_url\": True,\n", + " \"license\": True\n", + " },\n", + " related_works={\n", + " \"references\": True,\n", + " \"referenced_by_count\": True,\n", + " \"related\": True\n", + " },\n", + " abstract=True\n", + " )\n", + " \n", + " pyalex.config.email = self.email\n", + " \n", + " def _load_profile(self) -> Dict[str, str]:\n", + " \"\"\"Load API credentials from profile\"\"\"\n", + " profile_path = f\"{PROJECT_ROOT}/user_information/profile.yaml\"\n", + " \n", + " assert str(PROJECT_ROOT).split(\"/\")[-1] == \"MatchingPubs\", \"Please run this script in the github repo folder \"\n", + " assert os.path.exists(profile_path), \"create a profile.yaml with your email (email:) and your api key (api_key:). Go here to get one https://dev.elsevier.com/\"\n", + "\n", + " \n", + " with open(profile_path, \"r\") as f:\n", + " profile = yaml.safe_load(f)\n", + " \n", + " return {\n", + " \"email\": profile[\"email\"]\n", + " }\n", + "\n", + " def get_alex_features(self, doi: str) -> Dict:\n", + " \"\"\"Extract all OpenAlex features for a DOI\"\"\"\n", + " try:\n", + " work = Works()[f\"https://doi.org/{doi}\"]\n", + " result = {}\n", + "\n", + " # Basic metadata\n", + " result[\"basic\"] = {\n", + " field.name: get_nested_value(work, field.path, None)\n", + " for field in AlexFields.BASIC\n", + " }\n", + " \n", + " # Source/journal info\n", + " result[\"source\"] = {\n", + " field.name: get_nested_value(work, field.path, None)\n", + " for field in AlexFields.SOURCE\n", + " }\n", + " \n", + " # Authors with affiliations\n", + " try:\n", + " result[\"authors\"] = [\n", + " {\n", + " \"position\": auth.get(\"author_position\", None),\n", + " \"name\": auth.get(\"author\", {}).get(\"display_name\", None),\n", + " \"id\": auth.get(\"author\", {}).get(\"id\", None),\n", + " \"orcid\": auth.get(\"author\", {}).get(\"orcid\", None),\n", + " \"is_corresponding\": auth.get(\"is_corresponding\", None),\n", + " \"affiliations\": [\n", + " {\n", + 
" \"name\": inst.get(\"display_name\", None),\n", + " \"id\": inst.get(\"id\", None),\n", + " \"country\": inst.get(\"country_code\", None),\n", + " \"type\": inst.get(\"type\", None),\n", + " \"ror\": inst.get(\"ror\", None)\n", + " }\n", + " for inst in auth.get(\"institutions\", [])\n", + " ]\n", + " }\n", + " for auth in work.get(\"authorships\", [])\n", + " ]\n", + " except:\n", + " result[\"authors\"] = None\n", + "\n", + " # Topics and classifications \n", + " try:\n", + " result[\"classification\"] = {\n", + " \"primary_topic\": {\n", + " \"name\": work.get(\"primary_topic\", {}).get(\"display_name\", None),\n", + " \"score\": work.get(\"primary_topic\", {}).get(\"score\", None),\n", + " \"field\": work.get(\"primary_topic\", {}).get(\"field\", {}).get(\"display_name\", None),\n", + " \"subfield\": work.get(\"primary_topic\", {}).get(\"subfield\", {}).get(\"display_name\", None)\n", + " },\n", + " \"topics\": [\n", + " {\n", + " \"name\": topic.get(\"display_name\", None),\n", + " \"score\": topic.get(\"score\", None),\n", + " \"field\": topic.get(\"field\", {}).get(\"display_name\", None)\n", + " }\n", + " for topic in work.get(\"topics\", [])\n", + " ],\n", + " \"concepts\": [\n", + " {\n", + " \"name\": concept.get(\"display_name\", None),\n", + " \"level\": concept.get(\"level\", None),\n", + " \"score\": concept.get(\"score\", None),\n", + " \"wikidata\": concept.get(\"wikidata\", None)\n", + " }\n", + " for concept in work.get(\"concepts\", [])\n", + " ]\n", + " }\n", + " except:\n", + " result[\"classification\"] = None\n", + "\n", + " # Metrics\n", + " result[\"metrics\"] = {\n", + " field.name: get_nested_value(work, field.path, None)\n", + " for field in AlexFields.METRICS\n", + " }\n", + "\n", + " # Access info\n", + " result[\"access\"] = {\n", + " field.name: get_nested_value(work, field.path, None)\n", + " for field in AlexFields.ACCESS\n", + " }\n", + "\n", + " # Abstract\n", + " try:\n", + " if \"abstract_inverted_index\" in work:\n", + " 
abstract_dict = work[\"abstract_inverted_index\"]\n", + " if abstract_dict:\n", + " max_pos = max(max(positions) for positions in abstract_dict.values())\n", + " words = [\"\"] * (max_pos + 1)\n", + " for word, positions in abstract_dict.items():\n", + " for pos in positions:\n", + " words[pos] = word\n", + " result[\"abstract\"] = \" \".join(words)\n", + " else:\n", + " result[\"abstract\"] = None\n", + " else:\n", + " result[\"abstract\"] = None\n", + " except:\n", + " result[\"abstract\"] = None\n", + "\n", + " return result\n", + "\n", + " except Exception as e:\n", + " print(f\"OpenAlex error for DOI {doi}: {e}\")\n", + " return {}\n", + " \n", + " def filter_augmented_data(self, data: Dict[str, Any], config: ConfigAugmentation = None) -> Dict[str, Any]:\n", + " \"\"\"Filter data based on configuration\n", + " \n", + " Args:\n", + " data: Dictionary containing raw data\n", + " config: Configuration specifying which features to include\n", + " \n", + " Returns:\n", + " Filtered dictionary containing only the configured features\n", + " \"\"\"\n", + " config = config or self.filters\n", + " \n", + " def filter_section(section_data: Dict[str, Any], section_config: Dict[str, bool]) -> Dict[str, Any]:\n", + " \"\"\"Filter a section of the data based on the section configuration\"\"\"\n", + " return {k: v for k, v in section_data.items() if k in section_config and section_config[k]}\n", + " \n", + " filtered_data = {}\n", + " \n", + " # Filter OpenAlex data\n", + " alex_filtered = {}\n", + " \n", + " # Basic metadata\n", + " if config.basic:\n", + " alex_filtered[\"basic\"] = filter_section(data.get(\"basic\", {}), config.basic)\n", + " \n", + " # Source/journal info\n", + " if config.source:\n", + " alex_filtered[\"source\"] = filter_section(data.get(\"source\", {}), config.source)\n", + " \n", + " # Authors\n", + " if config.authors:\n", + " authors_data = data.get(\"authors\", [])\n", + " filtered_authors = []\n", + " for author in authors_data:\n", + " 
filtered_author = filter_section(author, config.authors)\n", + " if config.authors.get(\"affiliations\", False):\n", + " print(author.get(\"affiliations\", []))\n", + " filtered_author[\"affiliations\"] = [\n", + " filter_section(aff, config.authors[\"affiliations\"])\n", + " for aff in author.get(\"affiliations\", [])\n", + " ]\n", + " filtered_authors.append(filtered_author)\n", + " alex_filtered[\"authors\"] = filtered_authors\n", + " \n", + " # Metrics\n", + " if config.metrics:\n", + " alex_filtered[\"metrics\"] = filter_section(data.get(\"metrics\", {}), config.metrics)\n", + " \n", + " # Classification\n", + " if config.classification:\n", + " classification_data = data.get(\"classification\", {})\n", + " alex_filtered[\"classification\"] = {\n", + " k: v for k, v in classification_data.items() if k in config.classification and config.classification[k]\n", + " }\n", + " \n", + " # Access info\n", + " if config.access:\n", + " alex_filtered[\"access\"] = filter_section(data.get(\"access\", {}), config.access)\n", + " \n", + " # Related works\n", + " if config.related_works:\n", + " alex_filtered[\"related_works\"] = filter_section(data.get(\"related_works\", {}), config.related_works)\n", + " \n", + " # Abstract\n", + " if config.abstract and \"abstract\" in data:\n", + " alex_filtered[\"abstract\"] = data[\"abstract\"]\n", + " \n", + " filtered_data = alex_filtered\n", + " \n", + " return filtered_data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'primary_topic': {'name': 'Gait Analysis and Fall Prevention in Elderly',\n", + " 'score': 0.9994,\n", + " 'field': 'Health Professions',\n", + " 'subfield': 'Physical Therapy, Sports Therapy and Rehabilitation'}}" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "doi = \"10.2196/41082\"\n", + "a = DataAugmenter()\n", + "info = a.get_alex_features(doi)\n", + 
"filtered_info = a.filter_augmented_data(info)\n", + "filtered_info[\"classification\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "class FullAugmentedDataset: \n", + "\n", + " def __init__(self):\n", + " self.augmenter = DataAugmenter()\n", + " self.full_raw_dataset = self._load_the_dataset()\n", + "\n", + " def _load_the_dataset(self, type: DatasetType = DatasetType.FULL_RAW) -> pd.DataFrame:\n", + " \"\"\"Load as csv file one of the datasets for training.\"\"\"\n", + " assert str(PROJECT_ROOT).split(\"/\")[-1] == \"MatchingPubs\", \"Please run this script in the github repo folder \"\n", + " \n", + " if type == DatasetType.FULL_RAW:\n", + " return pd.read_csv(f\"{PROJECT_ROOT}/data/crossref-preprint-article-relationships-Aug-2023.csv\")\n", + "\n", + " def retrieve_dois_couple(self, len: int = 1, random: bool = False, seed: bool = None, full: bool = False):\n", + " \"\"\"Retrieve two DOIs from the dataset\"\"\"\n", + " if random:\n", + " dois = self.full_raw_dataset.sample(n=len, random_state=seed)[[\"preprint_doi\", \"article_doi\"]]\n", + " else:\n", + " dois = self.full_raw_dataset.head(len)[[\"preprint_doi\", \"article_doi\"]]\n", + " if full:\n", + " dois = self.full_raw_dataset[[\"preprint_doi\", \"article_doi\"]]\n", + " return dois.to_numpy()\n", + " \n", + " @staticmethod\n", + " def _flatten_list(lst):\n", + " \"\"\"\n", + " Flattens a nested list into a single list. 
If the input is not nested, it returns the original list.\n", + " Handles cases where some elements are lists and others are not.\n", + " \"\"\"\n", + " if not isinstance(lst, list): # Ensure the input is a list\n", + " raise ValueError(\"Input must be a list\")\n", + "\n", + " def _flatten(sublist):\n", + " for item in sublist:\n", + " if isinstance(item, list): # Check if the item is a list\n", + " yield from _flatten(item) # Recursively flatten the list\n", + " else:\n", + " yield item # Yield the non-list item\n", + "\n", + " return list(_flatten(lst))\n", + " \n", + " def _augmented_data_to_row(self, filtered_data: Dict[str, Any], preprint: bool = True) -> pd.Series:\n", + " \"\"\"Transform filtered augmented data into a single-row pandas DataFrame\n", + " \n", + " Args:\n", + " filtered_data: Dictionary containing filtered OpenAlex and Elsevier data\n", + " preprint: If True, use prpnt_ prefix, else use article_ prefix\n", + " \n", + " Returns:\n", + " pd.DataFrame: Flattened data as a single row\n", + " \"\"\"\n", + "\n", + " additional_part = FullAugmentedDataset.filter_author(filtered_data.get(\"authors\",{}))\n", + " # modify the key of additional part by adding authors_ at the beginning\n", + " additional_part = {f\"authors_{k}\": v for k, v in additional_part.items()} \n", + " # remove the authors key from filtered_data\n", + " filtered_data.pop(\"authors\")\n", + " # append the additional part to filtered_data\n", + " filtered_data.update(additional_part)\n", + " final_dictionary = FullAugmentedDataset.flatten_dict(filtered_data, preprint=preprint)\n", + "\n", + " for k, v in final_dictionary.items():\n", + " final_dictionary[k] = \"$@$\".join(map(str, FullAugmentedDataset._flatten_list(v))) if isinstance(v, list) else [v]\n", + "\n", + " return pd.DataFrame(final_dictionary)\n", + "\n", + " @staticmethod\n", + " def filter_author(authors_info : list) -> dict:\n", + "\n", + " try:\n", + " relevant_keys = authors_info[0].keys()\n", + " new_dict = {}\n", + " for key in 
relevant_keys:\n", + " new_dict[key] = [author[key] for author in authors_info]\n", + " return new_dict\n", + " except:\n", + " return {}\n", + " \n", + " @staticmethod\n", + " def flatten_dict(d: dict, parent_key: str = '', sep: str = '_', preprint = True) -> dict:\n", + " \"\"\"Flatten a nested dictionary.\n", + " \n", + " Args:\n", + " d (dict): The dictionary to flatten.\n", + " parent_key (str): The base key string to use for the flattened keys.\n", + " sep (str): The separator to use between parent and child keys.\n", + " \n", + " Returns:\n", + " dict: The flattened dictionary.\n", + " \"\"\"\n", + " addition = \"prpnt_\" if preprint else \"article_\"\n", + " def _flatten_dict(d: dict, parent_key: str = '', sep: str = '_') -> dict:\n", + " items = []\n", + " for k, v in d.items():\n", + " new_key = f\"{parent_key}{sep}{k}\" if parent_key else k\n", + " if isinstance(v, dict):\n", + " items.extend(_flatten_dict(v, new_key, sep=sep).items())\n", + " else:\n", + " items.append((new_key, v))\n", + " return dict(items)\n", + " return {f\"{addition}{k}\": v for k, v in _flatten_dict(d, parent_key, sep).items()}\n", + "\n", + " def process_pair(self, dois) -> pd.DataFrame:\n", + " \"\"\"Process a pair of DOIs and return combined rows as a DataFrame\"\"\"\n", + " assert len(dois) > 0\n", + " rows = []\n", + " for preprint_doi, article_doi in tqdm(dois):\n", + " # Get preprint features\n", + " preprint_features = self.augmenter.get_alex_features(preprint_doi) # augment with all the features\n", + " preprint_filtered = self.augmenter.filter_augmented_data(preprint_features) # filter the relevant features\n", + " preprint_row = self._augmented_data_to_row(preprint_filtered, True)\n", + "\n", + " # Get article features\n", + " article_features = self.augmenter.get_alex_features(article_doi) # augment with all the features\n", + " article_filtered = self.augmenter.filter_augmented_data(article_features)\n", + " article_row = self._augmented_data_to_row(article_filtered, 
False)\n", + "\n", + " # print(article_row.columns)\n", + " # print(len(preprint_row.columns))\n", + "\n", + " # combined_row = pd.concat([preprint_row, article_row], axis=1)\n", + " # rows.append(combined_row)\n", + " rows.append([preprint_row, article_row])\n", + "\n", + " return rows\n", + "\n", + " @staticmethod\n", + " def transform_array(input_array, factor):\n", + " output_list = []\n", + " \n", + " for i, row in enumerate(input_array):\n", + " other_indices = np.array([j for j in range(len(input_array)) if j != i])\n", + " sampled_indices = np.random.choice(other_indices, size=factor, replace=False)\n", + " sampled_rows = [input_array[j] for j in sampled_indices]\n", + "\n", + " output_list.append(pd.concat([row[0], row[1], pd.DataFrame(data=[1], columns=['label'])], axis=1))\n", + " for B in sampled_rows:\n", + " output_list.append(pd.concat([row[0], B[1], pd.DataFrame(data=[0], columns=['label'])], axis=1))\n", + "\n", + " return pd.concat(output_list).reset_index(drop=True)\n", + "\n", + " def get_full_dataset(self, len: int = 1, random: bool = True, seed: int = 42, full: bool = True) -> pd.DataFrame:\n", + " \"\"\"Process all DOI pairs and return full dataset\"\"\"\n", + " dois = self.retrieve_dois_couple(len, random, seed, full)\n", + " self.augmented_df = FullAugmentedDataset.transform_array(self.process_pair(dois), factor=3)\n", + " return self.augmented_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# TRYING STUFF" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# Create dataset with new configs\n", + "dataset = FullAugmentedDataset()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(5, 2)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dois = dataset.retrieve_dois_couple(5, random = True, seed = 42)\n", + 
"dois.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "info = dataset.augmenter.get_alex_features(dois[0][0]) # augment with all the features" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'basic': {'id': 'https://openalex.org/W4213260597',\n", + " 'doi': 'https://doi.org/10.31234/osf.io/6fps2',\n", + " 'title': 'Distance perception in virtual reality: A meta-analysis of the effect of head-mounted display characteristics',\n", + " 'display_name': 'Distance perception in virtual reality: A meta-analysis of the effect of head-mounted display characteristics',\n", + " 'publication_year': 2022,\n", + " 'publication_date': '2022-02-12',\n", + " 'language': 'en',\n", + " 'type': 'preprint',\n", + " 'type_crossref': 'posted-content'},\n", + " 'source': {'journal_name': None,\n", + " 'issn': None,\n", + " 'issn_l': None,\n", + " 'publisher': None,\n", + " 'type': None},\n", + " 'authors': [{'position': 'first',\n", + " 'name': 'Jonathan W. 
Kelly',\n", + " 'id': 'https://openalex.org/A5011931977',\n", + " 'orcid': 'https://orcid.org/0000-0002-4317-273X',\n", + " 'is_corresponding': True,\n", + " 'affiliations': [{'name': 'Iowa State University',\n", + " 'id': 'https://openalex.org/I173911158',\n", + " 'country': 'US',\n", + " 'type': 'education',\n", + " 'ror': 'https://ror.org/04rswrd78'}]}],\n", + " 'classification': {'primary_topic': {'name': 'Virtual Presence and Embodiment in VR Research',\n", + " 'score': 0.9982,\n", + " 'field': 'Computer Science',\n", + " 'subfield': 'Human-Computer Interaction'},\n", + " 'topics': [{'name': 'Virtual Presence and Embodiment in VR Research',\n", + " 'score': 0.9982,\n", + " 'field': 'Computer Science'},\n", + " {'name': 'Neural Mechanisms of Visual Perception and Processing',\n", + " 'score': 0.9906,\n", + " 'field': 'Neuroscience'},\n", + " {'name': 'Spatial Ability for STEM Domains',\n", + " 'score': 0.9727,\n", + " 'field': 'Engineering'}],\n", + " 'concepts': [{'name': 'Virtual reality',\n", + " 'level': 2,\n", + " 'score': 0.74525213,\n", + " 'wikidata': 'https://www.wikidata.org/wiki/Q170519'},\n", + " {'name': 'Perception',\n", + " 'level': 2,\n", + " 'score': 0.69497585,\n", + " 'wikidata': 'https://www.wikidata.org/wiki/Q160402'},\n", + " {'name': 'Optical head-mounted display',\n", + " 'level': 2,\n", + " 'score': 0.64143133,\n", + " 'wikidata': 'https://www.wikidata.org/wiki/Q17105103'},\n", + " {'name': 'Computer science',\n", + " 'level': 0,\n", + " 'score': 0.4773505,\n", + " 'wikidata': 'https://www.wikidata.org/wiki/Q21198'},\n", + " {'name': 'Psychology',\n", + " 'level': 0,\n", + " 'score': 0.3757282,\n", + " 'wikidata': 'https://www.wikidata.org/wiki/Q9418'},\n", + " {'name': 'Computer vision',\n", + " 'level': 1,\n", + " 'score': 0.3722988,\n", + " 'wikidata': 'https://www.wikidata.org/wiki/Q844240'},\n", + " {'name': 'Artificial intelligence',\n", + " 'level': 1,\n", + " 'score': 0.35102686,\n", + " 'wikidata': 
'https://www.wikidata.org/wiki/Q11660'},\n", + " {'name': 'Neuroscience',\n", + " 'level': 1,\n", + " 'score': 0.0,\n", + " 'wikidata': 'https://www.wikidata.org/wiki/Q207011'}]},\n", + " 'metrics': {'cited_by_count': 6,\n", + " 'cited_by_percentile': {'value': 0.997093,\n", + " 'is_in_top_1_percent': True,\n", + " 'is_in_top_10_percent': True},\n", + " 'is_retracted': False,\n", + " 'fwci': None,\n", + " 'referenced_works_count': 89},\n", + " 'access': {'is_oa': True,\n", + " 'oa_status': 'green',\n", + " 'oa_url': 'https://psyarxiv.com/6fps2/download',\n", + " 'pdf_url': 'https://psyarxiv.com/6fps2/download',\n", + " 'license': None},\n", + " 'abstract': 'Distances are commonly underperceived in virtual reality (VR), and this finding has been documented repeatedly over more than two decades of research. Yet, there is evidence that perceived distance is more accurate in modern compared to older head-mounted displays (HMDs). This meta-analysis of 131 studies describes egocentric distance perception across 20 HMDs, and also examines the relationship between perceived distance and technical HMD characteristics. Judged distance was positively associated with HMD field of view (FOV), positively associated with HMD resolution, and negatively associated with HMD weight. The effects of FOV and resolution were more pronounced among heavier HMDs. These findings suggest that future improvements in these technical characteristics may be central to resolving the problem of distance underperception in VR.'}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "info" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "filtered_info = dataset.augmenter.filter_augmented_data(info)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
prpnt_basic_idprpnt_basic_doiprpnt_basic_titleprpnt_basic_display_nameprpnt_basic_publication_yearprpnt_basic_publication_dateprpnt_basic_languageprpnt_basic_typeprpnt_basic_type_crossrefprpnt_source_journal_name...prpnt_access_oa_statusprpnt_access_oa_urlprpnt_access_pdf_urlprpnt_access_licenseprpnt_abstractprpnt_authors_positionprpnt_authors_nameprpnt_authors_idprpnt_authors_orcidprpnt_authors_is_corresponding
0https://openalex.org/W4213260597https://doi.org/10.31234/osf.io/6fps2Distance perception in virtual reality: A meta...Distance perception in virtual reality: A meta...20222022-02-12enpreprintposted-contentNone...greenhttps://psyarxiv.com/6fps2/downloadhttps://psyarxiv.com/6fps2/downloadNoneDistances are commonly underperceived in virtu...firstJonathan W. Kellyhttps://openalex.org/A5011931977https://orcid.org/0000-0002-4317-273XTrue
\n", + "

1 rows × 33 columns

\n", + "
" + ], + "text/plain": [ + " prpnt_basic_id prpnt_basic_doi \\\n", + "0 https://openalex.org/W4213260597 https://doi.org/10.31234/osf.io/6fps2 \n", + "\n", + " prpnt_basic_title \\\n", + "0 Distance perception in virtual reality: A meta... \n", + "\n", + " prpnt_basic_display_name \\\n", + "0 Distance perception in virtual reality: A meta... \n", + "\n", + " prpnt_basic_publication_year prpnt_basic_publication_date \\\n", + "0 2022 2022-02-12 \n", + "\n", + " prpnt_basic_language prpnt_basic_type prpnt_basic_type_crossref \\\n", + "0 en preprint posted-content \n", + "\n", + " prpnt_source_journal_name ... prpnt_access_oa_status \\\n", + "0 None ... green \n", + "\n", + " prpnt_access_oa_url prpnt_access_pdf_url \\\n", + "0 https://psyarxiv.com/6fps2/download https://psyarxiv.com/6fps2/download \n", + "\n", + " prpnt_access_license prpnt_abstract \\\n", + "0 None Distances are commonly underperceived in virtu... \n", + "\n", + " prpnt_authors_position prpnt_authors_name \\\n", + "0 first Jonathan W. Kelly \n", + "\n", + " prpnt_authors_id prpnt_authors_orcid \\\n", + "0 https://openalex.org/A5011931977 https://orcid.org/0000-0002-4317-273X \n", + "\n", + " prpnt_authors_is_corresponding \n", + "0 True \n", + "\n", + "[1 rows x 33 columns]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "row = dataset._augmented_data_to_row(filtered_info)\n", + "row" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 5/5 [00:04<00:00, 1.02it/s]\n", + "/var/folders/kp/b80wd80s53l95yjb77jn_l0r0000gn/T/ipykernel_17064/485421214.py:140: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. 
To retain the old behavior, exclude the relevant entries before the concat operation.\n", + " return pd.concat(output_list).reset_index(drop=True)\n" + ] + } + ], + "source": [ + "df = dataset.get_full_dataset(5, full=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[['10.31234/osf.io/6fps2' '10.1109/tvcg.2022.3196606']\n", + " ['10.5194/acpd-11-3071-2011' '10.5194/acp-12-3837-2012']\n", + " ['10.1101/2020.08.07.241687' '10.1021/acscentsci.1c00703']\n", + " ['10.21203/rs.3.rs-62250/v1' '10.1016/j.vetpar.2021.109373']\n", + " ['10.21203/rs.3.rs-2640242/v1' '10.1007/s10499-023-01047-1']]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
prpnt_basic_doiarticle_basic_doilabel
0https://doi.org/10.31234/osf.io/6fps2https://doi.org/10.1109/tvcg.2022.31966061
1https://doi.org/10.31234/osf.io/6fps2https://doi.org/10.1007/s10499-023-01047-10
2https://doi.org/10.31234/osf.io/6fps2https://doi.org/10.5194/acp-12-3837-20120
3https://doi.org/10.31234/osf.io/6fps2https://doi.org/10.1016/j.vetpar.2021.1093730
4https://doi.org/10.5194/acpd-11-3071-2011https://doi.org/10.5194/acp-12-3837-20121
5https://doi.org/10.5194/acpd-11-3071-2011https://doi.org/10.1016/j.vetpar.2021.1093730
6https://doi.org/10.5194/acpd-11-3071-2011https://doi.org/10.1109/tvcg.2022.31966060
7https://doi.org/10.5194/acpd-11-3071-2011https://doi.org/10.1007/s10499-023-01047-10
8https://doi.org/10.1101/2020.08.07.241687https://doi.org/10.1021/acscentsci.1c007031
9https://doi.org/10.1101/2020.08.07.241687https://doi.org/10.1007/s10499-023-01047-10
10https://doi.org/10.1101/2020.08.07.241687https://doi.org/10.1109/tvcg.2022.31966060
11https://doi.org/10.1101/2020.08.07.241687https://doi.org/10.1016/j.vetpar.2021.1093730
12https://doi.org/10.21203/rs.3.rs-62250/v1https://doi.org/10.1016/j.vetpar.2021.1093731
13https://doi.org/10.21203/rs.3.rs-62250/v1https://doi.org/10.5194/acp-12-3837-20120
14https://doi.org/10.21203/rs.3.rs-62250/v1https://doi.org/10.1109/tvcg.2022.31966060
15https://doi.org/10.21203/rs.3.rs-62250/v1https://doi.org/10.1021/acscentsci.1c007030
16https://doi.org/10.21203/rs.3.rs-2640242/v1https://doi.org/10.1007/s10499-023-01047-11
17https://doi.org/10.21203/rs.3.rs-2640242/v1https://doi.org/10.5194/acp-12-3837-20120
18https://doi.org/10.21203/rs.3.rs-2640242/v1https://doi.org/10.1016/j.vetpar.2021.1093730
19https://doi.org/10.21203/rs.3.rs-2640242/v1https://doi.org/10.1021/acscentsci.1c007030
\n", + "
" + ], + "text/plain": [ + " prpnt_basic_doi \\\n", + "0 https://doi.org/10.31234/osf.io/6fps2 \n", + "1 https://doi.org/10.31234/osf.io/6fps2 \n", + "2 https://doi.org/10.31234/osf.io/6fps2 \n", + "3 https://doi.org/10.31234/osf.io/6fps2 \n", + "4 https://doi.org/10.5194/acpd-11-3071-2011 \n", + "5 https://doi.org/10.5194/acpd-11-3071-2011 \n", + "6 https://doi.org/10.5194/acpd-11-3071-2011 \n", + "7 https://doi.org/10.5194/acpd-11-3071-2011 \n", + "8 https://doi.org/10.1101/2020.08.07.241687 \n", + "9 https://doi.org/10.1101/2020.08.07.241687 \n", + "10 https://doi.org/10.1101/2020.08.07.241687 \n", + "11 https://doi.org/10.1101/2020.08.07.241687 \n", + "12 https://doi.org/10.21203/rs.3.rs-62250/v1 \n", + "13 https://doi.org/10.21203/rs.3.rs-62250/v1 \n", + "14 https://doi.org/10.21203/rs.3.rs-62250/v1 \n", + "15 https://doi.org/10.21203/rs.3.rs-62250/v1 \n", + "16 https://doi.org/10.21203/rs.3.rs-2640242/v1 \n", + "17 https://doi.org/10.21203/rs.3.rs-2640242/v1 \n", + "18 https://doi.org/10.21203/rs.3.rs-2640242/v1 \n", + "19 https://doi.org/10.21203/rs.3.rs-2640242/v1 \n", + "\n", + " article_basic_doi label \n", + "0 https://doi.org/10.1109/tvcg.2022.3196606 1 \n", + "1 https://doi.org/10.1007/s10499-023-01047-1 0 \n", + "2 https://doi.org/10.5194/acp-12-3837-2012 0 \n", + "3 https://doi.org/10.1016/j.vetpar.2021.109373 0 \n", + "4 https://doi.org/10.5194/acp-12-3837-2012 1 \n", + "5 https://doi.org/10.1016/j.vetpar.2021.109373 0 \n", + "6 https://doi.org/10.1109/tvcg.2022.3196606 0 \n", + "7 https://doi.org/10.1007/s10499-023-01047-1 0 \n", + "8 https://doi.org/10.1021/acscentsci.1c00703 1 \n", + "9 https://doi.org/10.1007/s10499-023-01047-1 0 \n", + "10 https://doi.org/10.1109/tvcg.2022.3196606 0 \n", + "11 https://doi.org/10.1016/j.vetpar.2021.109373 0 \n", + "12 https://doi.org/10.1016/j.vetpar.2021.109373 1 \n", + "13 https://doi.org/10.5194/acp-12-3837-2012 0 \n", + "14 https://doi.org/10.1109/tvcg.2022.3196606 0 \n", + "15 
https://doi.org/10.1021/acscentsci.1c00703 0 \n", + "16 https://doi.org/10.1007/s10499-023-01047-1 1 \n", + "17 https://doi.org/10.5194/acp-12-3837-2012 0 \n", + "18 https://doi.org/10.1016/j.vetpar.2021.109373 0 \n", + "19 https://doi.org/10.1021/acscentsci.1c00703 0 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "print(dois)\n", + "display(df[['prpnt_basic_doi', 'article_basic_doi', 'label']])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dl", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/positive_augmented_dataset.csv b/positive_augmented_dataset.csv new file mode 100644 index 0000000000000000000000000000000000000000..85f3baff8bfbaf88b36748014dc3ac0341f3199f --- /dev/null +++ b/positive_augmented_dataset.csv @@ -0,0 +1,11 @@ 
+prpnt_basic_id,prpnt_basic_doi,prpnt_basic_title,prpnt_basic_display_name,prpnt_basic_publication_year,prpnt_basic_publication_date,prpnt_basic_language,prpnt_basic_type,prpnt_basic_type_crossref,prpnt_source_journal_name,prpnt_source_issn,prpnt_source_issn_l,prpnt_source_publisher,prpnt_source_type,prpnt_metrics_cited_by_count,prpnt_metrics_is_retracted,prpnt_metrics_fwci,prpnt_metrics_referenced_works_count,prpnt_classification_primary_topic_name,prpnt_classification_primary_topic_score,prpnt_classification_primary_topic_field,prpnt_classification_primary_topic_subfield,prpnt_access_is_oa,prpnt_access_oa_status,prpnt_access_oa_url,prpnt_access_pdf_url,prpnt_access_license,prpnt_abstract,prpnt_authors_position,prpnt_authors_name,prpnt_authors_id,prpnt_authors_orcid,prpnt_authors_is_corresponding,article_basic_id,article_basic_doi,article_basic_title,article_basic_display_name,article_basic_publication_year,article_basic_publication_date,article_basic_language,article_basic_type,article_basic_type_crossref,article_source_journal_name,article_source_issn,article_source_issn_l,article_source_publisher,article_source_type,article_metrics_cited_by_count,article_metrics_is_retracted,article_metrics_fwci,article_metrics_referenced_works_count,article_classification_primary_topic_name,article_classification_primary_topic_score,article_classification_primary_topic_field,article_classification_primary_topic_subfield,article_access_is_oa,article_access_oa_status,article_access_oa_url,article_access_pdf_url,article_access_license,article_abstract,article_authors_position,article_authors_name,article_authors_id,article_authors_orcid,article_authors_is_corresponding +https://openalex.org/W2020559395,https://doi.org/10.5194/acpd-11-3071-2011,A regional chemical transport modeling to identify the influences of biomass burning during 2006 BASE-ASIA,A regional chemical transport modeling to identify the influences of biomass burning during 2006 
BASE-ASIA,2011,2011-01-28,en,article,posted-content,,,,,,9,False,1.02,86,Atmospheric Aerosols and their Impacts,1.0,Earth and Planetary Sciences,Atmospheric Science,True,green,https://doi.org/10.5194/acpd-11-3071-2011,,cc-by,"Abstract. To evaluate the impact of biomass burning from Southeast Asia to East Asia, this study conducted numerical simulations during NASA's 2006 Biomass-burning Aerosols in South-East Asia: Smoke Impact Assessment (BASE-ASIA). Two typical episode periods (27โ€“28 March and 13โ€“14 April) were examined. Two emission inventories, FLAMBE and GFED, were used in the simulations. The influences during two episodes in the source region (Southeast Asia) contributed to CO, O3 and PM2.5 concentrations as high as 400 ppbv, 20 ppbv and 80 ฮผg/m3, respectively. The perturbations with and without biomass burning of the above three species were in the range of 10 to 60%, 10 to 20% and 30 to 70%, respectively. The impact due to long-range transport could spread over the southeastern parts of East Asia and could reach about 160 to 360 ppbv, 8 to 18 ppbv and 8 to 64 ฮผg/m3 on CO, O3 and PM2.5, respectively; the percentage impact could reach 20 to 50% on CO, 10 to 30% on O3, and as high as 70% on PM2.5. An impact pattern can be found in April, while the impact becomes slightly broader and goes up to Yangtze River Delta. Two cross-sections at 15ยฐ N and 20ยฐ N were used to compare the vertical flux of biomass burning. In the source region (Southeast Asia), CO, O3 and PM2.5 concentrations had a strong upward tendency from surface to high altitudes. The eastward transport becomes strong from 2 to 8 km in the free troposphere. The subsidence contributed 60 to 70%, 20 to 50%, and 80% on CO, O3 and PM2.5, respectively to surface in the downwind area. The study reveals the significant impact of Southeastern Asia biomass burning on the air quality in both local and downwind areas, particularly during biomass burning episodes. 
This modeling study might provide constraints of lower limit. An additional study is underway for an active biomass burning year to obtain an upper limit and climate effects.",first,Joshua S. Fu,https://openalex.org/A5036365752,https://orcid.org/0000-0001-5464-9225,False,https://openalex.org/W2042738430,https://doi.org/10.5194/acp-12-3837-2012,Evaluating the influences of biomass burning during 2006 BASE-ASIA: a regional chemical transport modeling,Evaluating the influences of biomass burning during 2006 BASE-ASIA: a regional chemical transport modeling,2012,2012-05-02,en,article,journal-article,Atmospheric chemistry and physics,1680-7316$@$1680-7324,1680-7316,Copernicus Publications,journal,111,False,5.993,68,Atmospheric chemistry and aerosols,1.0,Earth and Planetary Sciences,Atmospheric Science,True,gold,https://acp.copernicus.org/articles/12/3837/2012/acp-12-3837-2012.pdf,https://acp.copernicus.org/articles/12/3837/2012/acp-12-3837-2012.pdf,cc-by,"Abstract. To evaluate the impact of biomass burning from Southeast Asia to East Asia, this study conducted numerical simulations during NASA's 2006 Biomass-burning Aerosols in South-East Asia: Smoke Impact Assessment (BASE-ASIA). Two typical episode periods (27โ€“28 March and 13โ€“14 April) were examined. Two emission inventories, FLAMBE and GFED, were used in the simulations. The influences during two episodes in the source region (Southeast Asia) contributed to the surface CO, O3 and PM2.5 concentrations as high as 400 ppbv, 20 ppbv and 80 ฮผg mโˆ’3, respectively. The perturbations with and without biomass burning of the above three species during the intense episodes were in the range of 10 to 60%, 10 to 20% and 30 to 70%, respectively. 
The impact due to long-range transport could spread over the southeastern parts of East Asia and could reach about 160 to 360 ppbv, 8 to 18 ppbv and 8 to 64 ฮผg mโˆ’3 on CO, O3 and PM2.5, respectively; the percentage impact could reach 20 to 50% on CO, 10 to 30% on O3, and as high as 70% on PM2.5. In March, the impact of biomass burning mainly concentrated in Southeast Asia and southern China, while in April the impact becomes slightly broader and even could go up to the Yangtze River Delta region. Two cross-sections at 15ยฐ N and 20ยฐ N were used to compare the vertical flux of biomass burning. In the source region (Southeast Asia), CO, O3 and PM2.5 concentrations had a strong upward transport from surface to high altitudes. The eastward transport becomes strong from 2 to 8 km in the free troposphere. The subsidence process during the long-range transport contributed 60 to 70%, 20 to 50%, and 80% on CO, O3 and PM2.5, respectively to surface in the downwind area. The study reveals the significant impact of Southeastern Asia biomass burning on the air quality in both local and downwind areas, particularly during biomass burning episodes. This modeling study might provide constraints of lower limit. An additional study is underway for an active biomass burning year to obtain an upper limit and climate effects.",first,Joshua S. 
Fu,https://openalex.org/A5036365752,https://orcid.org/0000-0001-5464-9225,False +https://openalex.org/W4213260597,https://doi.org/10.31234/osf.io/6fps2,Distance perception in virtual reality: A meta-analysis of the effect of head-mounted display characteristics,Distance perception in virtual reality: A meta-analysis of the effect of head-mounted display characteristics,2022,2022-02-12,en,preprint,posted-content,,,,,,6,False,,89,Virtual Reality Applications and Impacts,0.9982,Computer Science,Human-Computer Interaction,True,green,https://psyarxiv.com/6fps2/download,https://psyarxiv.com/6fps2/download,,"Distances are commonly underperceived in virtual reality (VR), and this finding has been documented repeatedly over more than two decades of research. Yet, there is evidence that perceived distance is more accurate in modern compared to older head-mounted displays (HMDs). This meta-analysis of 131 studies describes egocentric distance perception across 20 HMDs, and also examines the relationship between perceived distance and technical HMD characteristics. Judged distance was positively associated with HMD field of view (FOV), positively associated with HMD resolution, and negatively associated with HMD weight. The effects of FOV and resolution were more pronounced among heavier HMDs. These findings suggest that future improvements in these technical characteristics may be central to resolving the problem of distance underperception in VR.",first,Jonathan W. 
Kelly,https://openalex.org/A5011931977,https://orcid.org/0000-0002-4317-273X,True,https://openalex.org/W4289824348,https://doi.org/10.1109/tvcg.2022.3196606,Distance Perception in Virtual Reality: A Meta-Analysis of the Effect of Head-Mounted Display Characteristics,Distance Perception in Virtual Reality: A Meta-Analysis of the Effect of Head-Mounted Display Characteristics,2022,2022-08-04,en,review,journal-article,IEEE Transactions on Visualization and Computer Graphics,1077-2626$@$1941-0506$@$2160-9306,1077-2626,Institute of Electrical and Electronics Engineers,journal,40,False,2.128,96,Virtual Reality Applications and Impacts,0.9997,Computer Science,Human-Computer Interaction,True,green,https://osf.io/6fps2/download,,,"Distances are commonly underperceived in virtual reality (VR), and this finding has been documented repeatedly over more than two decades of research. Yet, there is evidence that perceived distance is more accurate in modern compared to older head-mounted displays (HMDs). This meta-analysis, based on 137 samples from 61 publications, describes egocentric distance perception across 20 HMDs and examines the relationship between perceived distance and technical HMD characteristics. Judged distance was positively associated with HMD field of view (FOV), positively associated with HMD resolution, and negatively associated with HMD weight. The effects of FOV and resolution were more pronounced among heavier HMDs. These findings suggest that future improvements in these technical characteristics may be central to resolving the problem of distance underperception in VR.",first,Jonathan W. 
Kelly,https://openalex.org/A5011931977,https://orcid.org/0000-0002-4317-273X,True +https://openalex.org/W3048189011,https://doi.org/10.1101/2020.08.07.241687,Bioorthogonal red and far-red fluorogenic probes for wash-free live-cell and super-resolution microscopy,Bioorthogonal red and far-red fluorogenic probes for wash-free live-cell and super-resolution microscopy,2020,2020-08-07,en,preprint,posted-content,bioRxiv (Cold Spring Harbor Laboratory),,,Cold Spring Harbor Laboratory,repository,9,False,,46,Click Chemistry in Chemical Biology and Drug Development,0.9999,Chemistry,Organic Chemistry,True,green,https://doi.org/10.1101/2020.08.07.241687,,cc-by-nc-nd,"Abstract Small-molecule fluorophores enable the observation of biomolecules in their native context with fluorescence microscopy. Specific labelling via bioorthogonal tetrazine chemistry confers minimal label size and rapid labelling kinetics. At the same time, fluorogenic tetrazine-dye conjugates exhibit efficient quenching of dyes prior to target binding. However, live-cell compatible long-wavelength fluorophores with strong fluorogenicity have been difficult to realize. Here, we report close proximity tetrazine-dye conjugates with minimal distance between tetrazine and fluorophore. Two synthetic routes give access to a series of cell permeable and impermeable dyes including highly fluorogenic far-red emitting derivatives with electron exchange as dominant excited state quenching mechanism. We demonstrate their potential for live-cell imaging in combination with unnatural amino acids, wash-free multi-colour and super-resolution STED and SOFI imaging. 
These dyes pave the way for advanced fluorescence imaging of biomolecules with minimal label size.",first,Philipp Werther,https://openalex.org/A5087410446,https://orcid.org/0000-0003-1267-5614,False,https://openalex.org/W3193691807,https://doi.org/10.1021/acscentsci.1c00703,Bio-orthogonal Red and Far-Red Fluorogenic Probes for Wash-Free Live-Cell and Super-resolution Microscopy,Bio-orthogonal Red and Far-Red Fluorogenic Probes for Wash-Free Live-Cell and Super-resolution Microscopy,2021,2021-08-20,en,article,journal-article,ACS Central Science,2374-7943$@$2374-7951,2374-7943,American Chemical Society,journal,80,False,7.957,46,Click Chemistry and Applications,0.9999,Chemistry,Organic Chemistry,True,diamond,https://pubs.acs.org/doi/pdf/10.1021/acscentsci.1c00703,https://pubs.acs.org/doi/pdf/10.1021/acscentsci.1c00703,cc-by,"Small-molecule fluorophores enable the observation of biomolecules in their native context with fluorescence microscopy. Specific labeling via bio-orthogonal tetrazine chemistry combines minimal label size with rapid labeling kinetics. At the same time, fluorogenic tetrazineโ€“dye conjugates exhibit efficient quenching of dyes prior to target binding. However, live-cell compatible long-wavelength fluorophores with strong fluorogenicity have been difficult to realize. Here, we report close proximity tetrazineโ€“dye conjugates with minimal distance between tetrazine and the fluorophore. Two synthetic routes give access to a series of cell-permeable and -impermeable dyes including highly fluorogenic far-red emitting derivatives with electron exchange as the dominant excited-state quenching mechanism. We demonstrate their potential for live-cell imaging in combination with unnatural amino acids, wash-free multicolor and super-resolution STED, and SOFI imaging. 
These dyes pave the way for advanced fluorescence imaging of biomolecules with minimal label size.",first,Philipp Werther,https://openalex.org/A5087410446,https://orcid.org/0000-0003-1267-5614,False +https://openalex.org/W4244952185,https://doi.org/10.21203/rs.3.rs-62250/v1,Towards Understanding the Liver Fluke Transmission Dynamics on Farms: Detection of Liver Fluke Transmitting Snail and Liver Fluke-Specific Environmental DNA in Water Samples from an Irrigated Dairy Farm in Southeast Australia,Towards Understanding the Liver Fluke Transmission Dynamics on Farms: Detection of Liver Fluke Transmitting Snail and Liver Fluke-Specific Environmental DNA in Water Samples from an Irrigated Dairy Farm in Southeast Australia,2020,2020-08-24,en,preprint,posted-content,Research Square (Research Square),,,Research Square (United States),repository,0,False,,34,Helminth infection and control,0.9875,Veterinary,Small Animals,True,green,https://www.researchsquare.com/article/rs-62250/latest.pdf,https://www.researchsquare.com/article/rs-62250/latest.pdf,cc-by,"Abstract Background Livestock production around the world is impacted by liver fluke ( Fasciola spp.) infection resulting in serious economic losses to the beef, dairy and sheep industries with significant losses of about $90 million per annum in Australia. Liver fluke infection is predominantly controlled by anthelmintic treatment and Triclabendazole (TCBZ) is usually the drug of choice due its superior efficacy against early immature, immature and adult liver fluke stages; however, the widespread emergence of TCBZ resistance in livestock threatens liver fluke control. We are in the urgent need for alternative control measures to lower the exposure of livestock to liver fluke infection which would help to preserve the usefulness of current anthelmintic treatments. 
Our ability to understand the prevalence of intermediate snail hosts and infective liver fluke stages in the environment is crucial to implement alternative control measures for liver fluke control. However, identification of liver fluke and snails in the environment is hampered by lack of efficient diagnostic methods. Environmental DNA (eDNA) based identification of liver fluke and the intermediate snail host in the water bodies is a promising method to identify liver fluke and snail prevalence on farms. Our aim is to provide a proof of concept to use a molecular tool (quantitative PCR) to detect and quantify eDNA of liver fluke and snail in water bodies on Victorian farming properties for potential large-scale analysis of liver fluke and snail ecology in water bodies. Methods To demonstrate the identification of liver fluke and snail in water bodies, we used a multiplex quantitative PCR assay for the independent but simultaneous detection of eDNA released from snail ( Austropeplea tomentosa) a crucial intermediate snail host for liver fluke transmission in South-east Australia and free-living liver fluke stages ( Fasciola hepatica) . We have collected water samples from an irrigation channel over a period of 11 months in 2016 at a dairy farm located at Maffra, Victoria, South-east Australia and used water samples from selected months (February, March, May, September, October, November and December) for eDNA assay. Results The multiplex qPCR assay effectively allows for the detection and quantification of eDNA released from liver fluke life stages and snails and we observed differential levels of liver fluke and snail specific eDNA in water at the time points analysed in this study. This assay was able to detect 14 fg and 50 pg of liver fluke and snail DNA in the presence of potential inhibitors from field collected water samples. 
Conclusion The successful detection of eDNA specific to liver fluke and snails from the field collected water samples provides a proof of concept for the use of this method as a monitoring tool to determine the prevalence of liver fluke and liver fluke-transmitting snails in irrigation regions to allow for understanding the liver fluke transmission zones on farms to implement effective control strategies.",first,Vignesh Rathinasamy,https://openalex.org/A5065524774,https://orcid.org/0000-0002-4032-3424,False,https://openalex.org/W3126899836,https://doi.org/10.1016/j.vetpar.2021.109373,Towards understanding the liver fluke transmission dynamics on farms: Detection of liver fluke transmitting snail and liver fluke-specific environmental DNA in water samples from an irrigated dairy farm in Southeast Australia,Towards understanding the liver fluke transmission dynamics on farms: Detection of liver fluke transmitting snail and liver fluke-specific environmental DNA in water samples from an irrigated dairy farm in Southeast Australia,2021,2021-02-03,en,article,journal-article,Veterinary Parasitology,0304-4017$@$1873-2550,0304-4017,Elsevier BV,journal,16,False,1.892,36,Environmental DNA in Biodiversity Studies,0.997,Environmental Science,Ecology,True,green,https://www.researchsquare.com/article/rs-62250/latest.pdf,,,,first,Vignesh Rathinasamy,https://openalex.org/A5065524774,https://orcid.org/0000-0002-4032-3424,False +https://openalex.org/W4324046272,https://doi.org/10.21203/rs.3.rs-2640242/v1,Immunophysiology of tambaqui fed with different levels of dietary protein in a biofloc system and a clear water system,Immunophysiology of tambaqui fed with different levels of dietary protein in a biofloc system and a clear water system,2023,2023-03-13,en,preprint,posted-content,Research Square (Research Square),,,Research Square (United States),repository,0,False,,25,Aquaculture disease management and microbiota,0.9985,Immunology and 
Microbiology,Immunology,True,green,https://www.researchsquare.com/article/rs-2640242/latest.pdf,https://www.researchsquare.com/article/rs-2640242/latest.pdf,cc-by,"Abstract The present study evaluated the immunophysiological response in Colossoma macropomum fed with different levels of dietary protein in a biofloc system (BFS) and in clear water (CW) and under infection with Aeromonas jandaei . Juvenile tambaqui (9.20 ยฑ 0.23 g) were fed isolipid feed with three levels of crude protein (CP) in the two production systems: BFS24, BFS28 and BFS32 and CW24, CW28 and CW32 with 24, 28 and 32% CP respectively, for 60 days. At the end of the experimental period, the physiological conditions (hematology, biochemistry, hormonal and oxidative stress) of the fish were analyzed. The results of erythrogram, cortisol, glycemia and serum biochemistry (p > 0.05) of the fish did not show significant differences between the breeding systems (BFS and CW) and the different protein levels. Tambaqui raised in the BFS showed monocytosis, thrombocytosis and higher respiratory activity of leukocytes, as well as higher glutathione (GSH) and lower malondialdehyde (MDA) values (p < 0.05). In the bacterial challenge, after induction of aeromonosis, caused by Aeromonas jandaei , greater survival of fish raised in the BFS was observed. The results suggest that, even at lower protein levels, tambaqui maintain physiological homeostasis and, therefore, it may be possible to use up to 24% CP in the diet in the biofloc system. 
In addition, after the bacterial infection, this system promoted greater immunological resistance in the fish.",first,Michelle Midori Sena Fugimura,https://openalex.org/A5059890817,https://orcid.org/0000-0002-1354-2277,False,https://openalex.org/W4317396516,https://doi.org/10.1007/s10499-023-01047-1,Dietary protein requirement for tambaqui cultivated in biofloc and clear water systems,Dietary protein requirement for tambaqui cultivated in biofloc and clear water systems,2023,2023-01-18,en,article,journal-article,Aquaculture International,0967-6120$@$1573-143X,0967-6120,Springer Science+Business Media,journal,7,False,4.321,50,Metabolism and Nutrition in Aquaculture Feeds,0.9999,Agricultural and Biological Sciences,Aquatic Science,False,closed,,,,,first,Raphael Brito dos Santos,https://openalex.org/A5051287609,https://orcid.org/0000-0003-2168-8759,False +https://openalex.org/W2612690603,https://doi.org/10.1101/135053,Attention is required for knowledge-based sequential grouping of syllables into words,Attention is required for knowledge-based sequential grouping of syllables into words,2017,2017-05-08,en,preprint,posted-content,bioRxiv (Cold Spring Harbor Laboratory),,,Cold Spring Harbor Laboratory,repository,0,False,,72,Neural dynamics and brain function,0.9991,Neuroscience,Cognitive Neuroscience,True,green,https://www.biorxiv.org/content/biorxiv/early/2017/05/08/135053.full.pdf,https://www.biorxiv.org/content/biorxiv/early/2017/05/08/135053.full.pdf,cc-by,"Abstract How the brain sequentially groups sensory events into temporal chunks and how this process is modulated by attention are fundamental questions in cognitive neuroscience. Sequential grouping includes bottom-up primitive grouping and top-down knowledge-based grouping. In speech perception, grouping acoustic features into syllables can rely on bottom-up acoustic continuity cues but grouping syllables into words critically relies on the listenerโ€™s lexical knowledge. 
This study investigates whether top-down attention is required to apply lexical knowledge to group syllables into words, by concurrently monitoring neural entrainment to syllables and words using electroencephalography (EEG). When attention is directed to a competing speech stream or cross-modally to a silent movie, neural entrainment to syllables is weakened but neural entrainment to words largely diminishes. These results strongly suggest that knowledge-based grouping of syllables into words requires top-down attention and is a bottleneck for the neural processing of unattended speech.",first,Nai Ding,https://openalex.org/A5008847016,https://orcid.org/0000-0003-3428-2723,False,https://openalex.org/W2778438370,https://doi.org/10.1523/jneurosci.2606-17.2017,Attention Is Required for Knowledge-Based Sequential Grouping: Insights from the Integration of Syllables into Words,Attention Is Required for Knowledge-Based Sequential Grouping: Insights from the Integration of Syllables into Words,2017,2017-12-18,en,article,journal-article,Journal of Neuroscience,0270-6474$@$1529-2401,0270-6474,Society for Neuroscience,journal,80,False,3.31,86,EEG and Brain-Computer Interfaces,0.9986,Neuroscience,Cognitive Neuroscience,True,hybrid,https://www.jneurosci.org/content/jneuro/38/5/1178.full.pdf,https://www.jneurosci.org/content/jneuro/38/5/1178.full.pdf,cc-by-nc-sa,"How the brain groups sequential sensory events into chunks is a fundamental question in cognitive neuroscience. This study investigates whether topโ€“down attention or specific tasks are required for the brain to apply lexical knowledge to group syllables into words. Neural responses tracking the syllabic and word rhythms of a rhythmic speech sequence were concurrently monitored using electroencephalography (EEG). The participants performed different tasks, attending to either the rhythmic speech sequence or a distractor, which was another speech stream or a nonlinguistic auditory/visual stimulus. 
Attention to speech, but not a lexical-meaning-related task, was required for reliable neural tracking of words, even when the distractor was a nonlinguistic stimulus presented cross-modally. Neural tracking of syllables, however, was reliably observed in all tested conditions. These results strongly suggest that neural encoding of individual auditory events (i.e., syllables) is automatic, while knowledge-based construction of temporal chunks (i.e., words) crucially relies on topโ€“down attention. SIGNIFICANCE STATEMENT Why we cannot understand speech when not paying attention is an old question in psychology and cognitive neuroscience. Speech processing is a complex process that involves multiple stages, e.g., hearing and analyzing the speech sound, recognizing words, and combining words into phrases and sentences. The current study investigates which speech-processing stage is blocked when we do not listen carefully. We show that the brain can reliably encode syllables, basic units of speech sounds, even when we do not pay attention. Nevertheless, when distracted, the brain cannot group syllables into multisyllabic words, which are basic units for speech meaning. 
Therefore, the process of converting speech sound into meaning crucially relies on attention.",first,Nai Ding,https://openalex.org/A5008847016,https://orcid.org/0000-0003-3428-2723,False +https://openalex.org/W2949817526,https://doi.org/10.1101/037101,Read-Based Phasing of Related Individuals,Read-Based Phasing of Related Individuals,2016,2016-01-18,en,preprint,posted-content,bioRxiv (Cold Spring Harbor Laboratory),,,Cold Spring Harbor Laboratory,repository,7,False,,26,Text Readability and Simplification,0.9512,Computer Science,Artificial Intelligence,True,green,https://www.biorxiv.org/content/biorxiv/early/2016/01/18/037101.full.pdf,https://www.biorxiv.org/content/biorxiv/early/2016/01/18/037101.full.pdf,,"Abstract Motivation Read-based phasing deduces the haplotypes of an individual from sequencing reads that cover multiple variants, while genetic phasing takes only genotypes as input and applies the rules of Mendelian inheritance to infer haplotypes within a pedigree of individuals. Combining both into an approach that uses these two independent sources of information - reads and pedigree - has the potential to deliver results better than each individually. Results We provide a theoretical framework combining read-based phasing with genetic haplotyping, and describe a fixed-parameter algorithm and its implementation for finding an optimal solution. We show that leveraging reads of related individuals jointly in this way yields more phased variants and at a higher accuracy than when phased separately, both in simulated and real data. Coverages as low as 2ร— for each member of a trio yield haplotypes that are as accurate as when analyzed separately at 15ร— coverage per individual. 
Availability https://bitbucket.org/whatshap/whatshap (branch pedmec) Contact t.marschall@mpi-inf.mpg.de",first,Shilpa Garg,https://openalex.org/A5060605357,https://orcid.org/0000-0003-0200-4200,False,https://openalex.org/W2420821447,https://doi.org/10.1093/bioinformatics/btw276,Read-based phasing of related individuals,Read-based phasing of related individuals,2016,2016-06-11,en,article,journal-article,Bioinformatics,1367-4803$@$1367-4811,1367-4803,Oxford University Press,journal,44,False,7.736,28,Text Readability and Simplification,0.9452,Computer Science,Artificial Intelligence,True,hybrid,https://academic.oup.com/bioinformatics/article-pdf/32/12/i234/6695672/btw276.pdf,https://academic.oup.com/bioinformatics/article-pdf/32/12/i234/6695672/btw276.pdf,cc-by-nc,"Read-based phasing deduces the haplotypes of an individual from sequencing reads that cover multiple variants, while genetic phasing takes only genotypes as input and applies the rules of Mendelian inheritance to infer haplotypes within a pedigree of individuals. Combining both into an approach that uses these two independent sources of information-reads and pedigree-has the potential to deliver results better than each individually.We provide a theoretical framework combining read-based phasing with genetic haplotyping, and describe a fixed-parameter algorithm and its implementation for finding an optimal solution. We show that leveraging reads of related individuals jointly in this way yields more phased variants and at a higher accuracy than when phased separately, both in simulated and real data. 
Coverages as low as 2ร— for each member of a trio yield haplotypes that are as accurate as when analyzed separately at 15ร— coverage per individual.https://bitbucket.org/whatshap/whatshapt.marschall@mpi-inf.mpg.de.",first,Shilpa Garg,https://openalex.org/A5060605357,https://orcid.org/0000-0003-0200-4200,False +https://openalex.org/W3163880707,https://doi.org/10.1101/2020.08.10.231720,Cell-specific imputation of drug connectivity mapping with incomplete data,Cell-specific imputation of drug connectivity mapping with incomplete data,2020,2020-08-10,en,preprint,posted-content,bioRxiv (Cold Spring Harbor Laboratory),,,Cold Spring Harbor Laboratory,repository,4,False,,33,Cell Image Analysis Techniques,0.999,"Biochemistry, Genetics and Molecular Biology",Biophysics,True,green,https://doi.org/10.1101/2020.08.10.231720,,cc-by-nc,"ABSTRACT Motivation Drug repositioning allows expedited discovery of new applications for existing compounds, but re-screening vast compound libraries is often prohibitively expensive. โ€œConnectivity mappingโ€ is a process that links drugs to diseases by identifying compounds whose impact on expression in a collection of cells reverses the diseaseโ€™s impact on expression in disease-relevant tissues. The high throughput LINCS project has expanded the universe of compounds and cell types for which data are available, but even with this effort, many potentially clinically useful combinations are missing. To evaluate the possibility of repurposing drugs this way despite missing data, we compared collaborative filtering with either neighborhood-based or SVD imputation methods to two naive approaches via cross-validation. Results Methods were evaluated for their ability to predict drug connectivity despite missing data. Predictions improved when cell type was taken into account. Neighborhood-based collaborative filtering was the most successful method, with the best improvements in non-immortalized primary cells. 
We also explored which classes of compounds are most and least reliant on cell type for accurate imputation, and we identified connections between related compounds even when many were not measured in the relevant cells. We conclude that even for cells in which drug responses have not been fully characterized, it is possible to identify unassayed drugs that reverse in those cells the expression signatures observed in disease. Contact donna.slonim@tufts.edu",first,Diana Sapashnik,https://openalex.org/A5041230053,,False,https://openalex.org/W4321003439,https://doi.org/10.1371/journal.pone.0278289,Cell-specific imputation of drug connectivity mapping with incomplete data,Cell-specific imputation of drug connectivity mapping with incomplete data,2023,2023-02-16,en,article,journal-article,PLoS ONE,1932-6203,1932-6203,Public Library of Science,journal,1,False,0.449,33,Computational Methods in Drug Discovery,0.9992,Computer Science,Computational Theory and Mathematics,True,gold,https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0278289&type=printable,https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0278289&type=printable,public-domain,"Drug repositioning allows expedited discovery of new applications for existing compounds, but re-screening vast compound libraries is often prohibitively expensive. “Connectivity mapping” is a process that links drugs to diseases by identifying compounds whose impact on expression in a collection of cells reverses the disease’s impact on expression in disease-relevant tissues. The LINCS project has expanded the universe of compounds and cells for which data are available, but even with this effort, many clinically useful combinations are missing. To evaluate the possibility of repurposing drugs despite missing data, we compared collaborative filtering using either neighborhood-based or SVD imputation methods to two naive approaches via cross-validation. 
Methods were evaluated for their ability to predict drug connectivity despite missing data. Predictions improved when cell type was taken into account. Neighborhood collaborative filtering was the most successful method, with the best improvements in non-immortalized primary cells. We also explored which classes of compounds are most and least reliant on cell type for accurate imputation. We conclude that even for cells in which drug responses have not been fully characterized, it is possible to identify unassayed drugs that reverse in those cells the expression signatures observed in disease.",first,Diana Sapashnik,https://openalex.org/A5041230053,,True +https://openalex.org/W4323306896,https://doi.org/10.36227/techrxiv.22184071.v1,Embedded Pressure Sensing Metamaterials using TPU-Graphene Composites and Additive Manufacturing,Embedded Pressure Sensing Metamaterials using TPU-Graphene Composites and Additive Manufacturing,2023,2023-03-06,en,preprint,posted-content,,,,,,0,False,,52,Advanced Sensor and Energy Harvesting Materials,0.9998,Engineering,Biomedical Engineering,True,green,https://www.techrxiv.org/doi/pdf/10.36227/techrxiv.22184071.v1,https://www.techrxiv.org/doi/pdf/10.36227/techrxiv.22184071.v1,cc-by,"<p>Nearly 15% of the global population is affected by disabilities impacting mobility. Monitoring foot pressure distribution during gait is a fundamental aspect of evaluating rehabilitation. Wearable systems provide a portable alternative to stationary equipment monitoring gait without laboratory space limitations. However, wearable sensors in some applications present challenges in the calibration, sensitivity, and human-sensor interface, requiring application-specific sensors. This study aimed to develop wearable sensors where the structural and material properties can characterise the sensitivity and range of measurement during the design phase. 
We developed wearable piezoresistive sensors using additive manufacturing to create mechanical metamaterials with embedded pressure-sensing capabilities. The sensors were fabricated in TPU using SLS and graphene ink infusion processes. Three structural designs were developed for different measuring ranges (0 – 50 N, 0 – 100 N, and 0 – 150 N) using body-centred cubic lattices constructed via pyramid unit cells. Two graphene infusion processes were evaluated. We tested the sensors' mechanical and piezoresistive behaviour, measuring the compressive force, strain, and electrical resistance across the sensor. We analysed the influence of structural dimensions and the infusion process on the piezoresistive behaviour. The measuring range was affected mainly by tuneable structural dimensions. The infusion process influenced the piezoresistive sensitivity and affected the linearity response. The results indicate the characterisation of the sensitivity of piezoresistive sensors based on structural parameters and material properties. Mechanical metamaterials could embed pressure sensing in wearables, allowing for customisation based on design parameters using additive manufacture and graphene inks. </p>",first,Inigo Sanz Pena,https://openalex.org/A5041979699,,True,https://openalex.org/W4380303556,https://doi.org/10.1109/jsen.2023.3283460,Embedded Pressure Sensing Metamaterials Using TPU-Graphene Composites and Additive Manufacturing,Embedded Pressure Sensing Metamaterials Using TPU-Graphene Composites and Additive Manufacturing,2023,2023-06-12,en,article,journal-article,IEEE Sensors Journal,1530-437X$@$1558-1748,1530-437X,IEEE Sensors Council,journal,4,False,1.025,54,Wearable Nanogenerator Technology,0.9999,Engineering,Biomedical Engineering,True,green,https://www.techrxiv.org/doi/pdf/10.36227/techrxiv.22184071.v1,,,"Disabilities impacting mobility are a global concern requiring gait rehabilitation, where monitoring foot pressure distribution is fundamental. 
Wearable systems provide an alternative to stationary equipment eliminating space limitations. However, wearable sensors present challenges in the calibration, sensitivity, and human–sensor interface, requiring application-specific sensors. This study aimed to develop wearable sensors, where the structural and material properties can characterize the sensitivity and range of measurement during the design phase. We developed wearable piezoresistive sensors using additive manufacturing (AM) to create mechanical metamaterials with embedded pressure-sensing capabilities. Three structural designs were developed for different measuring ranges (0–50 N, 0–100 N, and 0–150 N) using body-centered cubic (BCC) lattices constructed via pyramid unit cells. In addition, two graphene infusion processes were evaluated. We analyzed the influence of structural dimensions and the graphene infusion process on the piezoresistive response of the sensors. The measuring range was affected mainly by tunable structural dimensions, while the infusion process influenced the piezoresistive sensitivity and the linear response. 
The outcomes in characterizing the piezoresistive sensors based on structural and material properties could allow the development of wearables with embedded pressure sensing with a predictive response solely based on design parameters using AM and graphene inks.",first,Inigo Sanz-Pena,https://openalex.org/A5027816754,https://orcid.org/0000-0002-8282-0648,False +https://openalex.org/W3212535440,https://doi.org/10.1101/2021.11.16.468842,Both ANT and ATPase are essential for mitochondrial permeability transition but not depolarization,Both ANT and ATPase are essential for mitochondrial permeability transition but not depolarization,2021,2021-11-18,en,preprint,posted-content,bioRxiv (Cold Spring Harbor Laboratory),,,Cold Spring Harbor Laboratory,repository,0,False,,28,Mitochondrial Function and Pathology,1.0,"Biochemistry, Genetics and Molecular Biology",Molecular Biology,True,green,https://www.biorxiv.org/content/biorxiv/early/2022/02/03/2021.11.16.468842.full.pdf,https://www.biorxiv.org/content/biorxiv/early/2022/02/03/2021.11.16.468842.full.pdf,cc-by,"Abstract A sudden increase in permeability of the mitochondrial inner membrane, mitochondrial permeability transition (PT), is the central event responsible for cell death and tissue damage in conditions such as stroke and heart attack. PT is caused by the opening of the Cyclosporin A (CSA) dependent calcium-induced pore, the Permeability Transition Pore (PTP). The molecular details of PTP are incompletely understood. We utilized a combination of holographic and fluorescent microscopy to assess the contribution of the ATP synthase and Adenine Nucleotide Translocator (ANT) towards PTP. In cells lacking either ATP synthase or ANT, we observed CSA-sensitive membrane depolarization, but not high-conductance PTP. Further, we found that in wild-type cells calcium induced CSA-sensitive depolarization precedes opening of the PTP, which occurred until after nearly complete mitochondrial membrane depolarization. 
We propose that both ATP synthase and ANT are required for high conductance PTP but not depolarization, which presumably occurs through activation of the low conductance PT, which has a molecular nature that is different from both complexes.",first,Maria Neginskaya,https://openalex.org/A5011920727,https://orcid.org/0000-0001-8490-5218,True,https://openalex.org/W4307726921,https://doi.org/10.1016/j.isci.2022.105447,Both ANT and ATPase are essential for mitochondrial permeability transition but not depolarization,Both ANT and ATPase are essential for mitochondrial permeability transition but not depolarization,2022,2022-10-28,en,article,journal-article,iScience,2589-0042,2589-0042,Cell Press,journal,24,False,3.22,47,ATP Synthase and ATPases Research,1.0,"Biochemistry, Genetics and Molecular Biology",Molecular Biology,True,gold,https://www.cell.com/article/S2589004222017199/pdf,https://www.cell.com/article/S2589004222017199/pdf,cc-by,"An increase in permeability of the mitochondrial inner membrane, mitochondrial permeability transition (PT), is the central event responsible for cell death and tissue damage in conditions such as stroke and heart attack. PT is caused by the cyclosporin A (CSA)-dependent calcium-induced pore, the permeability transition pore (PTP). The molecular details of PTP are incompletely understood. We utilized holographic and fluorescent microscopy to assess the contribution of ATP synthase and adenine nucleotide translocator (ANT) toward PTP. In cells lacking either ATP synthase or ANT, we observed CSA-sensitive membrane depolarization, but not high-conductance PTP. In wild-type cells, calcium-induced CSA-sensitive depolarization preceded opening of PTP, which occurred only after nearly complete mitochondrial membrane depolarization. 
We propose that both ATP synthase and ANT are required for high-conductance PTP but not depolarization, which presumably occurs through activation of the low-conductance PT, which has a molecular nature that is different from both complexes.",first,Maria Neginskaya,https://openalex.org/A5011920727,https://orcid.org/0000-0001-8490-5218,True diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..bdf33fb8dd3129856e26c1c906626b1264c8f5f8 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,84 @@ +aiofiles==23.2.1 +annotated-types==0.7.0 +anyio==4.7.0 +certifi==2024.8.30 +charset-normalizer==3.4.0 +click==8.1.7 +ConfigArgParse==1.7 +contourpy==1.3.0 +cycler==0.12.1 +diffdist==0.1 +exceptiongroup==1.2.2 +fastapi==0.115.6 +ffmpy==0.4.0 +filelock==3.16.1 +fonttools==4.55.3 +fsspec==2024.10.0 +gitdb==4.0.11 +GitPython==3.1.43 +gradio==4.44.1 +gradio_client==1.3.0 +h11==0.14.0 +httpcore==1.0.7 +httpx==0.28.1 +huggingface-hub==0.26.5 +idna==3.10 +importlib_resources==6.4.5 +iniconfig==2.0.0 +Jinja2==3.1.4 +joblib==1.4.2 +kiwisolver==1.4.7 +markdown-it-py==3.0.0 +MarkupSafe==2.1.5 +matplotlib==3.9.3 +mdurl==0.1.2 +mpmath==1.3.0 +networkx==3.2.1 +nltk==3.9.1 +numpy==2.0.2 +orjson==3.10.12 +packaging==24.2 +pandas==2.2.3 +pillow==10.4.0 +pluggy==1.5.0 +pyalex==0.15.1 +pydantic==2.10.3 +pydantic_core==2.27.1 +pydub==0.25.1 +Pygments==2.18.0 +pyparsing==3.2.0 +pytest==8.3.4 +python-dateutil==2.9.0.post0 +python-multipart==0.0.19 +pytz==2024.2 +PyYAML==6.0.2 +regex==2024.11.6 +requests==2.32.3 +rich==13.9.4 +ruff==0.8.2 +scikit-learn==1.5.2 +scipy==1.13.1 +semantic-version==2.10.0 +shellingham==1.5.4 +six==1.16.0 +smmap==5.0.1 +sniffio==1.3.1 +starlette==0.41.3 +strconv==0.4.2 +sympy==1.13.1 +tabulate==0.9.0 +tenacity==9.0.0 +threadpoolctl==3.5.0 +tomli==2.2.1 +tomlkit==0.12.0 +torch==2.5.1 +torchaudio==2.5.1 +torchvision==0.20.1 +tqdm==4.67.1 +typer==0.15.1 +typing_extensions==4.12.2 +tzdata==2024.2 +urllib3==2.2.3 
+uvicorn==0.32.1 +websockets==12.0 +zipp==3.21.0 \ No newline at end of file diff --git a/run_augmenter.py b/run_augmenter.py new file mode 100644 index 0000000000000000000000000000000000000000..dc5f9704fba8b07938d405fe0a6132ff72d078c3 --- /dev/null +++ b/run_augmenter.py @@ -0,0 +1,48 @@ +import pandas as pd +from src.utils.io_utils import PROJECT_ROOT +from src.dataset.GoodDataset import AugmentedDataset +from src.dataset.NegativeSampler import NegativeSampler +from src.utils.struct_utils import * +import os + +class Config: + input = os.path.join(PROJECT_ROOT, "data/positive_samples.pkl") + output = os.path.join(PROJECT_ROOT, "data/negative_samples.pkl") + + seed=42 + + random=True + fuzz_title=True + replace_auth=True + overlap_auth=False + overlap_topic=False + + factor_max=4 + authors_to_consider=1 + overlapping_authors=1 + fuzz_count=1 + +def negative_sampler(optional_path = None, factor = None, type_or_difficulty = None)-> pd.DataFrame: + datapath = optional_path if optional_path else f"{PROJECT_ROOT}/data/crossref-preprint-article-relationships-Aug-2023.csv" + # return pd.read_csv(datapath) + dataset = AugmentedDataset() + # datapath = '../data/pos.csv' + dataset.load_csv(datapath) + + sampler = NegativeSampler(dataset) + config = Config() + sampler.create_negative_samples(config) + + return custom_struct_to_df(dataset.negative_samples) + +def positive_sampler(optional_path=None, size=10, random=True, seed=42, full=False): + datapath = optional_path if optional_path else f"{PROJECT_ROOT}/data/crossref-preprint-article-relationships-Aug-2023.csv" + dataset = AugmentedDataset(datapath) + dataset.fetch_positive_samples_parallel( + num_samples=size, + random=random, + seed=seed, + full=full + ) + + return custom_struct_to_df(dataset.positive_samples) \ No newline at end of file diff --git a/scrap.txt b/scrap.txt new file mode 100644 index 0000000000000000000000000000000000000000..07058d5ce244ce558a4a230829c99ba0ed2332c6 --- /dev/null +++ b/scrap.txt @@ -0,0 
+1,90 @@ +# Template for OpenAlex with all features enabled +alex_template = OpenAlexKeys( + basic={ + "id": True, + "doi": True, + "title": True, + "display_name": True, + "publication_year": True, + "publication_date": True, + "language": True, + "type": True, + "type_crossref": True + }, + source={ + "journal_name": True, + "issn": True, + "issn_l": True, + "publisher": True, + "type": True + }, + authors={ + "position": True, + "name": True, + "id": True, + "orcid": True, + "is_corresponding": True, + "affiliations": True + }, + metrics={ + "cited_by_count": True, + "cited_by_percentile": True, + "is_retracted": True, + "fwci": True, + "referenced_works_count": True + }, + classification={ + "primary_topic": True, + "topics": True, + "concepts": True, + }, + access={ + "is_oa": True, + "oa_status": True, + "oa_url": True, + "pdf_url": True, + "license": True + }, + related_works={ + "references": True, + "referenced_by_count": True, + "related": True + }, + abstract=True +) + +# Template for Elsevier with all features enabled +elsevier_template = ElsevierKeys( + basic={ + "title": True, + "doi": True, + "publication_name": True, + "pub_type": True, + "publication_date": True + }, + biblio={ + "volume": True, + "issue": True, + "pages": True, + "issn": True + }, + authors={ + "given_name": True, + "surname": True, + "affiliations": True, + "is_corresponding": True + }, + abstract=True, + subject_areas=True, + metrics={ + "citation_count": True, + "source_citations": True + }, + funding=True +) + +# Create full configuration +full_config = ConfigAugmentation( + alex=alex_template, + elsevier=elsevier_template +) \ No newline at end of file diff --git a/src/.DS_Store b/src/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..65bb6986ef73ab9d2972386b5c0f7d340a462993 Binary files /dev/null and b/src/.DS_Store differ diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/__pycache__/__init__.cpython-311.pyc b/src/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..48dcd55f5534a1494b5caa45dcd06c7cf63ef9d3 Binary files /dev/null and b/src/__pycache__/__init__.cpython-311.pyc differ diff --git a/src/__pycache__/__init__.cpython-312.pyc b/src/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6f3714593e24f999995e2d19a3222fa1fb70f5de Binary files /dev/null and b/src/__pycache__/__init__.cpython-312.pyc differ diff --git a/src/__pycache__/__init__.cpython-313.pyc b/src/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..12142330995edb2d219006864f70ef2980d02d7b Binary files /dev/null and b/src/__pycache__/__init__.cpython-313.pyc differ diff --git a/src/dataset/DataAugmenter.py b/src/dataset/DataAugmenter.py new file mode 100644 index 0000000000000000000000000000000000000000..99bbbe4c02db354ad2e412e30add35b06a1d79e4 --- /dev/null +++ b/src/dataset/DataAugmenter.py @@ -0,0 +1,344 @@ +from enum import Enum +from typing import List, Dict, Any +from dataclasses import dataclass + +import os +import yaml + +import pyalex +from pyalex import Works +from src.utils.io_utils import PROJECT_ROOT + + +@dataclass +class ConfigAugmentation: + """Configuration for OpenAlex features""" + basic: Dict[str, bool] = None # id, doi, title, etc + source: Dict[str, bool] = None # journal info + authors: Dict[str, bool] = None # author details + metrics: Dict[str, bool] = None # citations, fwci, etc + classification: Dict[str, bool] = None # topics, concepts + access: Dict[str, bool] = None # OA status + related_works: Dict[str, bool] = None # references + abstract: bool = False + +class DatasetType(Enum): + FULL_RAW = "full_raw" + PARTIAL_RAW = "partial_raw" + FULL_AUGMENTED = "full_augmented" + 
PARTIAL_AUGMENTED = "partial_augmented" + + +@dataclass +class Field: + """Field configuration for data extraction""" + name: str + path: List[str] + default: Any = None + +class AlexFields: + """OpenAlex field definitions""" + + BASIC = [ + Field("id", ["id"]), + Field("doi", ["doi"]), + Field("title", ["title"]), + Field("display_name", ["display_name"]), + Field("publication_year", ["publication_year"]), + Field("publication_date", ["publication_date"]), + Field("language", ["language"]), + Field("type", ["type"]), + Field("type_crossref", ["type_crossref"]) + ] + + SOURCE = [ + Field("journal_name", ["primary_location", "source", "display_name"]), + Field("issn", ["primary_location", "source", "issn"]), + Field("issn_l", ["primary_location", "source", "issn_l"]), + Field("publisher", ["primary_location", "source", "host_organization_name"]), + Field("type", ["primary_location", "source", "type"]) + ] + + METRICS = [ + Field("cited_by_count", ["cited_by_count"]), + Field("cited_by_percentile", ["citation_normalized_percentile"]), + Field("is_retracted", ["is_retracted"]), + Field("fwci", ["fwci"]), + Field("referenced_works_count", ["referenced_works_count"]) + ] + + ACCESS = [ + Field("is_oa", ["open_access", "is_oa"]), + Field("oa_status", ["open_access", "oa_status"]), + Field("oa_url", ["open_access", "oa_url"]), + Field("pdf_url", ["primary_location", "pdf_url"]), + Field("license", ["primary_location", "license"]) + ] + +def get_nested_value(data: Dict, path: List[str], default: Any = None) -> Any: + """Extract nested value from dictionary using path""" + value = data + for key in path: + try: + value = value[key] + except (KeyError, TypeError): + return default + return value + +class DataAugmenter: + """Class for augmenting data with OpenAlex features""" + + def __init__(self): + """Initialize augmenter with API credentials""" + self.profile = self._load_profile() + self.email = self.profile["email"] + self.filters = ConfigAugmentation( + basic={ + "id": 
True, + "doi": True, + "title": True, + "display_name": True, + "publication_year": True, + "publication_date": True, + "language": True, + "type": True, + "type_crossref": True + }, + source={ + "journal_name": True, + "issn": True, + "issn_l": True, + "publisher": True, + "type": True + }, + authors={ + "position": True, + "name": True, + "id": True, + "orcid": True, + "is_corresponding": True, + "affiliations": False + }, + metrics={ + "cited_by_count": True, + "cited_by_percentile": False, + "is_retracted": True, + "fwci": True, + "referenced_works_count": True + }, + classification={ + "primary_topic": True, + "topics": False, + "concepts": False, + }, + access={ + "is_oa": True, + "oa_status": True, + "oa_url": True, + "pdf_url": True, + "license": True + }, + related_works={ + "references": True, + "referenced_by_count": True, + "related": True + }, + abstract=True + ) + + pyalex.config.email = self.email + + def _load_profile(self) -> Dict[str, str]: + """Load API credentials from profile""" + profile_path = f"{PROJECT_ROOT}/user_information/profile.yaml" + + assert str(PROJECT_ROOT).split("/")[-1] == "MatchingPubs", "Please run this script in the github repo folder " + assert os.path.exists(profile_path), "create a profile.yaml with your email (email:) and your api key (api_key:). 
Go here to get one https://dev.elsevier.com/" + + + with open(profile_path, "r") as f: + profile = yaml.safe_load(f) + + return { + "email": profile["email"] + } + + def get_alex_features(self, doi: str) -> Dict: + """Extract all OpenAlex features for a DOI""" + try: + work = Works()[f"https://doi.org/{doi}"] + result = {} + + # Basic metadata + result["basic"] = { + field.name: get_nested_value(work, field.path, None) + for field in AlexFields.BASIC + } + + # Source/journal info + result["source"] = { + field.name: get_nested_value(work, field.path, None) + for field in AlexFields.SOURCE + } + + # Authors with affiliations + try: + result["authors"] = [ + { + "position": auth.get("author_position", None), + "name": auth.get("author", {}).get("display_name", None), + "id": auth.get("author", {}).get("id", None), + "orcid": auth.get("author", {}).get("orcid", None), + "is_corresponding": auth.get("is_corresponding", None), + "affiliations": [ + { + "name": inst.get("display_name", None), + "id": inst.get("id", None), + "country": inst.get("country_code", None), + "type": inst.get("type", None), + "ror": inst.get("ror", None) + } + for inst in auth.get("institutions", []) + ] + } + for auth in work.get("authorships", []) + ] + except: + result["authors"] = None + + # Topics and classifications + try: + result["classification"] = { + "primary_topic": { + "name": work.get("primary_topic", {}).get("display_name", None), + "score": work.get("primary_topic", {}).get("score", None), + "field": work.get("primary_topic", {}).get("field", {}).get("display_name", None), + "subfield": work.get("primary_topic", {}).get("subfield", {}).get("display_name", None) + }, + "topics": [ + { + "name": topic.get("display_name", None), + "score": topic.get("score", None), + "field": topic.get("field", {}).get("display_name", None) + } + for topic in work.get("topics", []) + ], + "concepts": [ + { + "name": concept.get("display_name", None), + "level": concept.get("level", None), + "score": 
concept.get("score", None), + "wikidata": concept.get("wikidata", None) + } + for concept in work.get("concepts", []) + ] + } + except: + result["classification"] = None + + # Metrics + result["metrics"] = { + field.name: get_nested_value(work, field.path, None) + for field in AlexFields.METRICS + } + + # Access info + result["access"] = { + field.name: get_nested_value(work, field.path, None) + for field in AlexFields.ACCESS + } + + # Abstract + try: + if "abstract_inverted_index" in work: + abstract_dict = work["abstract_inverted_index"] + if abstract_dict: + max_pos = max(max(positions) for positions in abstract_dict.values()) + words = [""] * (max_pos + 1) + for word, positions in abstract_dict.items(): + for pos in positions: + words[pos] = word + result["abstract"] = " ".join(words) + else: + result["abstract"] = None + else: + result["abstract"] = None + except: + result["abstract"] = None + + return result + + except Exception as e: + print(f"OpenAlex error for DOI {doi}: {e}") + return {} + + def filter_augmented_data(self, data: Dict[str, Any], config: ConfigAugmentation = None) -> Dict[str, Any]: + """Filter data based on configuration + + Args: + data: Dictionary containing raw data + config: Configuration specifying which features to include + + Returns: + Filtered dictionary containing only the configured features + """ + config = config or self.filters + + def filter_section(section_data: Dict[str, Any], section_config: Dict[str, bool]) -> Dict[str, Any]: + """Filter a section of the data based on the section configuration""" + return {k: v for k, v in section_data.items() if k in section_config and section_config[k]} + + filtered_data = {} + + # Filter OpenAlex data + alex_filtered = {} + + # Basic metadata + if config.basic: + alex_filtered["basic"] = filter_section(data.get("basic", {}), config.basic) + + # Source/journal info + if config.source: + alex_filtered["source"] = filter_section(data.get("source", {}), config.source) + + # Authors + if 
config.authors: + authors_data = data.get("authors", []) + filtered_authors = [] + for author in authors_data: + filtered_author = filter_section(author, config.authors) + if config.authors.get("affiliations", False): + print(author.get("affiliations", [])) + filtered_author["affiliations"] = [ + filter_section(aff, config.authors["affiliations"]) + for aff in author.get("affiliations", []) + ] + filtered_authors.append(filtered_author) + alex_filtered["authors"] = filtered_authors + + # Metrics + if config.metrics: + alex_filtered["metrics"] = filter_section(data.get("metrics", {}), config.metrics) + + # Classification + if config.classification: + classification_data = data.get("classification", {}) + alex_filtered["classification"] = { + k: v for k, v in classification_data.items() if k in config.classification and config.classification[k] + } + + # Access info + if config.access: + alex_filtered["access"] = filter_section(data.get("access", {}), config.access) + + # Related works + if config.related_works: + alex_filtered["related_works"] = filter_section(data.get("related_works", {}), config.related_works) + + # Abstract + if config.abstract and "abstract" in data: + alex_filtered["abstract"] = data["abstract"] + + filtered_data = alex_filtered + + return filtered_data \ No newline at end of file diff --git a/src/dataset/Dataset.py b/src/dataset/Dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..b2875a6db4f76539c6a179d50243d5183e89699e --- /dev/null +++ b/src/dataset/Dataset.py @@ -0,0 +1,146 @@ +from src.dataset.DataAugmenter import * +import pandas as pd +from tqdm import tqdm +import numpy as np + +class FullAugmentedDataset: + + def __init__(self): + self.augmenter = DataAugmenter() + self.full_raw_dataset = self._load_the_dataset() + + def _load_the_dataset(self, type: DatasetType = DatasetType.FULL_RAW) -> pd.DataFrame: + """Load as csv file one of the datasets for training.""" + assert str(PROJECT_ROOT).split("/")[-1] == 
"MatchingPubs", "Please run this script in the github repo folder " + + if type == DatasetType.FULL_RAW: + return pd.read_csv(f"{PROJECT_ROOT}/data/crossref-preprint-article-relationships-Aug-2023.csv") + + def retrieve_dois_couple(self, len: int = 1, random: bool = False, seed: bool = None, full: bool = False): + """Retrieve two DOIs from the dataset""" + if random: + dois = self.full_raw_dataset.sample(n=len, random_state=seed)[["preprint_doi", "article_doi"]] + else: + dois = self.full_raw_dataset.head(len)[["preprint_doi", "article_doi"]] + if full: + dois = self.full_raw_dataset[["preprint_doi", "article_doi"]] + return dois.to_numpy() + + @staticmethod + def _flatten_list(lst): + """ + Flattens a nested list into a single list. If the input is not nested, it returns the original list. + Handles cases where some elements are lists and others are not. + """ + if not isinstance(lst, list): # Ensure the input is a list + raise ValueError("Input must be a list") + + def _flatten(sublist): + for item in sublist: + if isinstance(item, list): # Check if the item is a list + yield from _flatten(item) # Recursively flatten the list + else: + yield item # Yield the non-list item + + return list(_flatten(lst)) + + def _augmented_data_to_row(self, filtered_data: Dict[str, Any], preprint: bool = True) -> pd.Series: + """Transform filtered augmented data into a pandas Series + + Args: + filtered_data: Dictionary containing filtered OpenAlex and Elsevier data + preprint: If True, use prpnt_ prefix, else use article_ prefix + + Returns: + pd.Series: Flattened data as a single row + """ + + additional_part = FullAugmentedDataset.filter_author(filtered_data.get("authors",{})) + # modify the key of additional part by adding authors_ at the beginning + additional_part = {f"authors_{k}": v for k, v in additional_part.items()} + # remove authos key from filtreed_info + filtered_data.pop("authors") + # append the additional part to the filtered_info + 
filtered_data.update(additional_part) + final_dictionary = FullAugmentedDataset.flatten_dict(filtered_data, preprint=preprint) + + for k, v in final_dictionary.items(): + final_dictionary[k] = "$@$".join(map(str, FullAugmentedDataset._flatten_list(v))) if isinstance(v, list) else [v] + + return pd.DataFrame(final_dictionary) + + @staticmethod + def filter_author(authors_info : list) -> dict: + + try: + relevant_keys = authors_info[0].keys() + new_dict = {} + for key in relevant_keys: + new_dict[key] = [author[key] for author in authors_info] + return new_dict + except: + return {} + + @staticmethod + def flatten_dict(d: dict, parent_key: str = '', sep: str = '_', preprint = True) -> dict: + """Flatten a nested dictionary. + + Args: + d (dict): The dictionary to flatten. + parent_key (str): The base key string to use for the flattened keys. + sep (str): The separator to use between parent and child keys. + + Returns: + dict: The flattened dictionary. + """ + addition = "prpnt_" if preprint else "article_" + def _flatten_dict(d: dict, parent_key: str = '', sep: str = '_') -> dict: + items = [] + for k, v in d.items(): + new_key = f"{parent_key}{sep}{k}" if parent_key else k + if isinstance(v, dict): + items.extend(_flatten_dict(v, new_key, sep=sep).items()) + else: + items.append((new_key, v)) + return dict(items) + return {f"{addition}{k}": v for k, v in _flatten_dict(d, parent_key, sep).items()} + + def process_pair(self, dois) -> pd.DataFrame: + """Process a pair of DOIs and return combined rows as a DataFrame""" + assert len(dois) > 0 + rows = [] + for preprint_doi, article_doi in tqdm(dois): + # Get preprint features + preprint_features = self.augmenter.get_alex_features(preprint_doi) # augment with all the features + preprint_filtered = self.augmenter.filter_augmented_data(preprint_features) # filter the relevant features + preprint_row = self._augmented_data_to_row(preprint_filtered, True) + + # Get article features + article_features = 
self.augmenter.get_alex_features(article_doi) # augment with all the features + article_filtered = self.augmenter.filter_augmented_data(article_features) + article_row = self._augmented_data_to_row(article_filtered, False) + + rows.append([preprint_row, article_row]) + + return rows + + @staticmethod + def transform_array(input_array, factor): + output_list = [] + + for i, row in enumerate(input_array): + other_indices = np.array([j for j in range(len(input_array)) if j != i]) + sampled_indices = np.random.choice(other_indices, size=factor, replace=False) + sampled_rows = [input_array[j] for j in sampled_indices] + + output_list.append(pd.concat([row[0], row[1], pd.DataFrame(data=[1], columns=['label'])], axis=1)) + for B in sampled_rows: + output_list.append(pd.concat([row[0], B[1], pd.DataFrame(data=[0], columns=['label'])], axis=1)) + + return pd.concat(output_list).reset_index(drop=True) + + def get_full_dataset(self, len: int = 1, random: bool = True, seed: int = 42, full: bool = True) -> pd.DataFrame: + """Process all DOI pairs and return full dataset""" + dois = self.retrieve_dois_couple(len, random, seed, full) + self.augmented_df = FullAugmentedDataset.transform_array(self.process_pair(dois), factor=4) + return self.augmented_df \ No newline at end of file diff --git a/src/dataset/GoodDataAugmenter.py b/src/dataset/GoodDataAugmenter.py new file mode 100644 index 0000000000000000000000000000000000000000..5c5ca5e7a53a55741a7041cc59b1307c18b9c99c --- /dev/null +++ b/src/dataset/GoodDataAugmenter.py @@ -0,0 +1,361 @@ +from enum import Enum +from typing import List, Dict, Any +from dataclasses import dataclass + +import os +import yaml + +import pyalex +from pyalex import Works +from src.utils.io_utils import PROJECT_ROOT + +import time +from requests.exceptions import RequestException +from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type, wait_fixed + + + +@dataclass +class ConfigAugmentation: + """Configuration for 
def get_nested_value(data: Dict, path: List[str], default: Any = None) -> Any:
    """Walk ``path`` through nested containers and return the value found.

    Args:
        data: Root container (typically a dict).
        path: Successive keys to follow. Integer entries may be used to index
            into lists met along the way.
        default: Value returned when the path cannot be followed.

    Returns:
        The value at the end of ``path``, or ``default`` when a key is
        missing, a list index is out of range, or an intermediate value
        cannot be subscripted (e.g. it is ``None``). An empty ``path``
        returns ``data`` itself.
    """
    value = data
    for key in path:
        try:
            value = value[key]
        # KeyError: missing dict key; IndexError: list too short;
        # TypeError: intermediate value is None / not subscriptable.
        except (KeyError, IndexError, TypeError):
            return default
    return value
Go here to get one https://dev.elsevier.com/" + + + with open(profile_path, "r") as f: + profile = yaml.safe_load(f) + + return { + "email": profile["email"] + } + + @retry( + stop=stop_after_attempt(5), # Retry up to 5 times + wait=wait_exponential(multiplier=1, min=1, max=60), # Exponential backoff, + # wait=wait_fixed(.2), + retry=retry_if_exception_type(RequestException) + ) + def get_alex_features(self, doi: str) -> Dict: + """Extract all OpenAlex features for a DOI""" + try: + work = Works()[f"https://doi.org/{doi}"] + result = {} + + # Basic metadata + result["basic"] = { + field.name: get_nested_value(work, field.path, None) + for field in AlexFields.BASIC + } + + # Source/journal info + result["source"] = { + field.name: get_nested_value(work, field.path, None) + for field in AlexFields.SOURCE + } + + # Authors with affiliations + try: + result["authors"] = [ + { + "position": auth.get("author_position", None), + "name": auth.get("author", {}).get("display_name", None), + "id": auth.get("author", {}).get("id", None), + "orcid": auth.get("author", {}).get("orcid", None), + "is_corresponding": auth.get("is_corresponding", None), + "affiliations": [ + { + "name": inst.get("display_name", None), + "id": inst.get("id", None), + "country": inst.get("country_code", None), + "type": inst.get("type", None), + "ror": inst.get("ror", None) + } + for inst in auth.get("institutions", []) + ] + } + for auth in work.get("authorships", []) + ] + except: + result["authors"] = None + + # Topics and classifications + try: + result["classification"] = { + "primary_topic": { + "name": work.get("primary_topic", {}).get("display_name", None), + "score": work.get("primary_topic", {}).get("score", None), + "field": work.get("primary_topic", {}).get("field", {}).get("display_name", None), + "subfield": work.get("primary_topic", {}).get("subfield", {}).get("display_name", None) + }, + "topics": [ + { + "name": topic.get("display_name", None), + "score": topic.get("score", None), + 
def filter_augmented_data(self, data: Dict[str, Any], config: "ConfigAugmentation" = None) -> Dict[str, Any]:
    """Keep only the configured features of an augmented record.

    Args:
        data: Raw record with optional sections ``basic``, ``source``,
            ``authors``, ``metrics``, ``classification``, ``access``,
            ``related_works`` and ``abstract``.
        config: Object exposing one attribute per section. Dict-valued
            attributes map field names to booleans; ``abstract`` is a plain
            boolean. Falls back to ``self.filters`` when omitted.
            ``config.authors["affiliations"]`` may be a boolean (keep or drop
            whole affiliation records) or a dict of per-field booleans.

    Returns:
        Dict containing, for every enabled section, only the enabled fields.
        Sections whose data is missing or not a dict yield ``{}``; a missing
        or ``None`` author list yields ``[]``.
    """
    config = config or self.filters
    data = data or {}

    def filter_section(section_data: Any, section_config: Any) -> Dict[str, Any]:
        """Return the entries of ``section_data`` whose key is enabled."""
        if not isinstance(section_data, dict) or not isinstance(section_config, dict):
            return {}
        return {k: v for k, v in section_data.items() if section_config.get(k)}

    alex_filtered = {}

    # Basic metadata
    if config.basic:
        alex_filtered["basic"] = filter_section(data.get("basic", {}), config.basic)

    # Source/journal info
    if config.source:
        alex_filtered["source"] = filter_section(data.get("source", {}), config.source)

    # Authors
    if config.authors:
        affiliation_config = config.authors.get("affiliations", False)
        filtered_authors = []
        # The "authors" entry may be None; treat that like an empty list.
        for author in data.get("authors") or []:
            filtered_author = filter_section(author, config.authors)
            if affiliation_config and isinstance(author, dict):
                affiliations = author.get("affiliations") or []
                if isinstance(affiliation_config, dict):
                    # Per-field selection inside each affiliation record.
                    filtered_author["affiliations"] = [
                        filter_section(aff, affiliation_config) for aff in affiliations
                    ]
                else:
                    # Plain ``True``: keep affiliation records untouched.
                    filtered_author["affiliations"] = list(affiliations)
            filtered_authors.append(filtered_author)
        alex_filtered["authors"] = filtered_authors

    # Metrics
    if config.metrics:
        alex_filtered["metrics"] = filter_section(data.get("metrics", {}), config.metrics)

    # Classification
    if config.classification:
        alex_filtered["classification"] = filter_section(
            data.get("classification", {}), config.classification
        )

    # Access info
    if config.access:
        alex_filtered["access"] = filter_section(data.get("access", {}), config.access)

    # Related works
    if config.related_works:
        alex_filtered["related_works"] = filter_section(data.get("related_works", {}), config.related_works)

    # Abstract
    if config.abstract and "abstract" in data:
        alex_filtered["abstract"] = data["abstract"]

    return alex_filtered
def sample_dois_pairs(
    self,
    num_samples: int = 1,
    random: bool = False,
    seed: int = None,
    full: bool = False
) -> np.ndarray:
    """
    Sample DOI pairs from the dataset.

    Args:
        num_samples (int): Number of DOI pairs to sample; capped at the
            number of rows in ``self.full_raw_dataset``.
        random (bool): If True, sample randomly; otherwise, use the top rows.
        seed (int): Random seed for reproducibility (used if random=True).
            ``None`` or a negative value means "unseeded".
        full (bool): If True, return all DOI pairs without sampling.

    Returns:
        np.ndarray: Array of shape (k, 2) holding the ``preprint_doi`` and
        ``article_doi`` columns of the selected rows.
    """
    # The default seed is None, which must not be compared with 0.
    if seed is not None and seed < 0:
        seed = None
    num_samples = min(num_samples, len(self.full_raw_dataset))

    doi_columns = ["preprint_doi", "article_doi"]
    if full:
        sampled_data = self.full_raw_dataset[doi_columns]
    elif random:
        sampled_data = self.full_raw_dataset.sample(n=num_samples, random_state=seed)[doi_columns]
    else:
        sampled_data = self.full_raw_dataset.iloc[:num_samples][doi_columns]

    return sampled_data.to_numpy()
def process_pairs_parallel(self, dois: np.ndarray, max_workers: int = 4) -> List[List[pd.DataFrame]]:
    """
    Process pairs of DOIs in parallel and return combined rows as a list of DataFrame pairs.

    Each pair is fetched through ``self.augmenter``, filtered, and converted
    with ``self._augmented_data_to_row``. Pairs that fail are reported on
    stdout and left out of the result; successful pairs are returned in the
    same order as they appear in ``dois``.

    Args:
        dois (np.ndarray): Array of DOI pairs.
        max_workers (int): Number of threads to use for parallel processing.

    Returns:
        List[List[pd.DataFrame]]: List of preprint and article DataFrame pairs.
    """
    assert len(dois) > 0, "DOI pairs cannot be empty."

    def process_single_pair(preprint_doi: str, article_doi: str) -> List[pd.DataFrame]:
        """Return ``[preprint_row, article_row]`` for one pair, or ``[]`` on failure."""
        try:
            # Preprint features
            preprint_features = self.augmenter.get_alex_features(preprint_doi)
            preprint_filtered = self.augmenter.filter_augmented_data(preprint_features)
            preprint_row = self._augmented_data_to_row(preprint_filtered, True)

            # Article features
            article_features = self.augmenter.get_alex_features(article_doi)
            article_filtered = self.augmenter.filter_augmented_data(article_features)
            article_row = self._augmented_data_to_row(article_filtered, False)

            return [preprint_row, article_row]
        except Exception as e:
            # Best effort: one bad DOI must not abort the batch, but the
            # reason is reported instead of being discarded.
            print(f"Error processing pair ({preprint_doi}, {article_doi}): {e!r}")
            return []

    # One slot per input pair so completion order does not shuffle the output.
    ordered_results = [None] * len(dois)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(process_single_pair, preprint_doi, article_doi): position
            for position, (preprint_doi, article_doi) in enumerate(dois)
        }

        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing DOI pairs in parallel"):
            position = futures[future]
            try:
                ordered_results[position] = future.result()
            except Exception as e:
                preprint_doi, article_doi = dois[position]
                print(f"Error with DOI pair ({preprint_doi}, {article_doi}): {e!r}")

    # Drop failed pairs (None or empty list).
    return [result for result in ordered_results if result]
def copy_column_value(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    source_col: str,
    target_col: str,
    source_transform: Callable[[Any], Any] = lambda x: x
) -> List[pd.DataFrame]:
    """
    Copies the first-row value of `source_col` in `df1` into the first row of
    `target_col` in a deep copy of `df2`; neither input is modified.

    Args:
        df1 (pd.DataFrame): The source DataFrame containing the value to copy.
        df2 (pd.DataFrame): The target DataFrame where the value will be copied.
        source_col (str): The column name in `df1` from which the value will be sourced.
        target_col (str): The column name in `df2` where the value will be written.
        source_transform (Callable): Applied to the source value before it is
            written. Defaults to the identity.

    Returns:
        List[pd.DataFrame]: A list containing the original `df1` and the modified copy of `df2`.
    """
    # Work on a deep copy so the caller's `df2` stays untouched.
    df2_copy = df2.copy(deep=True)

    # First row by position, read from the column to keep its dtype.
    value_to_copy = df1[source_col].iloc[0]

    # `.at` is label-based: address the first row by its actual label rather
    # than assuming the index starts at 0 (which would append a new row).
    first_label = df2_copy.index[0] if len(df2_copy.index) else 0
    df2_copy.at[first_label, target_col] = source_transform(value_to_copy)

    return [df1, df2_copy]
def sample_random(
    self,
    preprint_index: int,
    factor_max: int = 4,
    random_state: int = -1,
    custom_samples: List[List[pd.DataFrame]] = None
) -> List[List[pd.DataFrame]]:
    """
    Randomly samples non-matching articles to create negative samples.

    :param preprint_index: Index of the positive pair whose preprint is used.
        That index is excluded from sampling. A negative value excludes
        nothing and makes the method return the sampled pairs unchanged.
    :param factor_max: Maximum number of negatives to draw; capped at the
        number of candidate pairs.
    :param random_state: Seed passed to ``np.random.seed`` when >= 0.
    :param custom_samples: Pool to sample from; defaults to
        ``self.positive_samples``.
    :return: ``[preprint, non_matching_article]`` pairs, or the raw sampled
        pairs when ``preprint_index`` is negative.
    :raises AssertionError: If there is no candidate pair to sample from.
    """
    if random_state >= 0:
        np.random.seed(random_state)

    positive_samples = custom_samples if custom_samples is not None else self.positive_samples

    # Candidates are every pair except the one owning the preprint.
    other_indices = np.array([j for j in range(len(positive_samples)) if j != preprint_index], dtype=int)

    # Size the draw from the candidate pool (not the whole dataset):
    # sampling without replacement cannot return more than it contains.
    factor = min(len(other_indices), factor_max)
    assert factor >= 1, "Dataset doesn't contain enough samples"

    sampled_indices = np.random.choice(other_indices, size=factor, replace=False)
    sampled_rows = [positive_samples[j] for j in sampled_indices]

    if preprint_index < 0:
        return sampled_rows

    # Pair the original preprint with each sampled, non-matching article.
    preprint, _ = positive_samples[preprint_index]
    return [
        [preprint, non_matching_article]
        for _, non_matching_article in sampled_rows
    ]
def sample_authors_overlap(
    self,
    preprint_index: int,
    factor_max: int = 4,
    random_state: int = -1,
    authors_to_consider: int = 1,
    overlapping_authors: int = 1
) -> List[List[pd.DataFrame]]:
    """
    Samples published articles with some author overlap to create negative samples.

    :param preprint_index: Index (in ``self.positive_samples``) of the pair
        whose preprint receives the negatives; that pair is excluded.
    :param factor_max: Maximum number of negatives to return.
    :param random_state: Seed forwarded to ``sample_random``.
    :param authors_to_consider: How many leading article authors are compared.
    :param overlapping_authors: Minimum number of shared author ids required.
    :return: ``[preprint, article]`` pairs whose article shares authors with
        the preprint.
    :raises AssertionError: Propagated from ``sample_random`` when no article
        satisfies the overlap condition.
    """
    def extract_authors(authors_str, authors_to_keep: int = -1) -> list:
        """
        Splits a `$@$`-separated author string into a list of author ids.

        Args:
            authors_str: Cell value holding the ids. Anything that is not a
                string (e.g. None/NaN for a missing value) yields no authors.
            authors_to_keep (int, optional): Number of leading authors to
                keep. -1 keeps all of them. Defaults to -1.

        Returns:
            list: The non-empty author ids, truncated to `authors_to_keep`.
        """
        if not isinstance(authors_str, str):
            return []

        # Drop empty tokens so two records with blank author fields are not
        # considered to share the author "".
        authors_list = [author for author in authors_str.split("$@$") if author]

        if authors_to_keep == -1:
            return authors_list
        return authors_list[:authors_to_keep]

    suffix = "authors_id"
    positive_preprint, _ = self.positive_samples[preprint_index]
    preprint_authors = set(extract_authors(positive_preprint[f"prpnt_{suffix}"].iloc[0]))

    def confirm_overlap(article_authors) -> bool:
        """True when enough of the considered article authors wrote the preprint."""
        considered = set(extract_authors(article_authors, authors_to_consider))
        return len(preprint_authors & considered) >= overlapping_authors

    # Collect preprint-article pairs where the article has some overlapping authors with the selected preprint.
    # Exclude the pair matching the selected preprint to ensure proper functionality of random sampling later.
    custom_samples = keep_on_condition(
        self.positive_samples,
        suffix,
        preprint_index,
        confirm_overlap
    )

    # The excluded pair is already gone from `custom_samples`, so -1 is
    # passed as the index: sample_random then excludes nothing further and
    # returns the sampled pairs as-is.
    return [
        [positive_preprint, article]
        for _, article in self.sample_random(-1, factor_max, random_state, custom_samples)
    ]
+ """ + negative_samples = [] + for preprint_index in tqdm(range(len(self.positive_samples)), desc="Negative Sampling"): + negatives = [] + if config.overlap_auth and not config.overlap_topic: + negatives = self.sample_authors_overlap( + preprint_index, factor_max=config.factor_max, + random_state=config.seed, + authors_to_consider=config.authors_to_consider, + overlapping_authors=config.overlapping_authors + ) + elif config.overlap_topic and not config.overlap_auth: + negatives = self.sample_similar_topic(preprint_index, factor_max=config.factor_max, random_state=config.seed) + elif config.random: + negatives = self.sample_random(preprint_index, factor_max=config.factor_max, random_state=config.seed) + else: + continue + + if config.fuzz_title: + negatives = self.fuzz_title(custom_samples=negatives) + + if config.replace_auth: + negatives = self.sample_authors_overlap_random(negatives) + + negative_samples.extend(negatives) + + self.dataset.negative_samples = negative_samples \ No newline at end of file diff --git a/src/dataset/__init__.py b/src/dataset/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..634d85faae36896beecf5f3c3703d88711c80181 --- /dev/null +++ b/src/dataset/__init__.py @@ -0,0 +1,2 @@ +from .DataAugmenter import * +from .Dataset import * \ No newline at end of file diff --git a/src/dataset/__pycache__/DataAugmenter.cpython-311.pyc b/src/dataset/__pycache__/DataAugmenter.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8d431e090705ce0f7f9374961541a3e0f2c48e0d Binary files /dev/null and b/src/dataset/__pycache__/DataAugmenter.cpython-311.pyc differ diff --git a/src/dataset/__pycache__/DataAugmenter.cpython-312.pyc b/src/dataset/__pycache__/DataAugmenter.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..19499a85133ee0fd7be280940a11176800928daa Binary files /dev/null and b/src/dataset/__pycache__/DataAugmenter.cpython-312.pyc differ diff --git 
a/src/dataset/__pycache__/DataAugmenter.cpython-313.pyc b/src/dataset/__pycache__/DataAugmenter.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fad83068065c4831e8c83ef438eaccef8d7801de Binary files /dev/null and b/src/dataset/__pycache__/DataAugmenter.cpython-313.pyc differ diff --git a/src/dataset/__pycache__/Dataset.cpython-312.pyc b/src/dataset/__pycache__/Dataset.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..37b6b3267d4c72ed7d19f3bdb23a77e652ec441a Binary files /dev/null and b/src/dataset/__pycache__/Dataset.cpython-312.pyc differ diff --git a/src/dataset/__pycache__/Dataset.cpython-313.pyc b/src/dataset/__pycache__/Dataset.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..57d127cc04f8dd87d1c99a8994a75b1bdc5b3a38 Binary files /dev/null and b/src/dataset/__pycache__/Dataset.cpython-313.pyc differ diff --git a/src/dataset/__pycache__/GoodDataAugmenter.cpython-313.pyc b/src/dataset/__pycache__/GoodDataAugmenter.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7bd4a713ef9fcba19fb466646fbe670ca645160f Binary files /dev/null and b/src/dataset/__pycache__/GoodDataAugmenter.cpython-313.pyc differ diff --git a/src/dataset/__pycache__/GoodDataset.cpython-313.pyc b/src/dataset/__pycache__/GoodDataset.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f2c7d2da202ed430113353cd8055deaca7f0f4a3 Binary files /dev/null and b/src/dataset/__pycache__/GoodDataset.cpython-313.pyc differ diff --git a/src/dataset/__pycache__/NegativeSampler.cpython-313.pyc b/src/dataset/__pycache__/NegativeSampler.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f3942f0e2e320adf0404e3f01af556a9b4579023 Binary files /dev/null and b/src/dataset/__pycache__/NegativeSampler.cpython-313.pyc differ diff --git a/src/dataset/__pycache__/__init__.cpython-311.pyc 
"""Generate the augmented dataset and save it as a CSV file (CLI entry point)."""
import argparse
import os

# Accepted spellings for boolean command-line values (compared case-insensitively).
_TRUE_STRINGS = frozenset({"true", "t", "yes", "y", "1"})
_FALSE_STRINGS = frozenset({"false", "f", "no", "n", "0"})


def str2bool(value):
    """
    Convert a command-line string into a real boolean.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, so every
    non-empty value (including "False" and "0") becomes ``True``. This converter
    interprets the text instead.

    Args:
        value: The raw argument (a string from the command line, or an actual bool).

    Returns:
        bool: The parsed value.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean spelling.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in _TRUE_STRINGS:
        return True
    if text in _FALSE_STRINGS:
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}.")


def build_parser(default_output):
    """
    Build the argument parser for the dataset generation script.

    Args:
        default_output: Path used for ``--output`` when the option is not given.

    Returns:
        argparse.ArgumentParser: Parser exposing ``size``, ``random``, ``seed``,
        ``full`` and ``output``.
    """
    parser = argparse.ArgumentParser(description="Generate and save a dataset based on the given configuration.")

    parser.add_argument("-s", "--size", type=int, default=10, help="Number of samples to generate.")
    # str2bool (not bool) so that "-r False" really disables random sampling.
    parser.add_argument("-r", "--random", type=str2bool, default=True, help="Whether to sample randomly.")
    parser.add_argument("--seed", type=int, default=42, help="Random seed for reproducibility.")
    parser.add_argument("--full", action="store_true", help="Boolean flag to indicate full dataset mode.")
    parser.add_argument("-o", "--output", type=str, default=default_output, help="Output file path to save the dataset as a CSV.")
    return parser


def main(config):
    """
    Main function to process the dataset and save it as a CSV file.

    Args:
        config: Namespace object containing the script arguments
            (``size``, ``random``, ``seed``, ``full``, ``output``).
    """
    # Imported here so that merely importing this module (e.g. to reuse the
    # parser) does not load the whole dataset package. The package __init__
    # re-exports the names of DataAugmenter and Dataset.
    from src.dataset import FullAugmentedDataset

    # Initialize the dataset
    dataset = FullAugmentedDataset()

    # Get the dataset with the specified parameters
    df = dataset.get_full_dataset(
        len=config.size,
        random=config.random,
        seed=config.seed,
        full=config.full,
    )

    # Write the resulting DataFrame to a CSV file
    df.to_csv(config.output, index=False)
    print(f"Dataset successfully saved to {config.output}")


if __name__ == "__main__":
    from src.utils.io_utils import PROJECT_ROOT

    # Parse the arguments and pass them to the main function
    parser = build_parser(os.path.join(PROJECT_ROOT, "data/out.csv"))
    main(parser.parse_args())
index 0000000000000000000000000000000000000000..2a0a11cdef411185fe4c4f28895ebde3e18c5b3e Binary files /dev/null and b/src/utils/__pycache__/__init__.cpython-312.pyc differ diff --git a/src/utils/__pycache__/__init__.cpython-313.pyc b/src/utils/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..28d01231c3fc2a4b3309f5311f0fa1fd2fc0c7e2 Binary files /dev/null and b/src/utils/__pycache__/__init__.cpython-313.pyc differ diff --git a/src/utils/__pycache__/io_utils.cpython-311.pyc b/src/utils/__pycache__/io_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2d31c474b50e7e1a3b4be2b9fdba3fca87babb7f Binary files /dev/null and b/src/utils/__pycache__/io_utils.cpython-311.pyc differ diff --git a/src/utils/__pycache__/io_utils.cpython-312.pyc b/src/utils/__pycache__/io_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..11e7134d0e72b4a98cbbcd9d1e75e116a2203b24 Binary files /dev/null and b/src/utils/__pycache__/io_utils.cpython-312.pyc differ diff --git a/src/utils/__pycache__/io_utils.cpython-313.pyc b/src/utils/__pycache__/io_utils.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..98db512e79199a8e56ef99a483cc908488cdacfd Binary files /dev/null and b/src/utils/__pycache__/io_utils.cpython-313.pyc differ diff --git a/src/utils/__pycache__/struct_utils.cpython-313.pyc b/src/utils/__pycache__/struct_utils.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..16e108ad0e8cd1054f8430b5b898e7f6b9d8b6c8 Binary files /dev/null and b/src/utils/__pycache__/struct_utils.cpython-313.pyc differ diff --git a/src/utils/io_utils.py b/src/utils/io_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0bd7f2ec5042d21e023f73c8a5846a3609c1dee0 --- /dev/null +++ b/src/utils/io_utils.py @@ -0,0 +1,46 @@ +import git +import os +from pathlib import Path +from enum import Enum +from 
from pathlib import Path
from typing import Dict, List

import pandas as pd


# --- src/utils/io_utils.py ---------------------------------------------------

def get_profile_information() -> Dict[str, str]:
    """
    Get the email and the api_key from the profile file.

    The profile is read from ``<PROJECT_ROOT>/user_information/profile.yaml``.
    That file holds credentials, so it should be kept out of version control.

    Returns:
        Dict[str, str]: ``{"email": ..., "api_key": ...}`` with surrounding
        whitespace removed from the API key.

    Raises:
        AssertionError: If the project root folder is not named "MatchingPubs"
            or the profile file does not exist.
        KeyError: If the profile lacks the ``email`` or ``api_key`` entry.
    """
    # create the path to the profile file
    project_root = Path(PROJECT_ROOT)
    profile_path = project_root / "user_information" / "profile.yaml"

    # Explicit raises instead of `assert` statements: asserts are stripped when
    # Python runs with -O. AssertionError is kept so existing callers that
    # catch it keep working. `.name` replaces split("/") so the folder check
    # also works with Windows path separators.
    if project_root.name != "MatchingPubs":
        raise AssertionError("Please run this script in the github repo folder ")
    if not profile_path.exists():
        raise AssertionError(
            "create a profile.yaml with your email (email:) and your api key (api_key:). Go here to get one https://dev.elsevier.com/"
        )

    # load the profile file
    with open(profile_path, "r", encoding="utf-8") as f:
        profile = yaml.safe_load(f)

    # access the email and the api_key
    email = profile["email"]
    api_key = profile["api_key"].strip()  # remove the spaces
    return {"email": email, "api_key": api_key}


# --- src/utils/struct_utils.py -----------------------------------------------

def flatten_list(lst):
    """
    Flatten an arbitrarily nested list into a single flat list.

    Only ``list`` instances are expanded; tuples, strings and other iterables
    are kept as single items. A list without nesting is returned as a copy.

    Args:
        lst (list): The (possibly nested) list to flatten.

    Returns:
        list: The items of ``lst`` in depth-first, left-to-right order.

    Raises:
        ValueError: If ``lst`` is not a list.
    """
    if not isinstance(lst, list):
        raise ValueError("You must provide a valid list")

    def _flatten(sublist):
        for item in sublist:
            if isinstance(item, list):
                yield from _flatten(item)
            else:
                yield item

    return list(_flatten(lst))


def flatten_dict(d: dict, parent_key: str = '', sep: str = '_') -> dict:
    """Flatten a nested dictionary into a single level.

    Nested keys are joined with ``sep``, e.g. ``{"a": {"b": 1}}`` becomes
    ``{"a_b": 1}``.

    Args:
        d (dict): The dictionary to flatten.
        parent_key (str): The base key string to use for the flattened keys.
        sep (str): The separator to use between parent and child keys.

    Returns:
        dict: The flattened dictionary.

    Raises:
        ValueError: If ``d`` is not a dictionary.
    """
    if not isinstance(d, dict):
        raise ValueError("You must provide a valid dictionary.")

    def _flatten(mapping, prefix):
        for k, v in mapping.items():
            new_key = f"{prefix}{sep}{k}" if prefix else k
            if isinstance(v, dict):
                yield from _flatten(v, new_key)
            else:
                yield new_key, v

    return dict(_flatten(d, parent_key))


def filter_dict_by_keys(original_dict, relevant_keys):
    """
    Keep only the key-value pairs whose key is listed in ``relevant_keys``.

    Keys in ``relevant_keys`` that are absent from ``original_dict`` are
    ignored. The result follows the iteration order of ``relevant_keys``.

    Args:
        original_dict (dict): The dictionary to filter.
        relevant_keys (set): The keys to keep (any iterable of keys works).

    Returns:
        dict: A new dictionary containing only the relevant key-value pairs.
    """
    return {key: original_dict[key] for key in relevant_keys if key in original_dict}


def custom_struct_to_df(samples: List[List[pd.DataFrame]]):
    """
    Convert a list of [preprint, article] DataFrame pairs into one DataFrame.

    Args:
        samples (List[List[pd.DataFrame]]): A list of pairs of DataFrames. Each pair consists of:
            - A preprint DataFrame (e.g., containing information about preprints).
            - An article DataFrame (e.g., containing information about corresponding articles).

    Returns:
        pd.DataFrame: A single DataFrame where:
            - Each row corresponds to a preprint-article pair.
            - Preprint and article columns keep their original names.
            - Index is reset for the entire DataFrame.
            - An empty DataFrame is returned when ``samples`` is empty.
    """
    # Pair the two frames by position, not by index label: a pair built from
    # different source rows (preprint row i with article row j) would otherwise
    # be outer-joined on the index and come out as two half-empty rows.
    rows = [
        pd.concat(
            [preprint.reset_index(drop=True), article.reset_index(drop=True)],
            axis=1,
        )
        for preprint, article in samples
    ]
    if not rows:
        # pd.concat raises on an empty list; an empty result is the natural answer.
        return pd.DataFrame()
    return pd.concat(rows).reset_index(drop=True)


def df_to_custom_struct(df: pd.DataFrame) -> List[List[pd.DataFrame]]:
    """
    Convert a DataFrame with prefixed columns (prpnt for preprint, article for article)
    into a list of pairs of DataFrames.

    Columns that start with neither prefix are dropped from the result.

    Args:
        df (pd.DataFrame): The input DataFrame with columns prefixed by `prpnt` and `article`.

    Returns:
        List[List[pd.DataFrame]]: One ``[preprint, article]`` pair per input row,
        each element being a single-row DataFrame that keeps the row's index label.
    """
    # Split columns into preprint and article based on prefixes
    preprint_columns = [col for col in df.columns if col.startswith("prpnt")]
    article_columns = [col for col in df.columns if col.startswith("article")]

    # Separate the DataFrame into two DataFrames for preprint and article
    preprint_df = df[preprint_columns].copy()
    article_df = df[article_columns].copy()

    # iloc[[i]] (list indexer) returns each row as a one-row DataFrame, not a Series
    return [
        [preprint_df.iloc[[i]], article_df.iloc[[i]]]
        for i in range(len(df))
    ]