{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2023-07-07 17:13:01.457105: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
      "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import gensim\n",
    "import pprint\n",
    "from gensim import corpora\n",
    "from gensim.utils import simple_preprocess\n",
    "from gensim.models import TfidfModel\n",
    "from gensim.parsing import strip_tags, strip_numeric, \\\n",
    " strip_multiple_whitespaces, stem_text, strip_punctuation, \\\n",
    " remove_stopwords, preprocess_string\n",
    "import re\n",
    "import os\n",
    "\n",
    "from typing import List\n",
    "import spacy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def transform_to_lower(s: str) -> str:\n",
    "    \"\"\"Return `s` converted to lower case.\"\"\"\n",
    "    return s.lower()\n",
    "\n",
    "\n",
    "def remove_single_char(s: str) -> str:\n",
    "    \"\"\"Remove standalone one-character tokens from `s`.\n",
    "\n",
    "    A token counts as standalone when it is a single word character bounded\n",
    "    by whitespace or by the start/end of the string. Only the character itself\n",
    "    is removed; the surrounding whitespace is kept, so neighbouring words are\n",
    "    never merged (e.g. 'foo a bar' keeps 'foo' and 'bar' separate) and runs\n",
    "    such as 'x y z' are removed completely.\n",
    "    \"\"\"\n",
    "    # Look-arounds assert the boundaries without consuming them; consuming the\n",
    "    # whitespace (and substituting '') is what used to glue adjacent words together.\n",
    "    return re.sub(r'(?<!\\S)\\w(?!\\S)', '', s)\n",
    "\n",
    "\n",
    "# Ordered list of str -> str cleaning callables; lower-casing runs before\n",
    "# stop-word removal, and single-character removal runs last.\n",
    "cleaning_filters = [\n",
    "    strip_tags,\n",
    "    strip_numeric,\n",
    "    strip_punctuation,\n",
    "    strip_multiple_whitespaces,\n",
    "    transform_to_lower,\n",
    "    remove_stopwords,\n",
    "    remove_single_char,\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Single definition of the dataset location. The default keeps the original\n",
    "# path; set the ARXIV_PARQUET_PATH environment variable to load the file from\n",
    "# another machine/location without editing the notebook.\n",
    "ARXIV_PARQUET_PATH = os.environ.get(\n",
    "    \"ARXIV_PARQUET_PATH\",\n",
    "    \"/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/reduced_arxiv_papers.parquet.gzip\",\n",
    ")\n",
    "df = pd.read_parquet(ARXIV_PARQUET_PATH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "638707"
      ]
     },
     "execution_count": 94,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "int(df.shape[0] * 0.75) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): DataFrame.sample() without arguments returns a single random\n",
    "# row, while the saved output of the next cell shows 851610 rows. Confirm\n",
    "# whether .sample(frac=1) (a full shuffle) was intended before relying on this.\n",
    "df = pd.read_parquet(ARXIV_PARQUET_PATH).sample().reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [ "
\n", " | id | \n", "submitter | \n", "authors | \n", "title | \n", "comments | \n", "journal-ref | \n", "doi | \n", "report-no | \n", "categories | \n", "license | \n", "abstract | \n", "versions | \n", "update_date | \n", "authors_parsed | \n", "cleaned_abstracts | \n", "len_abstract | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "2007.00905 | \n", "Song Qingheng | \n", "Qingheng Song, Yong Zeng, Jie Xu, and Shi Jin | \n", "A Survey of Prototype and Experiment for UAV C... | \n", "24 pages, 6 figures | \n", "None | \n", "None | \n", "None | \n", "cs.IT eess.SP math.IT | \n", "http://creativecommons.org/licenses/by-nc-sa/4.0/ | \n", "Unmanned aerial vehicle (UAV) communications... | \n", "[{'created': 'Thu, 2 Jul 2020 06:26:20 GMT', '... | \n", "2020-07-03 | \n", "[[Song, Qingheng, ], [Zeng, Yong, ], [Xu, Jie,... | \n", "unmanned aerial vehicle uav communication a... | \n", "865 | \n", "
1 | \n", "2102.04209 | \n", "Michael Stuart | \n", "Michael T. Stuart and Markus Kneer | \n", "Guilty Artificial Minds | \n", "20 pages, 4 figures, 1 table | \n", "None | \n", "None | \n", "None | \n", "cs.CY cs.AI cs.HC | \n", "http://creativecommons.org/licenses/by/4.0/ | \n", "The concepts of blameworthiness and wrongnes... | \n", "[{'created': 'Sun, 24 Jan 2021 21:37:35 GMT', ... | \n", "2021-02-09 | \n", "[[Stuart, Michael T., ], [Kneer, Markus, ]] | \n", "concept blameworthiness wrongness fundament... | \n", "739 | \n", "
2 | \n", "1201.5796 | \n", "Denis Jerome | \n", "Denis Jerome | \n", "Organic Superconductors: when correlations and... | \n", "41 pages, 21 figures to be published in Journa... | \n", "None | \n", "10.1007/s10948-012-1475-7 | \n", "None | \n", "cond-mat.supr-con | \n", "http://arxiv.org/licenses/nonexclusive-distrib... | \n", "This survey provides a brief account for the... | \n", "[{'created': 'Fri, 27 Jan 2012 15:24:46 GMT', ... | \n", "2012-02-21 | \n", "[[Jerome, Denis, ]] | \n", "survey provide brief account start organic ... | \n", "649 | \n", "
3 | \n", "1511.03076 | \n", "Emma Platts Miss | \n", "George F.R. Ellis, Emma Platts, David Sloan an... | \n", "Current observations with a decaying cosmologi... | \n", "23 pages, 11 figures | \n", "None | \n", "10.1088/1475-7516/2016/04/026 | \n", "None | \n", "astro-ph.CO gr-qc hep-th | \n", "http://arxiv.org/licenses/nonexclusive-distrib... | \n", "We use the phase plane analysis technique of... | \n", "[{'created': 'Tue, 10 Nov 2015 12:08:23 GMT', ... | \n", "2016-04-27 | \n", "[[Ellis, George F. R., ], [Platts, Emma, ], [S... | \n", "use phase plane analysis technique madsen e... | \n", "554 | \n", "
4 | \n", "1710.02954 | \n", "Kirk Bansak | \n", "Kirk Bansak | \n", "Estimating Causal Moderation Effects with Rand... | \n", "Forthcoming, Journal of the Royal Statistical ... | \n", "None | \n", "None | \n", "None | \n", "stat.ME | \n", "http://arxiv.org/licenses/nonexclusive-distrib... | \n", "Researchers are often interested in analyzin... | \n", "[{'created': 'Mon, 9 Oct 2017 06:34:01 GMT', '... | \n", "2020-08-25 | \n", "[[Bansak, Kirk, ]] | \n", "researcher interested analyze conditional t... | \n", "799 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
851605 | \n", "1301.0707 | \n", "Sebastian Klein | \n", "Sebastian Klein | \n", "Chow groups of tensor triangulated categories | \n", "40 pages. The presentation of the article has ... | \n", "None | \n", "None | \n", "None | \n", "math.AG math.CT math.RT | \n", "http://arxiv.org/licenses/nonexclusive-distrib... | \n", "We recall P. Balmer's definition of tensor t... | \n", "[{'created': 'Fri, 4 Jan 2013 11:06:40 GMT', '... | \n", "2015-10-02 | \n", "[[Klein, Sebastian, ]] | \n", "recall p. balmer definition tensor triangul... | \n", "787 | \n", "
851606 | \n", "1707.00341 | \n", "Giorgos Anastasiou | \n", "Giorgos Anastasiou, Rodrigo Olea, David Rivera... | \n", "Noether-Wald energy in Critical Gravity | \n", "7 pages, no figures, Final version for PLB | \n", "None | \n", "10.1016/j.physletb.2018.11.021 | \n", "None | \n", "hep-th gr-qc | \n", "http://arxiv.org/licenses/nonexclusive-distrib... | \n", "Criticality represents a specific point in t... | \n", "[{'created': 'Sun, 2 Jul 2017 19:52:32 GMT', '... | \n", "2018-11-21 | \n", "[[Anastasiou, Giorgos, ], [Olea, Rodrigo, ], [... | \n", "criticality represent specific point parame... | \n", "631 | \n", "
851607 | \n", "1610.08526 | \n", "Blagoje Oblak | \n", "Blagoje Oblak | \n", "BMS Particles in Three Dimensions | \n", "437 pages (including index), 33 figures. Appen... | \n", "None | \n", "10.1007/978-3-319-61878-4 | \n", "None | \n", "hep-th gr-qc math-ph math.GR math.MP math.RT | \n", "http://arxiv.org/licenses/nonexclusive-distrib... | \n", "This thesis is devoted to the group-theoreti... | \n", "[{'created': 'Wed, 26 Oct 2016 20:00:16 GMT', ... | \n", "2018-01-29 | \n", "[[Oblak, Blagoje, ]] | \n", "thesis devoted group theoretic aspect dimen... | \n", "542 | \n", "
851608 | \n", "1211.6629 | \n", "Philippe Joyez | \n", "Philippe Joyez | \n", "Self-consistent dynamics of a Josephson juncti... | \n", "7 pages, 1 figure | \n", "None | \n", "10.1103/PhysRevLett.110.217003 | \n", "None | \n", "cond-mat.supr-con cond-mat.mes-hall | \n", "http://arxiv.org/licenses/nonexclusive-distrib... | \n", "We derive microscopically the dynamics assoc... | \n", "[{'created': 'Tue, 27 Nov 2012 17:29:04 GMT', ... | \n", "2013-05-29 | \n", "[[Joyez, Philippe, ]] | \n", "derive microscopically dynamic associate d.... | \n", "558 | \n", "
851609 | \n", "0705.2878 | \n", "Benoit Perthame | \n", "Benoit Perthame (DMA), Panagiotis E. Souganidis | \n", "Asymmetric potentials and motor effect: a larg... | \n", "None | \n", "None | \n", "None | \n", "None | \n", "math.AP | \n", "None | \n", "We provide a mathematical analysis of appear... | \n", "[{'created': 'Sun, 20 May 2007 17:43:39 GMT', ... | \n", "2007-05-23 | \n", "[[Perthame, Benoit, , DMA], [Souganidis, Panag... | \n", "provide mathematical analysis appearance co... | \n", "518 | \n", "
851610 rows × 16 columns
\n", "