{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2023-06-24 16:49:13.031488: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
      "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
     ]
    }
   ],
   "source": [
    "# Standard library (os and Path back the configurable data directory).\n",
    "import os\n",
    "from pathlib import Path\n",
    "from typing import List, Dict\n",
    "from collections import Counter\n",
    "from pprint import pprint\n",
    "\n",
    "# Data handling and plotting.\n",
    "import pandas as pd\n",
    "import numpy\n",
    "from matplotlib import pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "# NLP tooling.\n",
    "import spacy\n",
    "from spacy.lang.en import English\n",
    "from nltk.corpus import stopwords\n",
    "\n",
    "# Global seaborn look for every figure drawn in this notebook.\n",
    "sns.set_style(\"darkgrid\")\n",
    "sns.set_palette(\"mako\")\n",
    "\n",
    "# Load the spaCy pipeline once; later cells refer to it as `nlp`.\n",
    "nlp = spacy.load('en_core_web_sm')\n",
    "\n",
    "# Show floats with two decimals in pandas output.\n",
    "pd.set_option('display.float_format', '{:.2f}'.format)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "spacy.lang.en.English"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: which pipeline class did spacy.load return?\n",
    "type(nlp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Folder holding the processed arXiv data. Set the ARXIV_DATA_DIR\n",
    "# environment variable to point the notebook at another checkout; the\n",
    "# fallback is the original hardcoded location, so the default behaviour\n",
    "# is unchanged.\n",
    "DATA_DIR = Path(\n",
    "    os.environ.get(\n",
    "        \"ARXIV_DATA_DIR\",\n",
    "        \"/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed\",\n",
    "    )\n",
    ")\n",
    "\n",
    "# Raw arXiv paper metadata as stored in the processed parquet file.\n",
    "df_raw = pd.read_parquet(DATA_DIR / \"arxiv_papers_raw.parquet.gzip\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", " | id | \n", "submitter | \n", "authors | \n", "title | \n", "comments | \n", "journal-ref | \n", "doi | \n", "report-no | \n", "categories | \n", "license | \n", "abstract | \n", "versions | \n", "update_date | \n", "authors_parsed | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "0704.0001 | \n", "Pavel Nadolsky | \n", "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-... | \n", "Calculation of prompt diphoton production cros... | \n", "37 pages, 15 figures; published version | \n", "Phys.Rev.D76:013009,2007 | \n", "10.1103/PhysRevD.76.013009 | \n", "ANL-HEP-PR-07-12 | \n", "hep-ph | \n", "None | \n", "A fully differential calculation in perturba... | \n", "[{'created': 'Mon, 2 Apr 2007 19:18:42 GMT', '... | \n", "2008-11-26 | \n", "[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,... | \n", "
1 | \n", "0704.0002 | \n", "Louis Theran | \n", "Ileana Streinu and Louis Theran | \n", "Sparsity-certifying Graph Decompositions | \n", "To appear in Graphs and Combinatorics | \n", "None | \n", "None | \n", "None | \n", "math.CO cs.CG | \n", "http://arxiv.org/licenses/nonexclusive-distrib... | \n", "We describe a new algorithm, the $(k,\\ell)$-... | \n", "[{'created': 'Sat, 31 Mar 2007 02:26:18 GMT', ... | \n", "2008-12-13 | \n", "[[Streinu, Ileana, ], [Theran, Louis, ]] | \n", "
2 | \n", "0704.0003 | \n", "Hongjun Pan | \n", "Hongjun Pan | \n", "The evolution of the Earth-Moon system based o... | \n", "23 pages, 3 figures | \n", "None | \n", "None | \n", "None | \n", "physics.gen-ph | \n", "None | \n", "The evolution of Earth-Moon system is descri... | \n", "[{'created': 'Sun, 1 Apr 2007 20:46:54 GMT', '... | \n", "2008-01-13 | \n", "[[Pan, Hongjun, ]] | \n", "
3 | \n", "0704.0004 | \n", "David Callan | \n", "David Callan | \n", "A determinant of Stirling cycle numbers counts... | \n", "11 pages | \n", "None | \n", "None | \n", "None | \n", "math.CO | \n", "None | \n", "We show that a determinant of Stirling cycle... | \n", "[{'created': 'Sat, 31 Mar 2007 03:16:14 GMT', ... | \n", "2007-05-23 | \n", "[[Callan, David, ]] | \n", "
4 | \n", "0704.0005 | \n", "Alberto Torchinsky | \n", "Wael Abu-Shammala and Alberto Torchinsky | \n", "From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a... | \n", "None | \n", "Illinois J. Math. 52 (2008) no.2, 681-689 | \n", "None | \n", "None | \n", "math.CA math.FA | \n", "None | \n", "In this paper we show how to compute the $\\L... | \n", "[{'created': 'Mon, 2 Apr 2007 18:09:58 GMT', '... | \n", "2013-10-15 | \n", "[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]] | \n", "