diff --git "a/notebooks/eda.ipynb" "b/notebooks/eda.ipynb" new file mode 100644--- /dev/null +++ "b/notebooks/eda.ipynb" @@ -0,0 +1,15371 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-06-15 21:45:58.717469: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import numpy\n", + "from matplotlib import pyplot as plt\n", + "from typing import List, Dict\n", + "from collections import Counter\n", + "from pprint import pprint\n", + "\n", + "import seaborn as sns\n", + "sns.set_style(\"darkgrid\")\n", + "sns.set_palette(\"mako\")\n", + "\n", + "import spacy\n", + "from spacy.lang.en import English\n", + "from nltk.corpus import stopwords\n", + "\n", + "nlp = English()\n", + "\n", + "pd.set_option('display.float_format', '{:.2f}'.format)\n", + "# sns.color_palette(\"rocket\", as_cmap=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "df_raw = pd.read_parquet(\"/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/arxiv_papers_raw.parquet.gzip\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | id | \n", + "submitter | \n", + "authors | \n", + "title | \n", + "comments | \n", + "journal-ref | \n", + "doi | \n", + "report-no | \n", + "categories | \n", + "license | \n", + "abstract | \n", + "versions | \n", + "update_date | \n", + "authors_parsed | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "0704.0001 | \n", + "Pavel Nadolsky | \n", + "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-... | \n", + "Calculation of prompt diphoton production cros... | \n", + "37 pages, 15 figures; published version | \n", + "Phys.Rev.D76:013009,2007 | \n", + "10.1103/PhysRevD.76.013009 | \n", + "ANL-HEP-PR-07-12 | \n", + "hep-ph | \n", + "None | \n", + "A fully differential calculation in perturba... | \n", + "[{'created': 'Mon, 2 Apr 2007 19:18:42 GMT', '... | \n", + "2008-11-26 | \n", + "[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,... | \n", + "
1 | \n", + "0704.0002 | \n", + "Louis Theran | \n", + "Ileana Streinu and Louis Theran | \n", + "Sparsity-certifying Graph Decompositions | \n", + "To appear in Graphs and Combinatorics | \n", + "None | \n", + "None | \n", + "None | \n", + "math.CO cs.CG | \n", + "http://arxiv.org/licenses/nonexclusive-distrib... | \n", + "We describe a new algorithm, the $(k,\\ell)$-... | \n", + "[{'created': 'Sat, 31 Mar 2007 02:26:18 GMT', ... | \n", + "2008-12-13 | \n", + "[[Streinu, Ileana, ], [Theran, Louis, ]] | \n", + "
2 | \n", + "0704.0003 | \n", + "Hongjun Pan | \n", + "Hongjun Pan | \n", + "The evolution of the Earth-Moon system based o... | \n", + "23 pages, 3 figures | \n", + "None | \n", + "None | \n", + "None | \n", + "physics.gen-ph | \n", + "None | \n", + "The evolution of Earth-Moon system is descri... | \n", + "[{'created': 'Sun, 1 Apr 2007 20:46:54 GMT', '... | \n", + "2008-01-13 | \n", + "[[Pan, Hongjun, ]] | \n", + "
3 | \n", + "0704.0004 | \n", + "David Callan | \n", + "David Callan | \n", + "A determinant of Stirling cycle numbers counts... | \n", + "11 pages | \n", + "None | \n", + "None | \n", + "None | \n", + "math.CO | \n", + "None | \n", + "We show that a determinant of Stirling cycle... | \n", + "[{'created': 'Sat, 31 Mar 2007 03:16:14 GMT', ... | \n", + "2007-05-23 | \n", + "[[Callan, David, ]] | \n", + "
4 | \n", + "0704.0005 | \n", + "Alberto Torchinsky | \n", + "Wael Abu-Shammala and Alberto Torchinsky | \n", + "From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a... | \n", + "None | \n", + "Illinois J. Math. 52 (2008) no.2, 681-689 | \n", + "None | \n", + "None | \n", + "math.CA math.FA | \n", + "None | \n", + "In this paper we show how to compute the $\\L... | \n", + "[{'created': 'Mon, 2 Apr 2007 18:09:58 GMT', '... | \n", + "2013-10-15 | \n", + "[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]] | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
2268247 | \n", + "supr-con/9608008 | \n", + "Ruslan Prozorov | \n", + "R. Prozorov, M. Konczykowski, B. Schmidt, Y. Y... | \n", + "On the origin of the irreversibility line in t... | \n", + "19 pages, LaTex, 6 PostScript figures; Author'... | \n", + "None | \n", + "10.1103/PhysRevB.54.15530 | \n", + "None | \n", + "supr-con cond-mat.supr-con | \n", + "None | \n", + "We report on measurements of the angular dep... | \n", + "[{'created': 'Mon, 26 Aug 1996 15:08:35 GMT', ... | \n", + "2009-10-30 | \n", + "[[Prozorov, R., ], [Konczykowski, M., ], [Schm... | \n", + "
2268248 | \n", + "supr-con/9609001 | \n", + "Durga P. Choudhury | \n", + "Durga P. Choudhury, Balam A. Willemsen, John S... | \n", + "Nonlinear Response of HTSC Thin Film Microwave... | \n", + "4 pages, LaTeX type, Uses IEEE style files, 60... | \n", + "None | \n", + "10.1109/77.620744 | \n", + "None | \n", + "supr-con cond-mat.supr-con | \n", + "None | \n", + "The non-linear microwave surface impedance o... | \n", + "[{'created': 'Sat, 31 Aug 1996 17:34:38 GMT', ... | \n", + "2016-11-18 | \n", + "[[Choudhury, Durga P., , Physics Department, N... | \n", + "
2268249 | \n", + "supr-con/9609002 | \n", + "Durga P. Choudhury | \n", + "Balam A. Willemsen, J. S. Derov and S.Sridhar ... | \n", + "Critical State Flux Penetration and Linear Mic... | \n", + "20 pages, LaTeX type, Uses REVTeX style files,... | \n", + "None | \n", + "10.1103/PhysRevB.56.11989 | \n", + "None | \n", + "supr-con cond-mat.supr-con | \n", + "None | \n", + "The vortex contribution to the dc field (H) ... | \n", + "[{'created': 'Tue, 3 Sep 1996 14:08:26 GMT', '... | \n", + "2009-10-30 | \n", + "[[Willemsen, Balam A., , Physics Department,\\n... | \n", + "
2268250 | \n", + "supr-con/9609003 | \n", + "Hasegawa Yasumasa | \n", + "Yasumasa Hasegawa (Himeji Institute of Technol... | \n", + "Density of States and NMR Relaxation Rate in A... | \n", + "7 pages, 4 PostScript Figures, LaTeX, to appea... | \n", + "None | \n", + "10.1143/JPSJ.65.3131 | \n", + "None | \n", + "supr-con cond-mat.supr-con | \n", + "None | \n", + "We show that the density of states in an ani... | \n", + "[{'created': 'Wed, 18 Sep 1996 07:57:29 GMT', ... | \n", + "2009-10-30 | \n", + "[[Hasegawa, Yasumasa, , Himeji Institute of Te... | \n", + "
2268251 | \n", + "supr-con/9609004 | \n", + "Masanori Ichioka | \n", + "Naoki Enomoto, Masanori Ichioka and Kazushige ... | \n", + "Ginzburg Landau theory for d-wave pairing and ... | \n", + "12 pages including 8 eps figs, LaTeX with jpsj... | \n", + "J. Phys. Soc. Jpn. 66, 204 (1997). | \n", + "10.1143/JPSJ.66.204 | \n", + "None | \n", + "supr-con cond-mat.supr-con | \n", + "None | \n", + "The Ginzburg Landau theory for d_{x^2-y^2}-w... | \n", + "[{'created': 'Wed, 25 Sep 1996 14:17:09 GMT', ... | \n", + "2009-10-30 | \n", + "[[Enomoto, Naoki, , Okayama Univ.], [Ichioka, ... | \n", + "
2268252 rows × 14 columns
\n", + "\n", + " | id | \n", + "submitter | \n", + "authors | \n", + "title | \n", + "comments | \n", + "journal-ref | \n", + "doi | \n", + "report-no | \n", + "categories | \n", + "license | \n", + "abstract | \n", + "versions | \n", + "update_date | \n", + "authors_parsed | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "0704.0001 | \n", + "Pavel Nadolsky | \n", + "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-... | \n", + "Calculation of prompt diphoton production cros... | \n", + "37 pages, 15 figures; published version | \n", + "Phys.Rev.D76:013009,2007 | \n", + "10.1103/PhysRevD.76.013009 | \n", + "ANL-HEP-PR-07-12 | \n", + "hep-ph | \n", + "None | \n", + "A fully differential calculation in perturba... | \n", + "[{'created': 'Mon, 2 Apr 2007 19:18:42 GMT', '... | \n", + "2008-11-26 | \n", + "[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,... | \n", + "
1 | \n", + "0704.0002 | \n", + "Louis Theran | \n", + "Ileana Streinu and Louis Theran | \n", + "Sparsity-certifying Graph Decompositions | \n", + "To appear in Graphs and Combinatorics | \n", + "None | \n", + "None | \n", + "None | \n", + "math.CO cs.CG | \n", + "http://arxiv.org/licenses/nonexclusive-distrib... | \n", + "We describe a new algorithm, the $(k,\\ell)$-... | \n", + "[{'created': 'Sat, 31 Mar 2007 02:26:18 GMT', ... | \n", + "2008-12-13 | \n", + "[[Streinu, Ileana, ], [Theran, Louis, ]] | \n", + "
2 | \n", + "0704.0003 | \n", + "Hongjun Pan | \n", + "Hongjun Pan | \n", + "The evolution of the Earth-Moon system based o... | \n", + "23 pages, 3 figures | \n", + "None | \n", + "None | \n", + "None | \n", + "physics.gen-ph | \n", + "None | \n", + "The evolution of Earth-Moon system is descri... | \n", + "[{'created': 'Sun, 1 Apr 2007 20:46:54 GMT', '... | \n", + "2008-01-13 | \n", + "[[Pan, Hongjun, ]] | \n", + "
3 | \n", + "0704.0004 | \n", + "David Callan | \n", + "David Callan | \n", + "A determinant of Stirling cycle numbers counts... | \n", + "11 pages | \n", + "None | \n", + "None | \n", + "None | \n", + "math.CO | \n", + "None | \n", + "We show that a determinant of Stirling cycle... | \n", + "[{'created': 'Sat, 31 Mar 2007 03:16:14 GMT', ... | \n", + "2007-05-23 | \n", + "[[Callan, David, ]] | \n", + "
4 | \n", + "0704.0005 | \n", + "Alberto Torchinsky | \n", + "Wael Abu-Shammala and Alberto Torchinsky | \n", + "From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a... | \n", + "None | \n", + "Illinois J. Math. 52 (2008) no.2, 681-689 | \n", + "None | \n", + "None | \n", + "math.CA math.FA | \n", + "None | \n", + "In this paper we show how to compute the $\\L... | \n", + "[{'created': 'Mon, 2 Apr 2007 18:09:58 GMT', '... | \n", + "2013-10-15 | \n", + "[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]] | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
2268247 | \n", + "supr-con/9608008 | \n", + "Ruslan Prozorov | \n", + "R. Prozorov, M. Konczykowski, B. Schmidt, Y. Y... | \n", + "On the origin of the irreversibility line in t... | \n", + "19 pages, LaTex, 6 PostScript figures; Author'... | \n", + "None | \n", + "10.1103/PhysRevB.54.15530 | \n", + "None | \n", + "supr-con cond-mat.supr-con | \n", + "None | \n", + "We report on measurements of the angular dep... | \n", + "[{'created': 'Mon, 26 Aug 1996 15:08:35 GMT', ... | \n", + "2009-10-30 | \n", + "[[Prozorov, R., ], [Konczykowski, M., ], [Schm... | \n", + "
2268248 | \n", + "supr-con/9609001 | \n", + "Durga P. Choudhury | \n", + "Durga P. Choudhury, Balam A. Willemsen, John S... | \n", + "Nonlinear Response of HTSC Thin Film Microwave... | \n", + "4 pages, LaTeX type, Uses IEEE style files, 60... | \n", + "None | \n", + "10.1109/77.620744 | \n", + "None | \n", + "supr-con cond-mat.supr-con | \n", + "None | \n", + "The non-linear microwave surface impedance o... | \n", + "[{'created': 'Sat, 31 Aug 1996 17:34:38 GMT', ... | \n", + "2016-11-18 | \n", + "[[Choudhury, Durga P., , Physics Department, N... | \n", + "
2268249 | \n", + "supr-con/9609002 | \n", + "Durga P. Choudhury | \n", + "Balam A. Willemsen, J. S. Derov and S.Sridhar ... | \n", + "Critical State Flux Penetration and Linear Mic... | \n", + "20 pages, LaTeX type, Uses REVTeX style files,... | \n", + "None | \n", + "10.1103/PhysRevB.56.11989 | \n", + "None | \n", + "supr-con cond-mat.supr-con | \n", + "None | \n", + "The vortex contribution to the dc field (H) ... | \n", + "[{'created': 'Tue, 3 Sep 1996 14:08:26 GMT', '... | \n", + "2009-10-30 | \n", + "[[Willemsen, Balam A., , Physics Department,\\n... | \n", + "
2268250 | \n", + "supr-con/9609003 | \n", + "Hasegawa Yasumasa | \n", + "Yasumasa Hasegawa (Himeji Institute of Technol... | \n", + "Density of States and NMR Relaxation Rate in A... | \n", + "7 pages, 4 PostScript Figures, LaTeX, to appea... | \n", + "None | \n", + "10.1143/JPSJ.65.3131 | \n", + "None | \n", + "supr-con cond-mat.supr-con | \n", + "None | \n", + "We show that the density of states in an ani... | \n", + "[{'created': 'Wed, 18 Sep 1996 07:57:29 GMT', ... | \n", + "2009-10-30 | \n", + "[[Hasegawa, Yasumasa, , Himeji Institute of Te... | \n", + "
2268251 | \n", + "supr-con/9609004 | \n", + "Masanori Ichioka | \n", + "Naoki Enomoto, Masanori Ichioka and Kazushige ... | \n", + "Ginzburg Landau theory for d-wave pairing and ... | \n", + "12 pages including 8 eps figs, LaTeX with jpsj... | \n", + "J. Phys. Soc. Jpn. 66, 204 (1997). | \n", + "10.1143/JPSJ.66.204 | \n", + "None | \n", + "supr-con cond-mat.supr-con | \n", + "None | \n", + "The Ginzburg Landau theory for d_{x^2-y^2}-w... | \n", + "[{'created': 'Wed, 25 Sep 1996 14:17:09 GMT', ... | \n", + "2009-10-30 | \n", + "[[Enomoto, Naoki, , Okayama Univ.], [Ichioka, ... | \n", + "
2268252 rows × 14 columns
\n", + "\n", + " | Null Count | \n", + "
---|---|
id | \n", + "0 | \n", + "
submitter | \n", + "15188 | \n", + "
authors | \n", + "0 | \n", + "
title | \n", + "0 | \n", + "
comments | \n", + "533142 | \n", + "
journal-ref | \n", + "1464835 | \n", + "
doi | \n", + "1146890 | \n", + "
report-no | \n", + "2092748 | \n", + "
categories | \n", + "0 | \n", + "
license | \n", + "452843 | \n", + "
abstract | \n", + "0 | \n", + "
versions | \n", + "0 | \n", + "
update_date | \n", + "0 | \n", + "
authors_parsed | \n", + "0 | \n", + "
\n", + " | Subject | \n", + "count_categories | \n", + "
---|---|---|
0 | \n", + "math | \n", + "809731 | \n", + "
1 | \n", + "cs | \n", + "762856 | \n", + "
2 | \n", + "cond-mat | \n", + "456374 | \n", + "
3 | \n", + "astro-ph | \n", + "385477 | \n", + "
4 | \n", + "physics | \n", + "282299 | \n", + "
5 | \n", + "hep-ph | \n", + "172043 | \n", + "
6 | \n", + "hep-th | \n", + "158585 | \n", + "
7 | \n", + "quant-ph | \n", + "135474 | \n", + "
8 | \n", + "stat | \n", + "128289 | \n", + "
9 | \n", + "gr-qc | \n", + "100320 | \n", + "
\n", + " | id | \n", + "submitter | \n", + "authors | \n", + "title | \n", + "comments | \n", + "journal-ref | \n", + "doi | \n", + "report-no | \n", + "categories | \n", + "license | \n", + "abstract | \n", + "versions | \n", + "update_date | \n", + "authors_parsed | \n", + "len_abstract | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "0704.0001 | \n", + "Pavel Nadolsky | \n", + "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-... | \n", + "Calculation of prompt diphoton production cros... | \n", + "37 pages, 15 figures; published version | \n", + "Phys.Rev.D76:013009,2007 | \n", + "10.1103/PhysRevD.76.013009 | \n", + "ANL-HEP-PR-07-12 | \n", + "hep-ph | \n", + "None | \n", + "A fully differential calculation in perturba... | \n", + "[{'created': 'Mon, 2 Apr 2007 19:18:42 GMT', '... | \n", + "2008-11-26 | \n", + "[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,... | \n", + "983 | \n", + "
1 | \n", + "0704.0002 | \n", + "Louis Theran | \n", + "Ileana Streinu and Louis Theran | \n", + "Sparsity-certifying Graph Decompositions | \n", + "To appear in Graphs and Combinatorics | \n", + "None | \n", + "None | \n", + "None | \n", + "math.CO cs.CG | \n", + "http://arxiv.org/licenses/nonexclusive-distrib... | \n", + "We describe a new algorithm, the $(k,\\ell)$-... | \n", + "[{'created': 'Sat, 31 Mar 2007 02:26:18 GMT', ... | \n", + "2008-12-13 | \n", + "[[Streinu, Ileana, ], [Theran, Louis, ]] | \n", + "798 | \n", + "
2 | \n", + "0704.0003 | \n", + "Hongjun Pan | \n", + "Hongjun Pan | \n", + "The evolution of the Earth-Moon system based o... | \n", + "23 pages, 3 figures | \n", + "None | \n", + "None | \n", + "None | \n", + "physics.gen-ph | \n", + "None | \n", + "The evolution of Earth-Moon system is descri... | \n", + "[{'created': 'Sun, 1 Apr 2007 20:46:54 GMT', '... | \n", + "2008-01-13 | \n", + "[[Pan, Hongjun, ]] | \n", + "880 | \n", + "
3 | \n", + "0704.0004 | \n", + "David Callan | \n", + "David Callan | \n", + "A determinant of Stirling cycle numbers counts... | \n", + "11 pages | \n", + "None | \n", + "None | \n", + "None | \n", + "math.CO | \n", + "None | \n", + "We show that a determinant of Stirling cycle... | \n", + "[{'created': 'Sat, 31 Mar 2007 03:16:14 GMT', ... | \n", + "2007-05-23 | \n", + "[[Callan, David, ]] | \n", + "248 | \n", + "
4 | \n", + "0704.0005 | \n", + "Alberto Torchinsky | \n", + "Wael Abu-Shammala and Alberto Torchinsky | \n", + "From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a... | \n", + "None | \n", + "Illinois J. Math. 52 (2008) no.2, 681-689 | \n", + "None | \n", + "None | \n", + "math.CA math.FA | \n", + "None | \n", + "In this paper we show how to compute the $\\L... | \n", + "[{'created': 'Mon, 2 Apr 2007 18:09:58 GMT', '... | \n", + "2013-10-15 | \n", + "[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]] | \n", + "223 | \n", + "
\n", + " | len_abstract | \n", + "
---|---|
count | \n", + "2268252.00 | \n", + "
mean | \n", + "935.27 | \n", + "
std | \n", + "427.57 | \n", + "
min | \n", + "6.00 | \n", + "
25% | \n", + "605.00 | \n", + "
50% | \n", + "894.00 | \n", + "
75% | \n", + "1233.00 | \n", + "
max | \n", + "6091.00 | \n", + "
\n", + " | month_year | \n", + "count | \n", + "
---|---|---|
0 | \n", + "2007 | \n", + "145936 | \n", + "
1 | \n", + "2008 | \n", + "69357 | \n", + "
2 | \n", + "2009 | \n", + "229079 | \n", + "
3 | \n", + "2010 | \n", + "56229 | \n", + "
4 | \n", + "2011 | \n", + "61870 | \n", + "
5 | \n", + "2012 | \n", + "57324 | \n", + "
6 | \n", + "2013 | \n", + "72037 | \n", + "
7 | \n", + "2014 | \n", + "88141 | \n", + "
8 | \n", + "2015 | \n", + "215153 | \n", + "
9 | \n", + "2016 | \n", + "133658 | \n", + "
10 | \n", + "2017 | \n", + "122603 | \n", + "
11 | \n", + "2018 | \n", + "136389 | \n", + "
12 | \n", + "2019 | \n", + "156051 | \n", + "
13 | \n", + "2020 | \n", + "178041 | \n", + "
14 | \n", + "2021 | \n", + "194847 | \n", + "
15 | \n", + "2022 | \n", + "220524 | \n", + "
16 | \n", + "2023 | \n", + "131013 | \n", + "
\n", + " | Subject | \n", + "Count_papers | \n", + "
---|---|---|
0 | \n", + "math | \n", + "809731 | \n", + "
1 | \n", + "cs | \n", + "762856 | \n", + "
2 | \n", + "cond-mat | \n", + "456374 | \n", + "
3 | \n", + "astro-ph | \n", + "385477 | \n", + "
4 | \n", + "physics | \n", + "282299 | \n", + "