lewispons committed on
Commit
2d4243e
·
1 Parent(s): 31c0b58

Initial Setup

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +12 -4
  2. models/GrammarGuru/.DS_Store +0 -0
  3. models/GrammarGuru/data/GrammarGuru.parquet.gzip +3 -0
  4. models/GrammarGuru/dictionaries/GrammarGuru.dict +3 -0
  5. models/GrammarGuru/similarities_matrix/GrammarGuru +3 -0
  6. models/GrammarGuru/similarities_matrix/GrammarGuru.index.data.npy +3 -0
  7. models/GrammarGuru/similarities_matrix/GrammarGuru.index.indices.npy +3 -0
  8. models/GrammarGuru/similarities_matrix/GrammarGuru.index.indptr.npy +3 -0
  9. models/GrammarGuru/similarities_matrix/GrammarGuru.index.npy +3 -0
  10. models/GrammarGuru/tdidf/GrammarGuru.model +3 -0
  11. notebooks/.DS_Store +0 -0
  12. notebooks/.gitkeep +0 -0
  13. notebooks/eda.ipynb +0 -0
  14. notebooks/feature-engineering.ipynb +990 -0
  15. notebooks/nlp_cleansing.ipynb +388 -0
  16. notebooks/nlp_eda.ipynb +0 -0
  17. notebooks/read_raw_data.ipynb +875 -0
  18. reports/.gitkeep +0 -0
  19. reports/Visualizations/DisciplinasMasPopulares.png +0 -0
  20. reports/Visualizations/DisciplinasMenosPopulares.png +0 -0
  21. reports/Visualizations/FrequenciaPalabras.png +0 -0
  22. reports/Visualizations/FrequenciaPalabrasMenosFreq.png +0 -0
  23. reports/Visualizations/HsitogramAbstractsLen.png +0 -0
  24. reports/Visualizations/Papes_release_year_by_Computer Science.png +0 -0
  25. reports/Visualizations/Papes_release_year_by_Economics.png +0 -0
  26. reports/Visualizations/Papes_release_year_by_Electrical Engineering and Systems Science.png +0 -0
  27. reports/Visualizations/Papes_release_year_by_Mathematics.png +0 -0
  28. reports/Visualizations/Papes_release_year_by_Physics.png +0 -0
  29. reports/Visualizations/Papes_release_year_by_Quantitative Biology.png +0 -0
  30. reports/Visualizations/Papes_release_year_by_Quantitative Finance.png +0 -0
  31. reports/Visualizations/Papes_release_year_by_Statistics.png +0 -0
  32. reports/Visualizations/PublicacionPapersAnno.png +0 -0
  33. reports/Visualizations/TopCatsPapers.png +0 -0
  34. reports/figures/.gitkeep +0 -0
  35. reports/figures/arxiv-logo.jpg +0 -0
  36. reports/figures/profile.jpeg +0 -0
  37. requirements.txt +81 -0
  38. src/app.py +226 -0
  39. src/data/.gitkeep +0 -0
  40. src/data/__init__.py +0 -0
  41. src/data/make_dataset.py +30 -0
  42. src/data/transform_raw_data.py +53 -0
  43. src/features/.gitkeep +0 -0
  44. src/features/__init__.py +0 -0
  45. src/features/build_features.py +0 -0
  46. src/models/.gitkeep +0 -0
  47. src/models/__init__.py +0 -0
  48. src/models/__pycache__/__init__.cpython-39.pyc +0 -0
  49. src/models/configs.yaml +18 -0
  50. src/models/gensim_tfidf.ipynb +862 -0
.gitattributes CHANGED
@@ -7,7 +7,7 @@
7
  *.gz filter=lfs diff=lfs merge=lfs -text
8
  *.h5 filter=lfs diff=lfs merge=lfs -text
9
  *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
  *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
  *.model filter=lfs diff=lfs merge=lfs -text
13
  *.msgpack filter=lfs diff=lfs merge=lfs -text
@@ -23,8 +23,8 @@
23
  *.pth filter=lfs diff=lfs merge=lfs -text
24
  *.rar filter=lfs diff=lfs merge=lfs -text
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
  *.tar filter=lfs diff=lfs merge=lfs -text
29
  *.tflite filter=lfs diff=lfs merge=lfs -text
30
  *.tgz filter=lfs diff=lfs merge=lfs -text
@@ -32,4 +32,12 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.xz filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
7
  *.gz filter=lfs diff=lfs merge=lfs -text
8
  *.h5 filter=lfs diff=lfs merge=lfs -text
9
  *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ .lfs. filter=lfs diff=lfs merge=lfs -text
11
  *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
  *.model filter=lfs diff=lfs merge=lfs -text
13
  *.msgpack filter=lfs diff=lfs merge=lfs -text
 
23
  *.pth filter=lfs diff=lfs merge=lfs -text
24
  *.rar filter=lfs diff=lfs merge=lfs -text
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/*/ filter=lfs diff=lfs merge=lfs -text
27
+ .tar. filter=lfs diff=lfs merge=lfs -text
28
  *.tar filter=lfs diff=lfs merge=lfs -text
29
  *.tflite filter=lfs diff=lfs merge=lfs -text
30
  *.tgz filter=lfs diff=lfs merge=lfs -text
 
32
  *.xz filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
+ tfevents filter=lfs diff=lfs merge=lfs -text
36
+ models/GrammarGuru/data/GrammarGuru.parquet.gzip filter=lfs diff=lfs merge=lfs -text
37
+ models/GrammarGuru/dictionaries/GrammarGuru.dict filter=lfs diff=lfs merge=lfs -text
38
+ models/GrammarGuru/similarities_matrix/GrammarGuru filter=lfs diff=lfs merge=lfs -text
39
+ models/GrammarGuru/similarities_matrix/GrammarGuru.index.data.npy filter=lfs diff=lfs merge=lfs -text
40
+ models/GrammarGuru/similarities_matrix/GrammarGuru.index.indices.npy filter=lfs diff=lfs merge=lfs -text
41
+ models/GrammarGuru/similarities_matrix/GrammarGuru.index.indptr.npy filter=lfs diff=lfs merge=lfs -text
42
+ models/GrammarGuru/similarities_matrix/GrammarGuru.index.npy filter=lfs diff=lfs merge=lfs -text
43
+ models/GrammarGuru/tdidf/GrammarGuru.model filter=lfs diff=lfs merge=lfs -text
models/GrammarGuru/.DS_Store ADDED
Binary file (8.2 kB). View file
 
models/GrammarGuru/data/GrammarGuru.parquet.gzip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d41d0d50492fc4f8f470aa5106359de11d5625d5dc1a502f04c26fc3cd472f5
3
+ size 208673276
models/GrammarGuru/dictionaries/GrammarGuru.dict ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6e1cf56046f5325c87d586761a6f9beea67599a5dba6f6bf7811b3d605e9838
3
+ size 16358261
models/GrammarGuru/similarities_matrix/GrammarGuru ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82cb64d81faf4009561e0c0dd1861f60707aec87fd48be2147268154372d825a
3
+ size 615
models/GrammarGuru/similarities_matrix/GrammarGuru.index.data.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17ee886caeb86b6afe13a87a3d86992fcdb76780a01a96264eba1d02cf7fbb0b
3
+ size 62474552
models/GrammarGuru/similarities_matrix/GrammarGuru.index.indices.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a76c3a2ff5382688106ca638dbc5b5b7752a48069c8d199f8f260fd6b13e29fc
3
+ size 62474552
models/GrammarGuru/similarities_matrix/GrammarGuru.index.indptr.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2189acf62052757ccfc57da9ff6a4910ea7ce9033f8be59cdb3623c67c2ef16
3
+ size 1135612
models/GrammarGuru/similarities_matrix/GrammarGuru.index.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6fbdc8884bdc666b66b7c3aab8e6d2866ebbbeba041310a57b9cf9a979a3108
3
+ size 118
models/GrammarGuru/tdidf/GrammarGuru.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78e4051d387687adc37cf59c2498829ffde41aef34f56d65e21c434deae6bf1c
3
+ size 15251885
notebooks/.DS_Store ADDED
Binary file (6.15 kB). View file
 
notebooks/.gitkeep ADDED
File without changes
notebooks/eda.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/feature-engineering.ipynb ADDED
@@ -0,0 +1,990 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 18,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import pandas as pd\n",
10
+ "from matplotlib import pyplot as plt \n",
11
+ "import seaborn as sns\n",
12
+ "\n",
13
+ "pd.set_option('display.float_format', '{:.2f}'.format)\n",
14
+ "sns.set_style(\"darkgrid\")\n",
15
+ "sns.set_palette(\"mako\")"
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "code",
20
+ "execution_count": 2,
21
+ "metadata": {},
22
+ "outputs": [],
23
+ "source": [
24
+ "df_cleaned = pd.read_parquet(\"../data/processed/arxiv_papers.parquet.gzip\")"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": 3,
30
+ "metadata": {},
31
+ "outputs": [
32
+ {
33
+ "data": {
34
+ "text/html": [
35
+ "<div>\n",
36
+ "<style scoped>\n",
37
+ " .dataframe tbody tr th:only-of-type {\n",
38
+ " vertical-align: middle;\n",
39
+ " }\n",
40
+ "\n",
41
+ " .dataframe tbody tr th {\n",
42
+ " vertical-align: top;\n",
43
+ " }\n",
44
+ "\n",
45
+ " .dataframe thead th {\n",
46
+ " text-align: right;\n",
47
+ " }\n",
48
+ "</style>\n",
49
+ "<table border=\"1\" class=\"dataframe\">\n",
50
+ " <thead>\n",
51
+ " <tr style=\"text-align: right;\">\n",
52
+ " <th></th>\n",
53
+ " <th>id</th>\n",
54
+ " <th>submitter</th>\n",
55
+ " <th>authors</th>\n",
56
+ " <th>title</th>\n",
57
+ " <th>comments</th>\n",
58
+ " <th>journal-ref</th>\n",
59
+ " <th>doi</th>\n",
60
+ " <th>report-no</th>\n",
61
+ " <th>categories</th>\n",
62
+ " <th>license</th>\n",
63
+ " <th>abstract</th>\n",
64
+ " <th>versions</th>\n",
65
+ " <th>update_date</th>\n",
66
+ " <th>authors_parsed</th>\n",
67
+ " <th>cleaned_abstracts</th>\n",
68
+ " </tr>\n",
69
+ " </thead>\n",
70
+ " <tbody>\n",
71
+ " <tr>\n",
72
+ " <th>0</th>\n",
73
+ " <td>0704.0001</td>\n",
74
+ " <td>Pavel Nadolsky</td>\n",
75
+ " <td>C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-...</td>\n",
76
+ " <td>Calculation of prompt diphoton production cros...</td>\n",
77
+ " <td>37 pages, 15 figures; published version</td>\n",
78
+ " <td>Phys.Rev.D76:013009,2007</td>\n",
79
+ " <td>10.1103/PhysRevD.76.013009</td>\n",
80
+ " <td>ANL-HEP-PR-07-12</td>\n",
81
+ " <td>hep-ph</td>\n",
82
+ " <td>None</td>\n",
83
+ " <td>A fully differential calculation in perturba...</td>\n",
84
+ " <td>[{'created': 'Mon, 2 Apr 2007 19:18:42 GMT', '...</td>\n",
85
+ " <td>2008-11-26</td>\n",
86
+ " <td>[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,...</td>\n",
87
+ " <td>fully differential calculation perturbative...</td>\n",
88
+ " </tr>\n",
89
+ " <tr>\n",
90
+ " <th>1</th>\n",
91
+ " <td>0704.0002</td>\n",
92
+ " <td>Louis Theran</td>\n",
93
+ " <td>Ileana Streinu and Louis Theran</td>\n",
94
+ " <td>Sparsity-certifying Graph Decompositions</td>\n",
95
+ " <td>To appear in Graphs and Combinatorics</td>\n",
96
+ " <td>None</td>\n",
97
+ " <td>None</td>\n",
98
+ " <td>None</td>\n",
99
+ " <td>math.CO cs.CG</td>\n",
100
+ " <td>http://arxiv.org/licenses/nonexclusive-distrib...</td>\n",
101
+ " <td>We describe a new algorithm, the $(k,\\ell)$-...</td>\n",
102
+ " <td>[{'created': 'Sat, 31 Mar 2007 02:26:18 GMT', ...</td>\n",
103
+ " <td>2008-12-13</td>\n",
104
+ " <td>[[Streinu, Ileana, ], [Theran, Louis, ]]</td>\n",
105
+ " <td>describe new algorithm $ k,\\ell)$-pebble ga...</td>\n",
106
+ " </tr>\n",
107
+ " <tr>\n",
108
+ " <th>2</th>\n",
109
+ " <td>0704.0003</td>\n",
110
+ " <td>Hongjun Pan</td>\n",
111
+ " <td>Hongjun Pan</td>\n",
112
+ " <td>The evolution of the Earth-Moon system based o...</td>\n",
113
+ " <td>23 pages, 3 figures</td>\n",
114
+ " <td>None</td>\n",
115
+ " <td>None</td>\n",
116
+ " <td>None</td>\n",
117
+ " <td>physics.gen-ph</td>\n",
118
+ " <td>None</td>\n",
119
+ " <td>The evolution of Earth-Moon system is descri...</td>\n",
120
+ " <td>[{'created': 'Sun, 1 Apr 2007 20:46:54 GMT', '...</td>\n",
121
+ " <td>2008-01-13</td>\n",
122
+ " <td>[[Pan, Hongjun, ]]</td>\n",
123
+ " <td>evolution earth moon system describe dark m...</td>\n",
124
+ " </tr>\n",
125
+ " <tr>\n",
126
+ " <th>3</th>\n",
127
+ " <td>0704.0004</td>\n",
128
+ " <td>David Callan</td>\n",
129
+ " <td>David Callan</td>\n",
130
+ " <td>A determinant of Stirling cycle numbers counts...</td>\n",
131
+ " <td>11 pages</td>\n",
132
+ " <td>None</td>\n",
133
+ " <td>None</td>\n",
134
+ " <td>None</td>\n",
135
+ " <td>math.CO</td>\n",
136
+ " <td>None</td>\n",
137
+ " <td>We show that a determinant of Stirling cycle...</td>\n",
138
+ " <td>[{'created': 'Sat, 31 Mar 2007 03:16:14 GMT', ...</td>\n",
139
+ " <td>2007-05-23</td>\n",
140
+ " <td>[[Callan, David, ]]</td>\n",
141
+ " <td>determinant stirling cycle number count unl...</td>\n",
142
+ " </tr>\n",
143
+ " <tr>\n",
144
+ " <th>4</th>\n",
145
+ " <td>0704.0005</td>\n",
146
+ " <td>Alberto Torchinsky</td>\n",
147
+ " <td>Wael Abu-Shammala and Alberto Torchinsky</td>\n",
148
+ " <td>From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a...</td>\n",
149
+ " <td>None</td>\n",
150
+ " <td>Illinois J. Math. 52 (2008) no.2, 681-689</td>\n",
151
+ " <td>None</td>\n",
152
+ " <td>None</td>\n",
153
+ " <td>math.CA math.FA</td>\n",
154
+ " <td>None</td>\n",
155
+ " <td>In this paper we show how to compute the $\\L...</td>\n",
156
+ " <td>[{'created': 'Mon, 2 Apr 2007 18:09:58 GMT', '...</td>\n",
157
+ " <td>2013-10-15</td>\n",
158
+ " <td>[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]]</td>\n",
159
+ " <td>paper compute $ \\lambda_{\\alpha}$ norm $ \\a...</td>\n",
160
+ " </tr>\n",
161
+ " <tr>\n",
162
+ " <th>...</th>\n",
163
+ " <td>...</td>\n",
164
+ " <td>...</td>\n",
165
+ " <td>...</td>\n",
166
+ " <td>...</td>\n",
167
+ " <td>...</td>\n",
168
+ " <td>...</td>\n",
169
+ " <td>...</td>\n",
170
+ " <td>...</td>\n",
171
+ " <td>...</td>\n",
172
+ " <td>...</td>\n",
173
+ " <td>...</td>\n",
174
+ " <td>...</td>\n",
175
+ " <td>...</td>\n",
176
+ " <td>...</td>\n",
177
+ " <td>...</td>\n",
178
+ " </tr>\n",
179
+ " <tr>\n",
180
+ " <th>2268247</th>\n",
181
+ " <td>supr-con/9608008</td>\n",
182
+ " <td>Ruslan Prozorov</td>\n",
183
+ " <td>R. Prozorov, M. Konczykowski, B. Schmidt, Y. Y...</td>\n",
184
+ " <td>On the origin of the irreversibility line in t...</td>\n",
185
+ " <td>19 pages, LaTex, 6 PostScript figures; Author'...</td>\n",
186
+ " <td>None</td>\n",
187
+ " <td>10.1103/PhysRevB.54.15530</td>\n",
188
+ " <td>None</td>\n",
189
+ " <td>supr-con cond-mat.supr-con</td>\n",
190
+ " <td>None</td>\n",
191
+ " <td>We report on measurements of the angular dep...</td>\n",
192
+ " <td>[{'created': 'Mon, 26 Aug 1996 15:08:35 GMT', ...</td>\n",
193
+ " <td>2009-10-30</td>\n",
194
+ " <td>[[Prozorov, R., ], [Konczykowski, M., ], [Schm...</td>\n",
195
+ " <td>report measurement angular dependence irrev...</td>\n",
196
+ " </tr>\n",
197
+ " <tr>\n",
198
+ " <th>2268248</th>\n",
199
+ " <td>supr-con/9609001</td>\n",
200
+ " <td>Durga P. Choudhury</td>\n",
201
+ " <td>Durga P. Choudhury, Balam A. Willemsen, John S...</td>\n",
202
+ " <td>Nonlinear Response of HTSC Thin Film Microwave...</td>\n",
203
+ " <td>4 pages, LaTeX type, Uses IEEE style files, 60...</td>\n",
204
+ " <td>None</td>\n",
205
+ " <td>10.1109/77.620744</td>\n",
206
+ " <td>None</td>\n",
207
+ " <td>supr-con cond-mat.supr-con</td>\n",
208
+ " <td>None</td>\n",
209
+ " <td>The non-linear microwave surface impedance o...</td>\n",
210
+ " <td>[{'created': 'Sat, 31 Aug 1996 17:34:38 GMT', ...</td>\n",
211
+ " <td>2016-11-18</td>\n",
212
+ " <td>[[Choudhury, Durga P., , Physics Department, N...</td>\n",
213
+ " <td>non linear microwave surface impedance patt...</td>\n",
214
+ " </tr>\n",
215
+ " <tr>\n",
216
+ " <th>2268249</th>\n",
217
+ " <td>supr-con/9609002</td>\n",
218
+ " <td>Durga P. Choudhury</td>\n",
219
+ " <td>Balam A. Willemsen, J. S. Derov and S.Sridhar ...</td>\n",
220
+ " <td>Critical State Flux Penetration and Linear Mic...</td>\n",
221
+ " <td>20 pages, LaTeX type, Uses REVTeX style files,...</td>\n",
222
+ " <td>None</td>\n",
223
+ " <td>10.1103/PhysRevB.56.11989</td>\n",
224
+ " <td>None</td>\n",
225
+ " <td>supr-con cond-mat.supr-con</td>\n",
226
+ " <td>None</td>\n",
227
+ " <td>The vortex contribution to the dc field (H) ...</td>\n",
228
+ " <td>[{'created': 'Tue, 3 Sep 1996 14:08:26 GMT', '...</td>\n",
229
+ " <td>2009-10-30</td>\n",
230
+ " <td>[[Willemsen, Balam A., , Physics Department,\\n...</td>\n",
231
+ " <td>vortex contribution dc field h dependent mi...</td>\n",
232
+ " </tr>\n",
233
+ " <tr>\n",
234
+ " <th>2268250</th>\n",
235
+ " <td>supr-con/9609003</td>\n",
236
+ " <td>Hasegawa Yasumasa</td>\n",
237
+ " <td>Yasumasa Hasegawa (Himeji Institute of Technol...</td>\n",
238
+ " <td>Density of States and NMR Relaxation Rate in A...</td>\n",
239
+ " <td>7 pages, 4 PostScript Figures, LaTeX, to appea...</td>\n",
240
+ " <td>None</td>\n",
241
+ " <td>10.1143/JPSJ.65.3131</td>\n",
242
+ " <td>None</td>\n",
243
+ " <td>supr-con cond-mat.supr-con</td>\n",
244
+ " <td>None</td>\n",
245
+ " <td>We show that the density of states in an ani...</td>\n",
246
+ " <td>[{'created': 'Wed, 18 Sep 1996 07:57:29 GMT', ...</td>\n",
247
+ " <td>2009-10-30</td>\n",
248
+ " <td>[[Hasegawa, Yasumasa, , Himeji Institute of Te...</td>\n",
249
+ " <td>density state anisotropic superconductor \\n...</td>\n",
250
+ " </tr>\n",
251
+ " <tr>\n",
252
+ " <th>2268251</th>\n",
253
+ " <td>supr-con/9609004</td>\n",
254
+ " <td>Masanori Ichioka</td>\n",
255
+ " <td>Naoki Enomoto, Masanori Ichioka and Kazushige ...</td>\n",
256
+ " <td>Ginzburg Landau theory for d-wave pairing and ...</td>\n",
257
+ " <td>12 pages including 8 eps figs, LaTeX with jpsj...</td>\n",
258
+ " <td>J. Phys. Soc. Jpn. 66, 204 (1997).</td>\n",
259
+ " <td>10.1143/JPSJ.66.204</td>\n",
260
+ " <td>None</td>\n",
261
+ " <td>supr-con cond-mat.supr-con</td>\n",
262
+ " <td>None</td>\n",
263
+ " <td>The Ginzburg Landau theory for d_{x^2-y^2}-w...</td>\n",
264
+ " <td>[{'created': 'Wed, 25 Sep 1996 14:17:09 GMT', ...</td>\n",
265
+ " <td>2009-10-30</td>\n",
266
+ " <td>[[Enomoto, Naoki, , Okayama Univ.], [Ichioka, ...</td>\n",
267
+ " <td>ginzburg landau theory d_{x^2 y^2}-wave sup...</td>\n",
268
+ " </tr>\n",
269
+ " </tbody>\n",
270
+ "</table>\n",
271
+ "<p>2268252 rows × 15 columns</p>\n",
272
+ "</div>"
273
+ ],
274
+ "text/plain": [
275
+ " id submitter \n",
276
+ "0 0704.0001 Pavel Nadolsky \\\n",
277
+ "1 0704.0002 Louis Theran \n",
278
+ "2 0704.0003 Hongjun Pan \n",
279
+ "3 0704.0004 David Callan \n",
280
+ "4 0704.0005 Alberto Torchinsky \n",
281
+ "... ... ... \n",
282
+ "2268247 supr-con/9608008 Ruslan Prozorov \n",
283
+ "2268248 supr-con/9609001 Durga P. Choudhury \n",
284
+ "2268249 supr-con/9609002 Durga P. Choudhury \n",
285
+ "2268250 supr-con/9609003 Hasegawa Yasumasa \n",
286
+ "2268251 supr-con/9609004 Masanori Ichioka \n",
287
+ "\n",
288
+ " authors \n",
289
+ "0 C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-... \\\n",
290
+ "1 Ileana Streinu and Louis Theran \n",
291
+ "2 Hongjun Pan \n",
292
+ "3 David Callan \n",
293
+ "4 Wael Abu-Shammala and Alberto Torchinsky \n",
294
+ "... ... \n",
295
+ "2268247 R. Prozorov, M. Konczykowski, B. Schmidt, Y. Y... \n",
296
+ "2268248 Durga P. Choudhury, Balam A. Willemsen, John S... \n",
297
+ "2268249 Balam A. Willemsen, J. S. Derov and S.Sridhar ... \n",
298
+ "2268250 Yasumasa Hasegawa (Himeji Institute of Technol... \n",
299
+ "2268251 Naoki Enomoto, Masanori Ichioka and Kazushige ... \n",
300
+ "\n",
301
+ " title \n",
302
+ "0 Calculation of prompt diphoton production cros... \\\n",
303
+ "1 Sparsity-certifying Graph Decompositions \n",
304
+ "2 The evolution of the Earth-Moon system based o... \n",
305
+ "3 A determinant of Stirling cycle numbers counts... \n",
306
+ "4 From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a... \n",
307
+ "... ... \n",
308
+ "2268247 On the origin of the irreversibility line in t... \n",
309
+ "2268248 Nonlinear Response of HTSC Thin Film Microwave... \n",
310
+ "2268249 Critical State Flux Penetration and Linear Mic... \n",
311
+ "2268250 Density of States and NMR Relaxation Rate in A... \n",
312
+ "2268251 Ginzburg Landau theory for d-wave pairing and ... \n",
313
+ "\n",
314
+ " comments \n",
315
+ "0 37 pages, 15 figures; published version \\\n",
316
+ "1 To appear in Graphs and Combinatorics \n",
317
+ "2 23 pages, 3 figures \n",
318
+ "3 11 pages \n",
319
+ "4 None \n",
320
+ "... ... \n",
321
+ "2268247 19 pages, LaTex, 6 PostScript figures; Author'... \n",
322
+ "2268248 4 pages, LaTeX type, Uses IEEE style files, 60... \n",
323
+ "2268249 20 pages, LaTeX type, Uses REVTeX style files,... \n",
324
+ "2268250 7 pages, 4 PostScript Figures, LaTeX, to appea... \n",
325
+ "2268251 12 pages including 8 eps figs, LaTeX with jpsj... \n",
326
+ "\n",
327
+ " journal-ref \n",
328
+ "0 Phys.Rev.D76:013009,2007 \\\n",
329
+ "1 None \n",
330
+ "2 None \n",
331
+ "3 None \n",
332
+ "4 Illinois J. Math. 52 (2008) no.2, 681-689 \n",
333
+ "... ... \n",
334
+ "2268247 None \n",
335
+ "2268248 None \n",
336
+ "2268249 None \n",
337
+ "2268250 None \n",
338
+ "2268251 J. Phys. Soc. Jpn. 66, 204 (1997). \n",
339
+ "\n",
340
+ " doi report-no \n",
341
+ "0 10.1103/PhysRevD.76.013009 ANL-HEP-PR-07-12 \\\n",
342
+ "1 None None \n",
343
+ "2 None None \n",
344
+ "3 None None \n",
345
+ "4 None None \n",
346
+ "... ... ... \n",
347
+ "2268247 10.1103/PhysRevB.54.15530 None \n",
348
+ "2268248 10.1109/77.620744 None \n",
349
+ "2268249 10.1103/PhysRevB.56.11989 None \n",
350
+ "2268250 10.1143/JPSJ.65.3131 None \n",
351
+ "2268251 10.1143/JPSJ.66.204 None \n",
352
+ "\n",
353
+ " categories \n",
354
+ "0 hep-ph \\\n",
355
+ "1 math.CO cs.CG \n",
356
+ "2 physics.gen-ph \n",
357
+ "3 math.CO \n",
358
+ "4 math.CA math.FA \n",
359
+ "... ... \n",
360
+ "2268247 supr-con cond-mat.supr-con \n",
361
+ "2268248 supr-con cond-mat.supr-con \n",
362
+ "2268249 supr-con cond-mat.supr-con \n",
363
+ "2268250 supr-con cond-mat.supr-con \n",
364
+ "2268251 supr-con cond-mat.supr-con \n",
365
+ "\n",
366
+ " license \n",
367
+ "0 None \\\n",
368
+ "1 http://arxiv.org/licenses/nonexclusive-distrib... \n",
369
+ "2 None \n",
370
+ "3 None \n",
371
+ "4 None \n",
372
+ "... ... \n",
373
+ "2268247 None \n",
374
+ "2268248 None \n",
375
+ "2268249 None \n",
376
+ "2268250 None \n",
377
+ "2268251 None \n",
378
+ "\n",
379
+ " abstract \n",
380
+ "0 A fully differential calculation in perturba... \\\n",
381
+ "1 We describe a new algorithm, the $(k,\\ell)$-... \n",
382
+ "2 The evolution of Earth-Moon system is descri... \n",
383
+ "3 We show that a determinant of Stirling cycle... \n",
384
+ "4 In this paper we show how to compute the $\\L... \n",
385
+ "... ... \n",
386
+ "2268247 We report on measurements of the angular dep... \n",
387
+ "2268248 The non-linear microwave surface impedance o... \n",
388
+ "2268249 The vortex contribution to the dc field (H) ... \n",
389
+ "2268250 We show that the density of states in an ani... \n",
390
+ "2268251 The Ginzburg Landau theory for d_{x^2-y^2}-w... \n",
391
+ "\n",
392
+ " versions update_date \n",
393
+ "0 [{'created': 'Mon, 2 Apr 2007 19:18:42 GMT', '... 2008-11-26 \\\n",
394
+ "1 [{'created': 'Sat, 31 Mar 2007 02:26:18 GMT', ... 2008-12-13 \n",
395
+ "2 [{'created': 'Sun, 1 Apr 2007 20:46:54 GMT', '... 2008-01-13 \n",
396
+ "3 [{'created': 'Sat, 31 Mar 2007 03:16:14 GMT', ... 2007-05-23 \n",
397
+ "4 [{'created': 'Mon, 2 Apr 2007 18:09:58 GMT', '... 2013-10-15 \n",
398
+ "... ... ... \n",
399
+ "2268247 [{'created': 'Mon, 26 Aug 1996 15:08:35 GMT', ... 2009-10-30 \n",
400
+ "2268248 [{'created': 'Sat, 31 Aug 1996 17:34:38 GMT', ... 2016-11-18 \n",
401
+ "2268249 [{'created': 'Tue, 3 Sep 1996 14:08:26 GMT', '... 2009-10-30 \n",
402
+ "2268250 [{'created': 'Wed, 18 Sep 1996 07:57:29 GMT', ... 2009-10-30 \n",
403
+ "2268251 [{'created': 'Wed, 25 Sep 1996 14:17:09 GMT', ... 2009-10-30 \n",
404
+ "\n",
405
+ " authors_parsed \n",
406
+ "0 [[Balázs, C., ], [Berger, E. L., ], [Nadolsky,... \\\n",
407
+ "1 [[Streinu, Ileana, ], [Theran, Louis, ]] \n",
408
+ "2 [[Pan, Hongjun, ]] \n",
409
+ "3 [[Callan, David, ]] \n",
410
+ "4 [[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]] \n",
411
+ "... ... \n",
412
+ "2268247 [[Prozorov, R., ], [Konczykowski, M., ], [Schm... \n",
413
+ "2268248 [[Choudhury, Durga P., , Physics Department, N... \n",
414
+ "2268249 [[Willemsen, Balam A., , Physics Department,\\n... \n",
415
+ "2268250 [[Hasegawa, Yasumasa, , Himeji Institute of Te... \n",
416
+ "2268251 [[Enomoto, Naoki, , Okayama Univ.], [Ichioka, ... \n",
417
+ "\n",
418
+ " cleaned_abstracts \n",
419
+ "0 fully differential calculation perturbative... \n",
420
+ "1 describe new algorithm $ k,\\ell)$-pebble ga... \n",
421
+ "2 evolution earth moon system describe dark m... \n",
422
+ "3 determinant stirling cycle number count unl... \n",
423
+ "4 paper compute $ \\lambda_{\\alpha}$ norm $ \\a... \n",
424
+ "... ... \n",
425
+ "2268247 report measurement angular dependence irrev... \n",
426
+ "2268248 non linear microwave surface impedance patt... \n",
427
+ "2268249 vortex contribution dc field h dependent mi... \n",
428
+ "2268250 density state anisotropic superconductor \\n... \n",
429
+ "2268251 ginzburg landau theory d_{x^2 y^2}-wave sup... \n",
430
+ "\n",
431
+ "[2268252 rows x 15 columns]"
432
+ ]
433
+ },
434
+ "execution_count": 3,
435
+ "metadata": {},
436
+ "output_type": "execute_result"
437
+ }
438
+ ],
439
+ "source": [
440
+ "df_cleaned"
441
+ ]
442
+ },
443
+ {
444
+ "cell_type": "code",
445
+ "execution_count": 4,
446
+ "metadata": {},
447
+ "outputs": [],
448
+ "source": [
449
+ "df_cleaned['len_abstract'] = df_cleaned['cleaned_abstracts'].str.len()"
450
+ ]
451
+ },
452
+ {
453
+ "cell_type": "code",
454
+ "execution_count": 8,
455
+ "metadata": {},
456
+ "outputs": [
457
+ {
458
+ "data": {
459
+ "text/plain": [
460
+ "count 2268252.00\n",
461
+ "mean 688.92\n",
462
+ "std 315.74\n",
463
+ "min 4.00\n",
464
+ "25% 446.00\n",
465
+ "50% 658.00\n",
466
+ "75% 908.00\n",
467
+ "max 4372.00\n",
468
+ "Name: len_abstract, dtype: float64"
469
+ ]
470
+ },
471
+ "execution_count": 8,
472
+ "metadata": {},
473
+ "output_type": "execute_result"
474
+ }
475
+ ],
476
+ "source": [
477
+ "df_cleaned['len_abstract'].describe()"
478
+ ]
479
+ },
480
+ {
481
+ "cell_type": "code",
482
+ "execution_count": 19,
483
+ "metadata": {},
484
+ "outputs": [
485
+ {
486
+ "data": {
487
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAkgAAAGxCAYAAACZa0njAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAABInklEQVR4nO3de3hU1b3/8c+QSUIQYjQJkYhHQeUiQhISgdZQgQIiYItIPZUqIHi0lUt75KIBBSRcJFxUBBTkXlABsXhEqxWPtrWVwIkmCJTKTQwBQoJcJclkkvX7A7N/zCRI7rOTeb+eJ09m9ppZs9Z8J/Jx77X3OIwxRgAAALA08PUAAAAA7IaABAAA4IWABAAA4IWABAAA4IWABAAA4IWABAAA4IWABAAA4IWABAAA4IWABAAA4IWABAAA4MXp6wHUZSdPnlN1flGLwyGFhzep9n5RddTG3qiPfVEb+/LH2pTMuTwISFVgjGrkQ1VT/aLqqI29UR/7ojb2RW3KxiE2AAAALwQkAAAALwQkAAAALwQkAAAALwQkAAAALwQkAAAALwQkAAAALwQkAAAALwQkAAAALwQkAAAALwQkAAAALwQkAAAALwQkAAAALwQkAAAALwQkAAAALwQk2Mr0pJm+HgIAAAQk2IvLVejrIQAAQEACAADwRkACAADwQkACAADwQkACAADw4tOAlJ2drTFjxqhTp07q2rWrZs2apYKCAklSZmamhg0bptjYWPXt21efffaZx3P/+c9/qn///oqJidGQIUOUmZnp0b5q1Sp17dpVcXFxmjhxovLy8qy2goICTZw4UQkJCUpMTNSKFStqfrIAAKDO8FlAMsZozJgxysvL07p16/TCCy/ok08+0YsvvihjjEaOHKmIiAht2rRJv/zlLzVq1CgdPXpUknT06FGNHDlSAwcO1FtvvaVrr71WTzzxhIwxkqQPP/xQCxcu1LRp07R69WplZGRozpw51munpKRo165dWr16taZMmaKFCxfqgw8+8Mn7AAAA7Mfpqxc+ePCg0tPT9Y9//EMRERGSpDFjxmj27Nn62c9+pszMTL355ptq1KiRbr75Zn3++efatGmTRo8erY0bN+r222/X8OHDJUmzZs3SnXfeqe3bt6tz585as2aNhg4dqu7du0uSnnvuOY0YMULjx4+XMUYbN27Ua6+9pnbt2qldu3bat2+f1q1bpz59+vjq7QAAADbisz1IkZGRWrZsmRWOSpw/f14ZGRm67bbb1KhRI2t7fHy80tPTJUkZGRlKSEiw2kJCQtSuXTulp6erqKhIX331lUd7bGysCgsLtXfvXu3du1dut1txcXEefWdkZKi4uLiGZgsAAOoSn+1BCg0NVdeuXa37xcXFWrt2rbp06aKcnBw1bdrU4/Hh4eE6fvy4JP1o+9mzZ1VQUODR7nQ6FRYWpuPHj6tBgwa65pprFBQUZLVHRESooKBAp0+f1rXXXlsT0wUAAHWIzwKStzlz5mjPnj166623tGrVKo8AI0lBQUFyuVySpLy8vMu25+fnW/fLajfGlNkmyeq/vByOCj283P1Vd791jR3nT23sjfrYF7WxL3+sTUXmaouANGfOHK1evVovvPCCWrVqpeDgYJ0+fdrjMS6XSw0bNpQkBQcHlwozLpdLoaGhCg4Otu57t4eEhKioqKjMNklW/+UVHt6kQo/3db91QaAzQBER9p2/P9emLqA+9kVt7IvalM3nASk5OVlvvPGG5syZo7vvvluSFBUVpf3793s8Ljc31zpsFhUVpdzc3FLtbdu2VVhYmIKDg5Wbm6ubb75ZkuR2u3X69GlFRkbKGKNTp07J7XbL6bw4/ZycHDVs2FChoaEVGvvJk+f0w4lz1cLhuPhBre5+65JCd5Fyc8/5ehilUBt7oz72RW3syx9rUzLn8vBpQFq4cKHefPNNzZ8/3+MMspiYGC1dulT5+fnWXp20tDTFx8db7Wlpadbj8/LytGfP
Ho0aNUoNGjRQ+/btlZaWps6dO0uS0tPT5XQ61aZNG0kX1ySlp6dbC7nT0tLUvn17NWhQsTXrxqhGPlQ11W9dYee5+3tt7I762Be1sS9qUzafncV24MABLV68WP/1X/+l+Ph45eTkWD+dOnVSs2bNlJSUpH379mnp0qXauXOnBg0aJEm6//779cUXX2jp0qXat2+fkpKS1Lx5cysQDR48WMuXL9fWrVu1c+dOTZ06VQ888IBCQkIUEhKiAQMGaOrUqdq5c6e2bt2qFStWaMiQIb56KwAAgM34bA/Sxx9/rKKiIr3yyit65ZVXPNr+/e9/a/HixZo0aZIGDhyoG2+8UYsWLVJ0dLQkqXnz5nr55Zc1c+ZMLVq0SHFxcVq0aJEcP6y+6tevn7KysjR58mS5XC717t1b48ePt/pPSkrS1KlTNXToUDVu3FijR49W7969a2/yAADA1hzGsGOtsnJzq38NUkREk2rvty6ZPPY5TZs3xdfDKIXa2Bv1sS9qY1/+WJuSOZcHX1YLAADghYAEAADghYAEAADghYAEAADghYAEAADghYAEAADghYAEAADghYAEAADghYCEGjN7coqmJ8309TAAAKgwAhKq1aWhyOUqlMtV6NFOYAIA1AUEJFSrskKRJI/QBACA3RGQUKMCAy9+HzLBCABQlxCQUCsCA52aPTnF18MAAKBcCEioNZfuRWItEgDAzghIqBaXBp7AQKfmJc+3bpfsOSo53CZJxpjaHSAAABVAQEK18F5jVHjJ/bLWHwUGOtmLBACwLQISatWle5RYuA0AsCsCEqrFpYfVrqQkGJWEJfYkAQDsxnnlhwDlU1iJPUIuV6EKC901MBoAACqPPUgAAABeCEjwqUvPbAMAwC4ISPA5zmgDANgNAQm2wBltAAA7ISABAAB4ISDBFkouE8Cp/wAAO2CFLKpk9uQUOatpoXXJZQI49R8A4GvsQUKVuFyFlbr+UXmxJwkA4AsEJNhSo0YNNXtyCou3AQA+QUCCbRGOAAC+QkBCpZV86SwAAPUNAQmVxh4eAEB9ZYuA5HK51L9/f6WmpkqSnn76abVu3brUz5AhQ6znJCQklGr//vvvJUkFBQWaOHGiEhISlJiYqBUrVni8XmZmpoYNG6bY2Fj17dtXn332We1NFgAA2J7PT/MvKCjQ2LFjtW/fPmvbpEmTNHbsWOt+VlaWHn74YSsgZWdn69y5c9q6dasaNmxoPa5Ro0aSpJSUFO3atUurV6/W0aNH9dRTTyk6Olp9+vSRMUYjR45Uq1attGnTJm3dulWjRo3S+++/r+jo6FqaNcqrUaOGmp40U8/MmujroQAA/IhPA9L+/fs1duxYGWM8tjdp0kRNmjSx7j/99NPq06ePevbsKUk6cOCAIiMjdcMNN5Tq88KFC9q4caNee+01tWvXTu3atdO+ffu0bt069enTR9u2bVNmZqbefPNNNWrUSDfffLM+//xzbdq0SaNHj67ZCaNSjDGEJABArfLpIbbt27erc+fOWr9+/WUf8/nnn2vHjh168sknrW379+9XixYtynz83r175Xa7FRcXZ22Lj49XRkaGiouLlZGRodtuu83a21TSnp6eXvUJocaUhCQAAGqDT/cgDR48+IqPWbp0qe677z41a9bM2nbgwAHl5eXp4Ycf1qFDh9S2bVtNnDhRLVq0UE5Ojq655hoFBQVZj4+IiFBBQYFOnz6tnJwcNW3a1OM1wsPDdfz48QqP3+Go8FPK1V9191tfGGN89t5QG3ujPvZFbezLH2tTkbn6fA3Sj8nMzNS2bds0adIkj+0HDx7UmTNn9OSTT6px48Z67bXXNGzYML333nvKy8vzCEeSrPsul+uy7S6Xq8LjCw9vcuUHVUJN9VvdAp0BkiSnM0Ayxvp9pW2VbZekiAjfvjd1pTb+ivrYF7WxL2pTNlsHpA8//FBt27bVLbfc4rF9+fLlKiws1FVXXSVJmjt3ru666y598sknCg4OLhV2Su43
bNhQwcHBOn36dKn2Sxd7l9fJk+fktXyqShyOix/U6u63phS6iy7ecDgu3i75faVtlW2XlJt7rhZmVlpdq42/oT72RW3syx9rUzLn8rB1QPr73/+un//856W2BwUFeewFCg4OVvPmzZWdna2OHTvq1KlTcrvdcjovTi8nJ0cNGzZUaGiooqKitH//fo/+cnNzSx12Kw9jVCMfqprqtz7w9ftCbeyN+tgXtbEvalM2W1wHqSzGGH311Vfq2LFjqe09e/bU22+/bW27cOGCDh8+rJYtW6pt27ZyOp0ei67T0tLUvn17NWjQQDExMdq9e7fy8/M92mNiYmp8TgAAoG6wbUDKysrS999/X+rwmsPhULdu3fTyyy8rNTVV+/bt04QJE3TdddfprrvuUkhIiAYMGKCpU6dq586d2rp1q1asWGFdQ6lTp05q1qyZkpKStG/fPi1dulQ7d+7UoEGDfDFNAABgQ7Y9xHby5ElJ0tVXX12qbfz48XI6nRo7dqzOnz+vLl26aOnSpQoIuLhoOCkpSVOnTtXQoUPVuHFjjR49Wr1795YkBQQEaPHixZo0aZIGDhyoG2+8UYsWLeIikXVAYKBTsyenqLDQzTWRAAA1ymG8r9KIcsvNrf5F2hERTaq935qS/PQMSReDS2Gh2/p9pW1VbS8sdGvavCm1Ote6Vht/Q33si9rYlz/WpmTO5WHbQ2ywt3nJ8309BAAAagwBCZVS6Cr02WsHBjo1L3k+V9YGANQYAhLqpEJXoVw+DGkAgPqNgIQ6KzDQyV4kAECNICChTmMvEgCgJhCQUKcFBtr2ShUAgDqMgIR6gUNtAIDqREBChc2enOLrIZRijCEkAQCqDQEJFWandT8lV9eW7DUuAEDdRkBChdhx7xHBCABQ3QhIqBDCCADAHxCQAAAAvBCQAAAAvBCQUG9wZW0AQHUhIKFeYY0UAKA6EJBQr7AXCQBQHQhIKLd5yfN9PYQfVfK1I+xFAgBUFQEJ5VZI8AAA+AkCEgAAgBcCEgAAgBcCEsrFjl8xcjks1AYAVBUBCeVS1xY+G2MISQCASiMgod7hbDYAQFURkAAAALwQkAAAALwQkAAAALwQkOCBhc0AABCQ4IWFzQAAEJAAAABKISDhiuz+JbUAAFQ3WwQkl8ul/v37KzU11do2ffp0tW7d2uNn7dq1VvuWLVvUs2dPxcTEaOTIkfruu++sNmOM5s6dqy5duqhTp05KSUlRcXGx1X7q1CmNHj1acXFx6tGjh955553amWgdxZfUAgD8jdPXAygoKNDYsWO1b98+j+0HDhzQ2LFjdd9991nbGjduLEnauXOnJk2apOeee05t2rTRjBkzlJSUpCVLlkiSVq5cqS1btmjhwoVyu90aP368wsPDNWLECElSUlKS8vPztX79emVkZOiZZ55RixYt1KFDh1qaNQAAsDOf7kHav3+/HnjgAX377bel2g4cOKDbbrtNkZGR1k9ISIgkae3atbrnnns0YMAAtWnTRikpKfrrX/+qzMxMSdKaNWs0ZswYJSQkqEuXLho3bpzWrVsnSfr222/1ySefaPr06WrVqpV+9atf6Re/+IVef/312ps4agXfyQYAqCyfBqTt27erc+fOWr9+vcf28+fPKzs7WzfddFOZz8vIyFBCQoJ1v1mzZoqOjlZGRoays7N17Ngx3XHHHVZ7fHy8srKydOLECWVkZKhZs2Zq3ry5R/uXX35ZvZODLRhjfD0EAEAd5NNDbIMHDy5z+4EDB+RwOPTqq6/qb3/7m8LCwvTII49Yh9tOnDihpk2bejwnPDxcx48fV05OjiR5tEdEREiS1V7Wc7Ozs6ttXgAAoG7z+Rqkshw8eFAOh0MtW7bUQw89pB07dujZZ59V48aN1atXL+Xn5ysoKMjjOUFBQXK5XMrPz7fuX9omXVwMnpeXd9nnVpTDUeGnlKu/6u63suOQpOefTfHdQKpJdbyfdqkNykZ97Iva2Jc/1qYic7VlQBowYIC6
d++usLAwSVKbNm30zTff6I033lCvXr0UHBxcKtC4XC6FhIR4hKHg4GDrtiSFhIRc9rkNGzas8DjDw5tU+Dm+7Lc8Ap0Bioj4/69viovlDHRKxsjpDJB+OGRV1u3ybqvpdo9tgU7Nema25r06vVreH1/WBldGfeyL2tgXtSmbLQOSw+GwwlGJli1batu2bZKkqKgo5ebmerTn5uYqMjJSUVFRkqScnBxrnVHJYbeS9ss9t6JOnjyn6lzi4nBc/KBWd78VUeguUm7uOY/7cjg8f/8w2FK3y7utptu9tl24kO8xp8qwQ21wedTHvqiNffljbUrmXB62DEgvvfSSvvzyS61atcratnfvXrVs2VKSFBMTo7S0NA0cOFCSdOzYMR07dkwxMTGKiopSdHS00tLSrICUlpam6OhoNW3aVLGxscrKytLx48d13XXXWe2xsbEVHqcxqpEPVU31W5HXl6TZk+v+4bUS1fV++ro2+HHUx76ojX1Rm7LZ4kKR3rp3764dO3Zo+fLl+vbbb/X6669r8+bNGj58uCTpwQcf1DvvvKONGzdq7969mjBhgrp166YbbrjBap87d65SU1OVmpqqefPmaciQIZKkG264QYmJiRo/frz27t2rjRs3asuWLfrNb37js/naVX35XjZO9wcAVJQt9yB16NBBL730khYsWKCXXnpJ119/vebNm6e4uDhJUlxcnKZNm6YFCxbozJkzuvPOO5WcnGw9f8SIETp58qRGjRqlgIAADRo0SMOGDbPaU1JSNGnSJD3wwAOKjIzUzJkzuUhkPVdfwh4AoHY4DBeKqbTc3OpfgxQR0aTa+62IyWOf07R5UyRJyU/PkHRxD0xhodv6fem2K7VX5jk19ZoOh0PPzJpYqffFDrXB5VEf+6I29uWPtSmZc3nY8hAbUBPYiwQAKC8CEgAAgBcCEgAAgBcCEgAAgBcCEgAAgBcCEvxGo0YNuR4SAKBcCEjwK5zJBgAoDwISAACAFwIS/Ep5vnaEw3AAAAIS/I4x5kdDEIfhAAAEJJRpXvJ8Xw+hRhGCAAA/hoCEMhUSIAAAfoyABL8TGOj09RAAADZHQAIAAPBCQAIAAPBCQIKHwECnZk9O8fUwAADwKQISSuEMLwCAvyMgwS+V54KRAAD/RUCC32JPGQDgcghIAAAAXghIAAAAXghI8FusQwIAXA4BCX7NGOPrIQAAbIiABAAA4IWABAAA4IWABL/GOiQAQFkISPB7XA8JAOCNgAS/x14kAIA3AhIs85Ln+3oIPsNeJADApQhIsBQSEgAAkERAAgAAKMUWAcnlcql///5KTU21tqWnp+vXv/614uLidPfdd2vjxo0ez/nFL36h1q1be/x8/fXXki5e/G/u3Lnq0qWLOnXqpJSUFBUXF1vPPXXqlEaPHq24uDj16NFD77zzTu1MFLbFOiQAwKWcvh5AQUGBxo4dq3379lnbcnJy9F//9V968MEH9fzzz2v37t1KSkpSZGSkunXrpqKiIn3zzTdau3atbrrpJut511xzjSRp5cqV2rJlixYuXCi3263x48crPDxcI0aMkCQlJSUpPz9f69evV0ZGhp555hm1aNFCHTp0qNW5w15YhwQAKOHTgLR//36NHTu21Nc9bN26VREREXryySclSTfddJNSU1P17rvvqlu3bjpy5IgKCwvVoUMHBQcHl+p3zZo1GjNmjBISEiRJ48aN00svvaQRI0bo22+/1SeffKKPP/5YzZs3V6tWrZSenq7XX3+dgAQAACT5+BDb9u3b1blzZ61fv95je9euXTVr1qxSjz9//ryki8GqWbNmZYaj7OxsHTt2THfccYe1LT4+XllZWTpx4oQyMjLUrFkzNW/e3KP9yy+/rK5pAQCAOs6ne5AGDx5c5vbmzZt7BJiTJ0/qvffe0+jRoyVJBw4cUGBgoB5//HHt2rVLLVq00IQJE9ShQwfl5ORIkpo2bWo9PyIiQpJ0/Phx5eTkeLRJUnh4uLKzsys8
foejwk8pV3/V3S/KJzDQedkaUBt7oz72RW3syx9rU5G5+nwN0pXk5+dr9OjRioiI0H/+539Kkg4dOqQzZ87oV7/6lcaMGaMNGzZo6NChev/995Wfny9JCgoKsvooue1yuZSXl+fRVtLucrkqPLbw8CaVnZZP+r0SpzNA+uFwZ1m3K9NeE33W1GtKUkREEwU6AxQRUXYNfFUblA/1sS9qY1/Upmy2Dkjff/+9nnjiCX3zzTd6/fXXFRISIklKTk5Wfn6+GjduLEmaOnWqvvjiC73zzjv66U9/KuliGCo5BFcSfkJCQhQcHFwqDLlcLjVs2LDC4zt58py8lk9VicNx8YNa3f2Wl9tdpEJ3kTWYUrfL2nal9so8x1evKSk395wK3UXKzT3n8d74ujb4cdTHvqiNffljbUrmXB62DUjnz5/Xo48+qm+//VarV6/2OFvN6XRa4UiSHA6HWrZsqezsbEVFRUm6eCZcyWG6ksNukZGRioqKUm5ursdr5ebmKjIyssJjNEY18qGqqX5xZSXv++Xef2pjb9THvqiNfVGbstniOkjeiouLNWrUKB05ckR//OMfdeutt3q0P/zww1q4cKHH4//973+rZcuWioqKUnR0tNLS0qz2tLQ0RUdHq2nTpoqNjVVWVpaOHz/u0R4bG1vj87Kz2ZNTfD0EnwsMdPI+AAAk2XQP0ltvvaXU1FS98sorCg0NtfYABQYGKiwsTD169NCiRYvUtm1btWjRQmvWrNG5c+d03333SZIefPBBzZ07V9ddd50kad68eRo+fLgk6YYbblBiYqLGjx+vSZMm6auvvtKWLVu0du1a30zWJlyuQgUG2vLjUKu4FhIAQLJpQPrwww9VXFysxx9/3GN7p06d9Mc//lHDhg1TQUGBpk+frtzcXMXExGjlypXWYbcRI0bo5MmTGjVqlAICAjRo0CANGzbM6iclJUWTJk3SAw88oMjISM2cOZNrIAEAAIvDeF+lEeWWm1v9i7QjIppUe7/lkfz0DAUGOlVY6JakMm9Xpr0m+qzp1ywsdGvavCke748va4Mroz72RW3syx9rUzLn8rDlGiTAlzjUCAAgIAEAAHghIAEAAHip9oD03XffVXeXAAAAtapSAalt27ZlBqGsrCz9/Oc/r/KgAF8KDHRqetJMXw8DAOBD5V6NunnzZr399tuSJGOMRo4cqcDAQI/HnDhxolJXpAbshushAYB/K3dA6tWrl44cOSJJ2r59u2JjY3XVVVd5PKZRo0bq1atX9Y4QAACglpU7IF111VUaNWqUJOn6669X3759rS+DRd02L3m+r4cAAICtVOqCL/fdd58OHz6sXbt2qbCw9KGIAQMGVHVcqCWzJ6eIa4UCAOCpUgFp2bJlmjt3rq6++upSh9kcDgcBqQ7hO9h+3PSkmXpm1kRfDwMAUMsq9S/jihUrNH78eI0YMaK6xwPYCou1AcA/Veo0/4KCAvXu3bu6xwLYRmCgU7Mnp/h6GAAAH6lUQLr33nv1+uuvs3aljiMA/Dj2HgGA/6rUIbbz58/rrbfe0pYtW9S8efNS10Nas2ZNtQwONYsAcGWszwIA/1Sp//rfdNNN+u1vf1vdYwEAALCFSgWkkushAfVdydeOPPs8Z7IBgD+pVEBKSkr60fZZs2ZVajCAHXEoEgD8T6UWaXtzu906dOiQ3n//fV177bXV0SUAAIDPVGoP0uX2EC1btkxff/11lQYEAADga9WyB6lEnz599NFHH1VnlwAAALWu2gLShQsXtGHDBl1zzTXV1SVgC4GBTiU/PdPXwwAA1KJKHWJr06aNHA5Hqe3BwcGaPn16lQcF2I0xRk+NnqrxU8b6eigAgFpQqYDkfSFIh8OhwMBA3XLLLWrcuHG1DAywGzdnswGA36hUQOrUqZMk6ZtvvtGBAwdUXFysFi1aEI4AAEC9UKmAdPbsWSUlJenjjz/W1VdfraKiIn3//fe64447tGjRIjVp0qS6xwkAAFBrKrVIe/r0
6Tp+/Ljef/99paam6v/+7//07rvv6sKFC1wkEgAA1HmVCkj/+7//q6lTp6ply5bWtltuuUWTJ0/Wxx9/XG2DAwAA8IVKBaTg4GA1aFD6qQ6HQ0VFRVUeFGrevOT5vh5CnTU9iVP+AaC+q1RA6tGjh5577jl9++231rZvvvlG06dP11133VVtg0PNKeSMrApzBjr1/LMpfDcbAPiBSi3SHj9+vEaOHKm7775boaGhkqQzZ87oZz/7mZ599tlqHSBgJ4QjAPAPFd6DdPjwYYWEhOiPf/yjNm/erClTpmjmzJl67733tHTpUoWFhVV4EC6XS/3791dqaqq1LTMzU8OGDVNsbKz69u2rzz77zOM5//znP9W/f3/FxMRoyJAhyszM9GhftWqVunbtqri4OE2cOFF5eXlWW0FBgSZOnKiEhAQlJiZqxYoVFR4zAACov8odkIwxmj59uu655x59+eWXkqTWrVurb9++2rRpk/r376/nn39expgKDaCgoEBPPvmk9u3b5/FaI0eOVEREhDZt2qRf/vKXGjVqlI4ePSpJOnr0qEaOHKmBAwfqrbfe0rXXXqsnnnjCeu0PP/xQCxcu1LRp07R69WplZGRozpw5Vv8pKSnatWuXVq9erSlTpmjhwoX64IMPKjRuAABQf5U7IK1Zs0bvv/++Fi1aZF0ossTixYu1aNEi/elPf9Ibb7xR7hffv3+/HnjgAY+1TJK0bds2ZWZmatq0abr55pv1+OOPKzY2Vps2bZIkbdy4UbfffruGDx+uW2+9VbNmzVJWVpa2b99ujXXo0KHq3r27OnTooOeee06bNm1SXl6eLly4oI0bN2rSpElq166devXqpUcffVTr1q0r97jh3wIDnSzUBoB6rtwBacOGDXr22WfVvXv3Mtt79OihcePGVSggbd++XZ07d9b69es9tmdkZOi2225To0aNrG3x8fFKT0+32hMSEqy2kJAQtWvXTunp6SoqKtJXX33l0R4bG6vCwkLt3btXe/fuldvtVlxcnEffGRkZKi4uLvfY4b8CA52sRQKAeq7ci7SzsrLUoUOHH31Mly5dNGPGjHK/+ODBg8vcnpOTo6ZNm3psCw8P1/Hjx6/YfvbsWRUUFHi0O51OhYWF6fjx42rQoIGuueYaBQUFWe0REREqKCjQ6dOnde2115Z7/HXV7Mkpvh4CAAC2Vu6AFB4erqysLF1//fWXfczx48crtUjbW15enkeAkaSgoCC5XK4rtufn51v3y2o3xpTZJsnqv7wcjgo9vNz9VXe/3lyuQgUGVuoERr9VVm1quk4ov9r620HFURv78sfaVGSu5f5XslevXnr55Ze1YsUKBQYGlmp3u91auHChEhMTy//qlxEcHKzTp097bHO5XGrYsKHV7h1mXC6XQkNDFRwcbN33bg8JCVFRUVGZbZKs/ssrPLxmvnOupvotEegMkNMZIP2wqL3kdlnbqtpeE3366jUlyRlw8b0LdAYoIoLvHLSbmv7bQeVRG/uiNmUrd0B64oknNGjQIA0cOFAPP/ywbr/9djVp0kRnzpzR7t27tXbtWn3//fdKSan64ZuoqCjt37/fY1tubq512CwqKkq5ubml2tu2bauwsDAFBwcrNzdXN998s6SL4e306dOKjIyUMUanTp2S2+2W03lx+jk5OWrYsKF1TafyOnny3KX/dlaZw3Hxg1rd/XordBdJDsfF3z+88GW3VbW9Jvr0wWs6HBcvFOku+v/tubnnKl8EVKva+ttBxVEb+/LH2pTMuTzKHZBCQ0O1YcMGzZ07V88//7x1XSFjjJo0aaK+fftq9OjRioiIqNyoLxETE6OlS5cqPz/f2quTlpam+Ph4qz0tLc16fF5envbs2aNRo0apQYMGat++vdLS0tS5c2dJUnp6upxOp9q0aXNx0k6n0tPTrYXcaWlpat++fZlfn/JjjFGNfKhqql9UXkk9Lq0LNbIf/nbsi9rYF7UpW4UWooSFhWn69OmaPHmyMjMz
dfbsWYWFhek//uM/FBAQUG2D6tSpk5o1a6akpCQ98cQT+uSTT7Rz507NmjVLknT//fdr+fLlWrp0qbp3765FixapefPmViAaPHiwJk+erFatWqlp06aaOnWqHnjgAYWEhEiSBgwYoKlTp2rmzJk6ceKEVqxYYfUNlEfJqf7PzJro66EAAGpApVbqBgUFWYevakJAQIAWL16sSZMmaeDAgbrxxhu1aNEiRUdHS5KaN2+ul19+WTNnztSiRYsUFxenRYsWyfHD6qt+/fopKytLkydPlsvlUu/evTV+/Hir/6SkJE2dOlVDhw5V48aNNXr0aPXu3bvG5mMnnMFWfTjVHwDqL4ep6KWvYcnNrf41SBERTaq930slP33xMgyBgU4VFro9bpe1rartNdGnr14zJCRYeXkF1rbCQremzZtSPYVBldTG3w4qh9rYlz/WpmTO5VHh72IDcBFX1AaA+ouABFQBh9kAoH4iIAEAAHghIAFVwBXJAaB+IiABVcA6JAConwhIQBWxDgkA6h8CEgAAgBcCEgAAgBcCEgAAgBcCElBFLNQGgPqHgORH5iXP9/UQ6i1jDCEJAOoRApIfKeRsqxrF2WwAUH8QkIBqwAUjAaB+ISABAAB4ISABAAB4ISABAAB4ISAB1YTT/QGg/iAgAdWIM9kAoH4gIAEAAHghIPmJ2ZNTfD0EAADqDAKSn+DQDwAA5UdAAgAA8EJAAqpRYKCTw5kAUA8QkIBqxhfXAkDdR0ACagBrvgCgbiMgAQAAeCEg+YF5yfN9PQS/w1W1AaBuIyD5gUIO9/gEh9kAoO4iIAEAAHghIAEAAHghIAE1pOSaSKxFAoC6x7YB6e2331br1q1L/bRp00aS9Lvf/a5U2yeffGI9f9WqVeratavi4uI0ceJE5eXlWW0FBQWaOHGiEhISlJiYqBUrVtT6/OAfjDGsRQKAOsjp6wFcTt++fdW1a1frvtvt1tChQ9WtWzdJ0oEDBzRnzhz95Cc/sR5z9dVXS5I+/PBDLVy4UHPmzFF4eLiSkpI0Z84cTZ48WZKUkpKiXbt2afXq1Tp69KieeuopRUdHq0+fPrU3wVrCVZ0BAKg42wakhg0bqmHDhtb9JUuWyBijcePGyeVy6ciRI2rfvr0iIyNLPXfNmjUaOnSounfvLkl67rnnNGLECI0fP17GGG3cuFGvvfaa2rVrp3bt2mnfvn1at25dvQxILlehAgNtW2YAAGzJtofYLnX69Gm99tprGjt2rIKCgnTw4EE5HA7dcMMNpR5bVFSkr776SgkJCda22NhYFRYWau/evdq7d6/cbrfi4uKs9vj4eGVkZKi4uLhW5gMAAOytTuxaeOONN9S0aVNrD8/BgwfVuHFjTZgwQdu3b9d1112n0aNH66677tLZs2dVUFCgpk2bWs93Op0KCwvT8ePH1aBBA11zzTUKCgqy2iMiIlRQUKDTp0/r2muvLfe4HI7qm+Ol/VV3v6i6qtaGmtYs/nbsi9rYlz/WpiJztX1AKjkk9uijj1rbDh48qPz8fCUmJuqxxx7TRx99pN/97ndav369IiIiJMkjAJXcd7lcMsaU2SZJLperQmMLD29SmSnVar+BzgA5nQGSMdZvSWXeron2+vSakuQMCKhwn4HOAEVE1MxnBZ5q6m8SVUdt7IvalM32Aemrr75Sdna2+vXrZ2174okn9PDDD1uLstu0aaPdu3drw4YN+u///m9JpcOOy+VSSEiIioqKymyT5LHmqTxOnjx36b+dVeZwXPygVme/he4iyeHw/P3Di5W6XRPt9eQ1HQ7JGeiUu6jirymHQ79/NEnPPj+x/IVDhdTE3w6qB7WxL3+sTcmcy8P2Aenvf/+7EhISrDAkSQ0aNPC4L0ktW7bU/v37FRYWpuDgYOXm5urmm2+WdPEMuNOnTysyMlLGGJ06dUput1tO58Xp5+TkqGHDhgoNDa3Q2IxRjXyoaqpfVF5JPSpbF5erkJrWAv527Iva2Be1KZvt
F2nv3LlTHTt29Nj29NNPKykpyWPb3r171bJlSzVo0EDt27dXWlqa1Zaeni6n06k2bdqobdu2cjqdSk9Pt9rT0tLUvn17NWhg+7cDAADUAtsngn379umWW27x2NajRw+9++672rx5sw4fPqyFCxcqLS1NDz30kCRp8ODBWr58ubZu3aqdO3dq6tSpeuCBBxQSEqKQkBANGDBAU6dO1c6dO7V161atWLFCQ4YM8cX0AACADdn+EFtubm6pQ1+9e/fWlClT9Morr+jo0aO69dZbtWzZMjVv3lyS1K9fP2VlZWny5MlyuVzq3bu3xo8fbz0/KSlJU6dO1dChQ9W4cWONHj1avXv3rtV51QYuEmkfXIsKAOoWhzEceays3NzqX6QdEdGk2vpNfnqGpIv/OBcWuq3fl26r6fb69JohIcHKyyuodJ9ud5GemcVC7ZpQ3X87qD7Uxr78sTYlcy4P2x9iA+oLvpMNAOoOAhIAAIAXAhJQSwIDnZqeNNPXwwAAlAMBCahFHGYDgLqBgAQAAOCFgATUosBAJ5dfAIA6gIAE1DIOswGA/RGQAAAAvBCQAAAAvBCQgFrG6f4AYH8EJMAHStYhEZQAwJ4ISIAPlJzNVtaCbUITAPgeAQnwkcudzcZZbgDgewSkempe8nxfDwEAgDqLgFRPFbIXAgCASiMgAT4UGOj09RAAAGUgIAE+xCn/AGBPBCTAx1iUDQD2Q0ACfIwvsAUA+yEg1UP8Y1v3sBcJAOyFgFQP8Y8tAABVQ0ACbIDF2gBgLwQkwCaMMRweBQCbICABNsLhUQCwBwISAACAFwJSPcN3sAEAUHUEpHqG72ADAKDqCEiAzTRq1JAz2gDAxwhIgA2xWBsAfIuABAAA4MXWAemjjz5S69atPX7GjBkjSdqzZ49+9atfKSYmRvfff7927drl8dwtW7aoZ8+eiomJ0ciRI/Xdd99ZbcYYzZ07V126dFGnTp2UkpKi4uLiWp0b8GNKvp+NQ20A4Bu2Dkj79+9X9+7d9dlnn1k/06dP14ULF/TYY48pISFBb7/9tuLi4vT444/rwoULkqSdO3dq0qRJGjVqlNavX6+zZ88qKSnJ6nflypXasmWLFi5cqAULFujdd9/VypUrfTVNoEwuVyGH2gDAR2wdkA4cOKBWrVopMjLS+gkNDdX777+v4OBgTZgwQTfffLMmTZqkq666Sh988IEkae3atbrnnns0YMAAtWnTRikpKfrrX/+qzMxMSdKaNWs0ZswYJSQkqEuXLho3bpzWrVvny6kCAAAbsX1Auummm0ptz8jIUHx8vBwOhyTJ4XCoY8eOSk9Pt9oTEhKsxzdr1kzR0dHKyMhQdna2jh07pjvuuMNqj4+PV1ZWlk6cOFGj86lpfE0FAADVw7YByRijQ4cO6bPPPtPdd9+tnj17au7cuXK5XMrJyVHTpk09Hh8eHq7jx49Lkk6cOHHZ9pycHEnyaI+IiJAk6/l1FYdj6p/AQKevhwAAfsm2//U9evSo8vLyFBQUpBdffFFHjhzR9OnTlZ+fb22/VFBQkFwulyQpPz//su35+fnW/UvbJFnPL68fdmBVm5L+qrtfVJ0va8Pn4cr427EvamNf/libiszVtgHp+uuvV2pqqq6++mo5HA61bdtWxcXFGj9+vDp16lQqzLhcLjVs2FCSFBwcXGZ7SEiIRxgKDg62bktSSEhIhcYYHt6kUnOrqX4DnQFyOgMkY6zfkn50W02316fXlCRnQEDtzjPQqVnPzNa8V6eX/4Pgx2rqbxJVR23si9qUzbYBSZLCwsI87t98880qKChQZGSkcnNzPdpyc3Otw2ZRUVFltkdGRioqKkqSlJOTo+bNm1u3JSkyMrJC4zt58tyl/3ZWmcNx8YNa2X4L3UWSw+H5+4eOL7utptvryWs6HBfDirvIB/MsdOv3jybp2ecnll14VPlvBzWH2tiXP9amZM7lYds1SH//+9/VuXNn5eXlWdv+9a9/KSwsTPHx8fry
yy9lfqioMUZffPGFYmJiJEkxMTFKS0uznnfs2DEdO3ZMMTExioqKUnR0tEd7WlqaoqOjS61buhJjqv+nsv0+/ywLtGvSpbXxzesbJT89s0Y+c/Xlp6Q+/Njvh9rY98cfa1Netg1IcXFxCg4O1jPPPKODBw/qr3/9q1JSUvToo4+qT58+Onv2rGbMmKH9+/drxowZysvL0z333CNJevDBB/XOO+9o48aN2rt3ryZMmKBu3brphhtusNrnzp2r1NRUpaamat68eRoyZIgvp1tlLNCu/6gxANQe2x5ia9y4sZYvX66ZM2fq/vvv11VXXaVf//rXevTRR+VwOLRkyRJNmTJFGzZsUOvWrbV06VI1atRI0sVwNW3aNC1YsEBnzpzRnXfeqeTkZKvvESNG6OTJkxo1apQCAgI0aNAgDRs2zEczBSpmetJMPTOLw20AUJNsG5Ak6dZbb73sFa47dOigP/3pT5d97sCBAzVw4MAy2wICApSUlORxdW3AzgIDnSosdEtiTxIA1AbbHmID4Knk+9kAADWPgATUIS5XIRePBIBaQEACAADwQkACAADwQkAC6pjAQKemJ8309TAAoF4jIAF1EGeyAUDNIiABdRB7kQCgZhGQgDqKvUgAUHMISAAAAF4ISAAAAF4ISPXAvOT5vh4CfIALRgJAzSEg1QOFrEXxSyzUBoCaQ0AC6jAWagNAzSAgAQAAeCEg1XF8u7t/4zAbANQMAlIdxyEWGGMISQBQzQhIQD1AUAaA6kVAAgAA8EJAAgAA8EJAqsNYoA0AQM0gINVhrDtBCa6qDQDVi4AEAADghYAE1ANcDwkAqhcBCagnOOQKANWHgAQAAOCFgATUExxmA4DqQ0AC6hEOswFA9SAg1VHzkuf7eggAANRbBKQ6qpA9BSgDh9kAoHoQkIB6hsNsAFB1BCQAAAAvtg5I2dnZGjNmjDp16qSuXbtq1qxZKigokCRNnz5drVu39vhZu3at9dwtW7aoZ8+eiomJ0ciRI/Xdd99ZbcYYzZ07V126dFGnTp2UkpKi4uLiWp8fAACwJ9t+gZMxRmPGjFFoaKjWrVunM2fOaOLEiWrQoIGeeuopHThwQGPHjtV9991nPadx48aSpJ07d2rSpEl67rnn1KZNG82YMUNJSUlasmSJJGnlypXasmWLFi5cKLfbrfHjxys8PFwjRozwyVyB6sT3sgFA1dl2D9LBgweVnp6uWbNm6dZbb1VCQoLGjBmjLVu2SJIOHDig2267TZGRkdZPSEiIJGnt2rW65557NGDAALVp00YpKSn661//qszMTEnSmjVrNGbMGCUkJKhLly4aN26c1q1b57O5AgAAe7FtQIqMjNSyZcsUERHhsf38+fM6f/68srOzddNNN5X53IyMDCUkJFj3mzVrpujoaGVkZCg7O1vHjh3THXfcYbXHx8crKytLJ06cqJG5VLfZk1N8PQQAAOo12+6LDw0NVdeuXa37xcXFWrt2rbp06aIDBw7I4XDo1Vdf1d/+9jeFhYXpkUcesQ63nThxQk2bNvXoLzw8XMePH1dOTo4kebSXhLDjx4+Xet6PcTgqPb0f7e9K/bpchRxGqWXlrY1d1JVxVpe6Vh9/Qm3syx9rU5G51pl/ZefMmaM9e/borbfe0u7du+VwONSyZUs99NBD2rFjh5599lk1btxYvXr1Un5+voKCgjyeHxQUJJfLpfz8fOv+pW2S5HK5KjSm8PAmVZxV5foNdAbI6QyQjJGkMm+Xd1tNt9en15QkZ0DZ772d5ilJERE189m0u5r6m0TVURv7ojZlqxMBac6cOVq9erVeeOEFtWrVSrfeequ6d++usLAwSVKbNm30zTff6I033lCvXr0UHBxcKuy4XC6FhIR4hKHg4GDrtiRrDVN5nTx57tJ/k6rM4bj4Qb1Sv4XuIsnhuPj7hyeWul3ebTXdXk9e0+GQnIFOuYvsP8/AQKd+/2iSnn1+ovxFef92UPuojX35Y21K5lwetg9I
ycnJeuONNzRnzhzdfffdkiSHw2GFoxItW7bUtm3bJElRUVHKzc31aM/NzVVkZKSioqIkSTk5OWrevLl1W7q47qkijFGNfKhqql9UXkk96kpdXK7COjPW6sTfjn1RG/uiNmWz7SJtSVq4cKHefPNNzZ8/X/369bO2v/TSSxo2bJjHY/fu3auWLVtKkmJiYpSWlma1HTt2TMeOHVNMTIyioqIUHR3t0Z6Wlqbo6OgKrT8C7IyvHAGAqrHtHqQDBw5o8eLFeuyxxxQfH2/t5ZGk7t27a+nSpVq+fLl69eqlzz77TJs3b9aaNWskSQ8++KAefvhhxcbGqn379poxY4a6deumG264wWqfO3eurrvuOknSvHnzNHz48NqfJFCD+MoRAKg82wakjz/+WEVFRXrllVf0yiuveLT9+9//1ksvvaQFCxbopZde0vXXX6958+YpLi5OkhQXF6dp06ZpwYIFOnPmjO68804lJydbzx8xYoROnjypUaNGKSAgQIMGDSq1RwoAAPgv2wakxx57TI899thl23v27KmePXtetn3gwIEaOHBgmW0BAQFKSkpSUlJSlccJAADqH1uvQUJp85Ln+3oIqCO4VhYAVB4BqY4pZF0JAAA1joAEAADghYAEAADghYAE1FNcCwkAKo+ABNRjXAsJACqHgAQAAOCFgATUYxxmA4DKISAB9RyH2QCg4ghIdcjsySm+HgLqIPYiAUDFEZDqEPYEoLL47ABAxRCQAAAAvBCQAAAAvBCQAD/AF9cCQMUQkAAAALwQkAAAALwQkOoITvFHVXCqPwBUDAGpjuA0bVQVnyEAKD8CEuAn2IsEAOVHQAL8CHuRAKB8CEgAAABeCEiAH+EwGwCUDwEJ8DMcZgOAKyMgAX6Gq2oDwJURkOqAecnzfT0EAAD8CgGpDijkkAiqEeuQAODKCEiAHzLGEJIA4EcQkGyOrxhBTWGxNgBcHgHJ5vhHDDWFxdoAcHkEJMBPsRYJAC7PbwNSQUGBJk6cqISEBCUmJmrFihW+HpKHecnzObyGGsdaJAAom98GpJSUFO3atUurV6/WlClTtHDhQn3wwQe+Hpal0FXI4TXUCkISAJTml4sQLly4oI0bN+q1115Tu3bt1K5dO+3bt0/r1q1Tnz59fD08oNYZY3w9BACwFb/cg7R371653W7FxcVZ2+Lj45WRkaHi4mIfjgzwjcBAp2ZPTmFPEgD8wC8DUk5Ojq655hoFBQVZ2yIiIlRQUKDTp0/7bmCAD7lchTLGaF7yfIISAL/nl4fY8vLyPMKRJOu+y+Uqdz8NGkjVeWTC4bj4e2HKIjUOvUruwiJJkjMwoNTtsrZdqb0yz+E1L952OKSg4CAFBgUqwFl/51lyOyAgQK++sERut/viNqdTbrfb+n3pNu/2/LwCPfnMGC2YvUglStrdhUV68pkx1vb50xdY9y+9fSXejy3526nuv0lUHbWxL3+sTcmcy/VY44eLD/785z9r+vTp+sc//mFtO3DggPr27avU1FSFhYX5bnAAAMDn/PIQW1RUlE6dOmX936908bBbw4YNFRoa6sORAQAAO/DLgNS2bVs5nU6lp6db29LS0tS+fXs1aOCXbwkAALiEX6aBkJAQDRgwQFOnTtXOnTu1detWrVixQkOGDPH10AAAgA345Rok6eJC7alTp+ovf/mLGjdurBEjRmjYsGG+HhYAALABvw1IAAAAl+OXh9gAAAB+DAEJAADACwEJAADACwHJJgoKCjRx4kQlJCQoMTFRK1as8PWQ6j2Xy6X+/fsrNTXV2paZmalhw4YpNjZWffv21WeffebxnH/+85/q37+/YmJiNGTIEGVmZnq0r1q1Sl27dlVcXJwmTpyovLy8WplLfZKdna0xY8aoU6dO6tq1q2bNmqWCggJJ1MfXDh8+rBEjRiguLk7dunXTsmXLrDZqYx+PPfaYnn76aev+nj179Ktf/UoxMTG6//77tWvXLo/Hb9myRT179lRMTIxGjhyp
7777zmozxmju3Lnq0qWLOnXqpJSUFP/5zlIDW5g2bZq59957za5du8xf/vIXExcXZ/785z/7elj1Vn5+vhk5cqRp1aqV2bZtmzHGmOLiYnPvvfeasWPHmv3795tXX33VxMTEmKysLGOMMVlZWSY2NtYsX77cfP311+b3v/+96d+/vykuLjbGGPPBBx+Y+Ph487//+78mIyPD9O3b1zz33HM+m2NdVFxcbB544AHz6KOPmq+//trs2LHD9OrVyzz//PPUx8eKiopM7969zdixY82hQ4fMp59+ajp27Gj+53/+h9rYyJYtW0yrVq3MU089ZYwx5vvvvzd33nmnef75583+/ftNcnKy+elPf2q+//57Y4wxGRkZpkOHDuZPf/qT+de//mUeeugh89hjj1n9LV++3Nx1111mx44d5vPPPzeJiYlm2bJlPplbbSMg2cD3339v2rdvb/1DbYwxixYtMg899JAPR1V/7du3z/ziF78w9957r0dA+uc//2liY2Ot/3AYY8zQoUPNggULjDHGvPjiix41uXDhgomLi7OeP3jwYOuxxhizY8cO06FDB3PhwoXamFa9sH//ftOqVSuTk5NjbXv33XdNYmIi9fGx7Oxs8/vf/96cO3fO2jZy5EgzZcoUamMTp06dMj/72c/M/fffbwWkjRs3mh49elhhtLi42PTq1cts2rTJGGPM+PHjrccaY8zRo0dN69atzbfffmuMMeauu+6yHmuMMZs3bzbdu3evrSn5FIfYbGDv3r1yu92Ki4uztsXHxysjI8N/dmXWou3bt6tz585av369x/aMjAzddtttatSokbUtPj7euuJ6RkaGEhISrLaQkBC1a9dO6enpKioq0ldffeXRHhsbq8LCQu3du7dmJ1SPREZGatmyZYqIiPDYfv78eerjY02bNtWLL76oxo0byxijtLQ07dixQ506daI2NjF79mz98pe/1C233GJty8jIUHx8vBw/fEurw+FQx44dL1ubZs2aKTo6WhkZGcrOztaxY8d0xx13WO3x8fHKysrSiRMnamdSPkRAsoGcnBxdc801CgoKsrZFRESooKBAp0+f9t3A6qnBgwdr4sSJCgkJ8diek5Ojpk2bemwLDw/X8ePHr9h+9uxZFRQUeLQ7nU6FhYVZz8eVhYaGqmvXrtb94uJirV27Vl26dKE+NtKjRw8NHjxYcXFxuvvuu6mNDXz++ef6v//7Pz3xxBMe269UmxMnTly2PScnR5I82kv+58UfakNAsoG8vDyPcCTJuu9yuXwxJL90uTqU1ODH2vPz8637l3s+Km7OnDnas2eP/vu//5v62MiCBQv06quv6l//+pdmzZpFbXysoKBAU6ZM0eTJk9WwYUOPtivVJj8/v0K18ad/m5y+HgCk4ODgUh+2kvveH3bUnODg4FJ77Fwul1WDy9UpNDRUwcHB1n3vdu89VSifOXPmaPXq1XrhhRfUqlUr6mMj7du3l3TxH+Zx48bp/vvvL3XWGbWpPQsXLtTtt9/usfe1xOXe+yvVJiQkxCMMedfJH2rDHiQbiIqK0qlTp+R2u61tOTk5atiwoUJDQ304Mv8SFRWl3Nxcj225ubnW7uXLtUdGRiosLEzBwcEe7W63W6dPn1ZkZGTND76eSU5O1sqVKzVnzhzdfffdkqiPr+Xm5mrr1q0e22655RYVFhYqMjKS2vjQe++9p61btyouLk5xcXF699139e677youLq5KfzdRUVGSZB1qu/S2P9SGgGQDbdu2ldPptBbNSVJaWprat2+vBg0oUW2JiYnR7t27rd3K0sU6xMTEWO1paWlWW15envbs2aOYmBg1aNBA7du392hPT0+X0+lUmzZtam8S9cDChQv15ptvav78+erXr5+1nfr41pEjRzRq1ChlZ2db23bt2qVrr71W8fHx1MaH/vjHP+rdd9/V5s2btXnzZvXo0UM9evTQ5s2bFRMToy+//FLmh69dNcboiy++uGxtjh07pmPHjikm
JkZRUVGKjo72aE9LS1N0dHSpdUv1ko/PosMPnn32WdOvXz+TkZFhPvroI9OxY0fz4Ycf+npY9d6lp/m73W7Tt29f84c//MF8/fXXZsmSJSY2Nta6lktmZqZp3769WbJkiXUtl3vvvdc6fXbLli2mY8eO5qOPPjIZGRmmX79+Jjk52Wdzq4v2799v2rZta1544QVz4sQJjx/q41tut9sMHDjQDB8+3Ozbt898+umn5qc//alZtWoVtbGZp556yjp1/9y5c6ZLly4mOTnZ7Nu3zyQnJ5s777zTuiTDF198Ydq1a2c2bNhgXQfp8ccft/pasmSJSUxMNNu2bTPbtm0ziYmJZsWKFT6ZV20jINnEhQsXzIQJE0xsbKxJTEw0K1eu9PWQ/MKlAckYY7755hvzm9/8xtx+++2mX79+5h//+IfH4z/99FPTu3dv06FDBzN06FDrWiEllixZYn7yk5+Y+Ph4k5SUZPLz82tlHvXFkiVLTKtWrcr8MYb6+Nrx48fNyJEjTceOHc2dd95pXnnlFSvkUBv7uDQgGXPxYpADBgww7du3N4MGDTK7d+/2ePymTZvMXXfdZWJjY83IkSPNd999Z7W53W4zc+ZMk5CQYDp37mzmzJlj1by+cxjzw343AAAASGINEgAAQCkEJAAAAC8EJAAAAC8EJAAAAC8EJAAAAC8EJAAAAC8EJAAAAC8EJAAAAC8EJAA+deTIEbVu3VpHjhzx6That26t1NTUSj33z3/+s06ePFnNI7ro888/14EDB2qkbwCXR0ACgCrIysrSH/7wB+Xl5dVI/8OGDSv1besAah4BCQCqgG9rAuonAhIA2zh79qzGjx+vjh07KjExUcnJycrPz5ckpaamqkePHnr99dfVtWtXxcbGavz48XK5XOXq+/z580pKStJPfvIT3X777erTp4+2bt3q8ZgdO3aod+/eiomJ0e9//3udOXPGaps/f74SExPVoUMHPfzww9q3b58k6ec//7n1++2339bLL7+sJ554Qr/5zW/UqVMnbd++XdnZ2RozZozuuOMO3X777brvvvuUlpZm9X348GGNGDFCcXFx6tatm9asWSNJ6tGjhyRpyJAhevnllyv5rgKoDAISANuYNGmSzp07pzfeeEOLFy/WV199pWnTplntJ06c0Icffqhly5bp5Zdf1l/+8hdt3ry5XH3PmDFDhw4d0ooVK7RlyxYlJCRo0qRJHgFr3bp1mjRpktatW6dDhw5p1qxZkqSPPvpI69ev14svvqgtW7YoIiJCSUlJkqSNGzdav/v27StJ+vjjj9W/f3+tXr1aHTp00Lhx41RUVKQ333xTmzdvVlRUlKZOnSpJKigo0PDhw3XVVVdpw4YNmjx5sl544QV98skneuuttyRJL7/8soYPH16l9xZABRkA8KHMzEzTqlUrc/jwYdOmTRtz9uxZq23v3r3Wtm3btplWrVqZr7/+2mofOXKkeeaZZ8r1Ops2bTL//ve/rfsHDhwwrVq1MkePHjXGGNOqVSuzdu1aqz01NdXcdttt5ty5c2blypXmzjvvNFlZWcYYY06ePGl27NjhMf7MzExjjDELFiwwP/3pT61+iouLzapVq8yxY8esbX/7299MmzZtjDHGbN261cTGxppz585Z7W+99Zb59NNPrXFt27atXHMEUH2cvg5oACBJf/vb31RcXKyf/exnHtuLi4t1+PBh6/6NN95o3W7cuLHcbne5+h8wYIC2bt2qDRs26ODBg9q9e7ckqaioyHpM+/btrdu33Xab3G63vv32W/Xr109r167Vz3/+c8XGxqpnz54aNGjQZV/r+uuvt247HA49+OCDev/99/XFF1/o0KFD2rVrl4qLiyVJhw4dUosWLdS4cWPrOffff3+55gSg5hCQANiC2+1WkyZNtGnTplJtUVFRysjIkCQFBQV5tJlyLpKeMGGCvvzyS/3yl7/Ugw8+qMjISP3nf/6nx2MCAgJK9RsYGKjIyEj9+c9/1j/+
8Q998sknWr58uTZs2HDZw3vBwcHW7eLiYg0fPlxnz55V37591aNHDxUWFmrUqFGSJKeT/wwDdsQaJAC20LVrV507d04Oh0M33nijbrzxRuXn5yslJaXcC7Ev5/z589qyZYteeOEFjRkzRr169bIWYF8asL7++mvr9s6dOxUYGKjmzZvr008/1caNG9WtWzc999xzeuedd/TNN9/o66+/lsPh+NHX3r9/v3bs2KFVq1bpt7/9rbp166YTJ05Yr33TTTfp8OHDHpcJmD17tqZPn16lOQOoGgISAFsIDg5W165dNW7cOO3cuVO7d+9WUlKSLly4oNDQ0Cr1HRQUpJCQEP3lL3/RkSNH9Pe//91a/H1p+HrhhRf0+eefKz09XdOnT9evf/1rhYSEqLi4WCkpKfroo4905MgRvf322woJCdFNN92kkJAQSdLevXv1/fffl3rt0NBQNWjQQO+9956ysrL0wQcfWGekuVwuJSYmKiIiQpMnT9aBAwf08ccf680331RiYqIkqVGjRtq3b5/OnTtXpfcAQMUQkADYRkpKipo3b65hw4bpkUceUYsWLTR//vwq9xsUFKQ5c+boww8/VL9+/fT888/rd7/7nSIjI/Wvf/3LetwjjzyiSZMm6ZFHHlFcXJzGjRsn6eLp9mPGjNGsWbN0zz336P3339fixYt19dVX69prr9UvfvEL/eEPf7DOaLvUddddp6lTp+q1115T//79tXTpUj3zzDNyOp3as2ePnE6nFi9erBMnTui+++7TjBkzNGHCBHXr1k2S9PDDDyslJYXT/IFa5jDlPYAPAADgJ9iDBAAA4IXTJwDUeTNmzLAuqliWxx9/XL/97W9rcUQA6joOsQGo87777rsfXcR89dVXKywsrPYGBKDOIyABAAB4YQ0SAACAFwISAACAFwISAACAFwISAACAFwISAACAFwISAACAFwISAACAFwISAACAl/8HLdLj2gnQFo4AAAAASUVORK5CYII=",
488
+ "text/plain": [
489
+ "<Figure size 640x480 with 1 Axes>"
490
+ ]
491
+ },
492
+ "metadata": {},
493
+ "output_type": "display_data"
494
+ }
495
+ ],
496
+ "source": [
497
+ "sns.histplot(x='len_abstract', data=df_cleaned)\n",
498
+ "plt.show()"
499
+ ]
500
+ },
501
+ {
502
+ "cell_type": "markdown",
503
+ "metadata": {},
504
+ "source": [
505
+ "Filter the data to keep only records with an abstract length between 446.00 and 908.00 (inclusive)"
506
+ ]
507
+ },
508
+ {
509
+ "cell_type": "code",
510
+ "execution_count": 21,
511
+ "metadata": {},
512
+ "outputs": [],
513
+ "source": [
514
+ "df = df_cleaned[df_cleaned['len_abstract'].between(446.00, 908.00)].reset_index(drop=True)"
515
+ ]
516
+ },
517
+ {
518
+ "cell_type": "code",
519
+ "execution_count": 22,
520
+ "metadata": {},
521
+ "outputs": [
522
+ {
523
+ "data": {
524
+ "text/html": [
525
+ "<div>\n",
526
+ "<style scoped>\n",
527
+ " .dataframe tbody tr th:only-of-type {\n",
528
+ " vertical-align: middle;\n",
529
+ " }\n",
530
+ "\n",
531
+ " .dataframe tbody tr th {\n",
532
+ " vertical-align: top;\n",
533
+ " }\n",
534
+ "\n",
535
+ " .dataframe thead th {\n",
536
+ " text-align: right;\n",
537
+ " }\n",
538
+ "</style>\n",
539
+ "<table border=\"1\" class=\"dataframe\">\n",
540
+ " <thead>\n",
541
+ " <tr style=\"text-align: right;\">\n",
542
+ " <th></th>\n",
543
+ " <th>id</th>\n",
544
+ " <th>submitter</th>\n",
545
+ " <th>authors</th>\n",
546
+ " <th>title</th>\n",
547
+ " <th>comments</th>\n",
548
+ " <th>journal-ref</th>\n",
549
+ " <th>doi</th>\n",
550
+ " <th>report-no</th>\n",
551
+ " <th>categories</th>\n",
552
+ " <th>license</th>\n",
553
+ " <th>abstract</th>\n",
554
+ " <th>versions</th>\n",
555
+ " <th>update_date</th>\n",
556
+ " <th>authors_parsed</th>\n",
557
+ " <th>cleaned_abstracts</th>\n",
558
+ " <th>len_abstract</th>\n",
559
+ " </tr>\n",
560
+ " </thead>\n",
561
+ " <tbody>\n",
562
+ " <tr>\n",
563
+ " <th>0</th>\n",
564
+ " <td>0704.0001</td>\n",
565
+ " <td>Pavel Nadolsky</td>\n",
566
+ " <td>C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-...</td>\n",
567
+ " <td>Calculation of prompt diphoton production cros...</td>\n",
568
+ " <td>37 pages, 15 figures; published version</td>\n",
569
+ " <td>Phys.Rev.D76:013009,2007</td>\n",
570
+ " <td>10.1103/PhysRevD.76.013009</td>\n",
571
+ " <td>ANL-HEP-PR-07-12</td>\n",
572
+ " <td>hep-ph</td>\n",
573
+ " <td>None</td>\n",
574
+ " <td>A fully differential calculation in perturba...</td>\n",
575
+ " <td>[{'created': 'Mon, 2 Apr 2007 19:18:42 GMT', '...</td>\n",
576
+ " <td>2008-11-26</td>\n",
577
+ " <td>[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,...</td>\n",
578
+ " <td>fully differential calculation perturbative...</td>\n",
579
+ " <td>695</td>\n",
580
+ " </tr>\n",
581
+ " <tr>\n",
582
+ " <th>1</th>\n",
583
+ " <td>0704.0002</td>\n",
584
+ " <td>Louis Theran</td>\n",
585
+ " <td>Ileana Streinu and Louis Theran</td>\n",
586
+ " <td>Sparsity-certifying Graph Decompositions</td>\n",
587
+ " <td>To appear in Graphs and Combinatorics</td>\n",
588
+ " <td>None</td>\n",
589
+ " <td>None</td>\n",
590
+ " <td>None</td>\n",
591
+ " <td>math.CO cs.CG</td>\n",
592
+ " <td>http://arxiv.org/licenses/nonexclusive-distrib...</td>\n",
593
+ " <td>We describe a new algorithm, the $(k,\\ell)$-...</td>\n",
594
+ " <td>[{'created': 'Sat, 31 Mar 2007 02:26:18 GMT', ...</td>\n",
595
+ " <td>2008-12-13</td>\n",
596
+ " <td>[[Streinu, Ileana, ], [Theran, Louis, ]]</td>\n",
597
+ " <td>describe new algorithm $ k,\\ell)$-pebble ga...</td>\n",
598
+ " <td>619</td>\n",
599
+ " </tr>\n",
600
+ " <tr>\n",
601
+ " <th>2</th>\n",
602
+ " <td>0704.0003</td>\n",
603
+ " <td>Hongjun Pan</td>\n",
604
+ " <td>Hongjun Pan</td>\n",
605
+ " <td>The evolution of the Earth-Moon system based o...</td>\n",
606
+ " <td>23 pages, 3 figures</td>\n",
607
+ " <td>None</td>\n",
608
+ " <td>None</td>\n",
609
+ " <td>None</td>\n",
610
+ " <td>physics.gen-ph</td>\n",
611
+ " <td>None</td>\n",
612
+ " <td>The evolution of Earth-Moon system is descri...</td>\n",
613
+ " <td>[{'created': 'Sun, 1 Apr 2007 20:46:54 GMT', '...</td>\n",
614
+ " <td>2008-01-13</td>\n",
615
+ " <td>[[Pan, Hongjun, ]]</td>\n",
616
+ " <td>evolution earth moon system describe dark m...</td>\n",
617
+ " <td>631</td>\n",
618
+ " </tr>\n",
619
+ " <tr>\n",
620
+ " <th>3</th>\n",
621
+ " <td>0704.0006</td>\n",
622
+ " <td>Yue Hin Pong</td>\n",
623
+ " <td>Y. H. Pong and C. K. Law</td>\n",
624
+ " <td>Bosonic characters of atomic Cooper pairs acro...</td>\n",
625
+ " <td>6 pages, 4 figures, accepted by PRA</td>\n",
626
+ " <td>None</td>\n",
627
+ " <td>10.1103/PhysRevA.75.043613</td>\n",
628
+ " <td>None</td>\n",
629
+ " <td>cond-mat.mes-hall</td>\n",
630
+ " <td>None</td>\n",
631
+ " <td>We study the two-particle wave function of p...</td>\n",
632
+ " <td>[{'created': 'Sat, 31 Mar 2007 04:24:59 GMT', ...</td>\n",
633
+ " <td>2015-05-13</td>\n",
634
+ " <td>[[Pong, Y. H., ], [Law, C. K., ]]</td>\n",
635
+ " <td>study particle wave function pair atom ferm...</td>\n",
636
+ " <td>638</td>\n",
637
+ " </tr>\n",
638
+ " <tr>\n",
639
+ " <th>4</th>\n",
640
+ " <td>0704.0007</td>\n",
641
+ " <td>Alejandro Corichi</td>\n",
642
+ " <td>Alejandro Corichi, Tatjana Vukasinac and Jose ...</td>\n",
643
+ " <td>Polymer Quantum Mechanics and its Continuum Limit</td>\n",
644
+ " <td>16 pages, no figures. Typos corrected to match...</td>\n",
645
+ " <td>Phys.Rev.D76:044016,2007</td>\n",
646
+ " <td>10.1103/PhysRevD.76.044016</td>\n",
647
+ " <td>IGPG-07/03-2</td>\n",
648
+ " <td>gr-qc</td>\n",
649
+ " <td>None</td>\n",
650
+ " <td>A rather non-standard quantum representation...</td>\n",
651
+ " <td>[{'created': 'Sat, 31 Mar 2007 04:27:22 GMT', ...</td>\n",
652
+ " <td>2008-11-26</td>\n",
653
+ " <td>[[Corichi, Alejandro, ], [Vukasinac, Tatjana, ...</td>\n",
654
+ " <td>non standard quantum representation canonic...</td>\n",
655
+ " <td>734</td>\n",
656
+ " </tr>\n",
657
+ " <tr>\n",
658
+ " <th>...</th>\n",
659
+ " <td>...</td>\n",
660
+ " <td>...</td>\n",
661
+ " <td>...</td>\n",
662
+ " <td>...</td>\n",
663
+ " <td>...</td>\n",
664
+ " <td>...</td>\n",
665
+ " <td>...</td>\n",
666
+ " <td>...</td>\n",
667
+ " <td>...</td>\n",
668
+ " <td>...</td>\n",
669
+ " <td>...</td>\n",
670
+ " <td>...</td>\n",
671
+ " <td>...</td>\n",
672
+ " <td>...</td>\n",
673
+ " <td>...</td>\n",
674
+ " <td>...</td>\n",
675
+ " </tr>\n",
676
+ " <tr>\n",
677
+ " <th>1135476</th>\n",
678
+ " <td>supr-con/9608007</td>\n",
679
+ " <td>None</td>\n",
680
+ " <td>Francesca Federici, Andrei A. Varlamov</td>\n",
681
+ " <td>The Fluctuation Induced Pseudogap in the Infra...</td>\n",
682
+ " <td>8 pages, 4 eps figures, Submitted to Phys. Rev. B</td>\n",
683
+ " <td>None</td>\n",
684
+ " <td>10.1103/PhysRevB.55.6070</td>\n",
685
+ " <td>None</td>\n",
686
+ " <td>supr-con cond-mat.supr-con</td>\n",
687
+ " <td>None</td>\n",
688
+ " <td>We study the effect of fluctuations on the {...</td>\n",
689
+ " <td>[{'created': 'Fri, 23 Aug 1996 09:39:49 GMT', ...</td>\n",
690
+ " <td>2009-10-30</td>\n",
691
+ " <td>[[Federici, Francesca, ], [Varlamov, Andrei A....</td>\n",
692
+ " <td>study effect fluctuation \\bf ac conductivit...</td>\n",
693
+ " <td>703</td>\n",
694
+ " </tr>\n",
695
+ " <tr>\n",
696
+ " <th>1135477</th>\n",
697
+ " <td>supr-con/9609001</td>\n",
698
+ " <td>Durga P. Choudhury</td>\n",
699
+ " <td>Durga P. Choudhury, Balam A. Willemsen, John S...</td>\n",
700
+ " <td>Nonlinear Response of HTSC Thin Film Microwave...</td>\n",
701
+ " <td>4 pages, LaTeX type, Uses IEEE style files, 60...</td>\n",
702
+ " <td>None</td>\n",
703
+ " <td>10.1109/77.620744</td>\n",
704
+ " <td>None</td>\n",
705
+ " <td>supr-con cond-mat.supr-con</td>\n",
706
+ " <td>None</td>\n",
707
+ " <td>The non-linear microwave surface impedance o...</td>\n",
708
+ " <td>[{'created': 'Sat, 31 Aug 1996 17:34:38 GMT', ...</td>\n",
709
+ " <td>2016-11-18</td>\n",
710
+ " <td>[[Choudhury, Durga P., , Physics Department, N...</td>\n",
711
+ " <td>non linear microwave surface impedance patt...</td>\n",
712
+ " <td>468</td>\n",
713
+ " </tr>\n",
714
+ " <tr>\n",
715
+ " <th>1135478</th>\n",
716
+ " <td>supr-con/9609002</td>\n",
717
+ " <td>Durga P. Choudhury</td>\n",
718
+ " <td>Balam A. Willemsen, J. S. Derov and S.Sridhar ...</td>\n",
719
+ " <td>Critical State Flux Penetration and Linear Mic...</td>\n",
720
+ " <td>20 pages, LaTeX type, Uses REVTeX style files,...</td>\n",
721
+ " <td>None</td>\n",
722
+ " <td>10.1103/PhysRevB.56.11989</td>\n",
723
+ " <td>None</td>\n",
724
+ " <td>supr-con cond-mat.supr-con</td>\n",
725
+ " <td>None</td>\n",
726
+ " <td>The vortex contribution to the dc field (H) ...</td>\n",
727
+ " <td>[{'created': 'Tue, 3 Sep 1996 14:08:26 GMT', '...</td>\n",
728
+ " <td>2009-10-30</td>\n",
729
+ " <td>[[Willemsen, Balam A., , Physics Department,\\n...</td>\n",
730
+ " <td>vortex contribution dc field h dependent mi...</td>\n",
731
+ " <td>841</td>\n",
732
+ " </tr>\n",
733
+ " <tr>\n",
734
+ " <th>1135479</th>\n",
735
+ " <td>supr-con/9609003</td>\n",
736
+ " <td>Hasegawa Yasumasa</td>\n",
737
+ " <td>Yasumasa Hasegawa (Himeji Institute of Technol...</td>\n",
738
+ " <td>Density of States and NMR Relaxation Rate in A...</td>\n",
739
+ " <td>7 pages, 4 PostScript Figures, LaTeX, to appea...</td>\n",
740
+ " <td>None</td>\n",
741
+ " <td>10.1143/JPSJ.65.3131</td>\n",
742
+ " <td>None</td>\n",
743
+ " <td>supr-con cond-mat.supr-con</td>\n",
744
+ " <td>None</td>\n",
745
+ " <td>We show that the density of states in an ani...</td>\n",
746
+ " <td>[{'created': 'Wed, 18 Sep 1996 07:57:29 GMT', ...</td>\n",
747
+ " <td>2009-10-30</td>\n",
748
+ " <td>[[Hasegawa, Yasumasa, , Himeji Institute of Te...</td>\n",
749
+ " <td>density state anisotropic superconductor \\n...</td>\n",
750
+ " <td>449</td>\n",
751
+ " </tr>\n",
752
+ " <tr>\n",
753
+ " <th>1135480</th>\n",
754
+ " <td>supr-con/9609004</td>\n",
755
+ " <td>Masanori Ichioka</td>\n",
756
+ " <td>Naoki Enomoto, Masanori Ichioka and Kazushige ...</td>\n",
757
+ " <td>Ginzburg Landau theory for d-wave pairing and ...</td>\n",
758
+ " <td>12 pages including 8 eps figs, LaTeX with jpsj...</td>\n",
759
+ " <td>J. Phys. Soc. Jpn. 66, 204 (1997).</td>\n",
760
+ " <td>10.1143/JPSJ.66.204</td>\n",
761
+ " <td>None</td>\n",
762
+ " <td>supr-con cond-mat.supr-con</td>\n",
763
+ " <td>None</td>\n",
764
+ " <td>The Ginzburg Landau theory for d_{x^2-y^2}-w...</td>\n",
765
+ " <td>[{'created': 'Wed, 25 Sep 1996 14:17:09 GMT', ...</td>\n",
766
+ " <td>2009-10-30</td>\n",
767
+ " <td>[[Enomoto, Naoki, , Okayama Univ.], [Ichioka, ...</td>\n",
768
+ " <td>ginzburg landau theory d_{x^2 y^2}-wave sup...</td>\n",
769
+ " <td>528</td>\n",
770
+ " </tr>\n",
771
+ " </tbody>\n",
772
+ "</table>\n",
773
+ "<p>1135481 rows × 16 columns</p>\n",
774
+ "</div>"
775
+ ],
776
+ "text/plain": [
777
+ " id submitter \n",
778
+ "0 0704.0001 Pavel Nadolsky \\\n",
779
+ "1 0704.0002 Louis Theran \n",
780
+ "2 0704.0003 Hongjun Pan \n",
781
+ "3 0704.0006 Yue Hin Pong \n",
782
+ "4 0704.0007 Alejandro Corichi \n",
783
+ "... ... ... \n",
784
+ "1135476 supr-con/9608007 None \n",
785
+ "1135477 supr-con/9609001 Durga P. Choudhury \n",
786
+ "1135478 supr-con/9609002 Durga P. Choudhury \n",
787
+ "1135479 supr-con/9609003 Hasegawa Yasumasa \n",
788
+ "1135480 supr-con/9609004 Masanori Ichioka \n",
789
+ "\n",
790
+ " authors \n",
791
+ "0 C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-... \\\n",
792
+ "1 Ileana Streinu and Louis Theran \n",
793
+ "2 Hongjun Pan \n",
794
+ "3 Y. H. Pong and C. K. Law \n",
795
+ "4 Alejandro Corichi, Tatjana Vukasinac and Jose ... \n",
796
+ "... ... \n",
797
+ "1135476 Francesca Federici, Andrei A. Varlamov \n",
798
+ "1135477 Durga P. Choudhury, Balam A. Willemsen, John S... \n",
799
+ "1135478 Balam A. Willemsen, J. S. Derov and S.Sridhar ... \n",
800
+ "1135479 Yasumasa Hasegawa (Himeji Institute of Technol... \n",
801
+ "1135480 Naoki Enomoto, Masanori Ichioka and Kazushige ... \n",
802
+ "\n",
803
+ " title \n",
804
+ "0 Calculation of prompt diphoton production cros... \\\n",
805
+ "1 Sparsity-certifying Graph Decompositions \n",
806
+ "2 The evolution of the Earth-Moon system based o... \n",
807
+ "3 Bosonic characters of atomic Cooper pairs acro... \n",
808
+ "4 Polymer Quantum Mechanics and its Continuum Limit \n",
809
+ "... ... \n",
810
+ "1135476 The Fluctuation Induced Pseudogap in the Infra... \n",
811
+ "1135477 Nonlinear Response of HTSC Thin Film Microwave... \n",
812
+ "1135478 Critical State Flux Penetration and Linear Mic... \n",
813
+ "1135479 Density of States and NMR Relaxation Rate in A... \n",
814
+ "1135480 Ginzburg Landau theory for d-wave pairing and ... \n",
815
+ "\n",
816
+ " comments \n",
817
+ "0 37 pages, 15 figures; published version \\\n",
818
+ "1 To appear in Graphs and Combinatorics \n",
819
+ "2 23 pages, 3 figures \n",
820
+ "3 6 pages, 4 figures, accepted by PRA \n",
821
+ "4 16 pages, no figures. Typos corrected to match... \n",
822
+ "... ... \n",
823
+ "1135476 8 pages, 4 eps figures, Submitted to Phys. Rev. B \n",
824
+ "1135477 4 pages, LaTeX type, Uses IEEE style files, 60... \n",
825
+ "1135478 20 pages, LaTeX type, Uses REVTeX style files,... \n",
826
+ "1135479 7 pages, 4 PostScript Figures, LaTeX, to appea... \n",
827
+ "1135480 12 pages including 8 eps figs, LaTeX with jpsj... \n",
828
+ "\n",
829
+ " journal-ref doi \n",
830
+ "0 Phys.Rev.D76:013009,2007 10.1103/PhysRevD.76.013009 \\\n",
831
+ "1 None None \n",
832
+ "2 None None \n",
833
+ "3 None 10.1103/PhysRevA.75.043613 \n",
834
+ "4 Phys.Rev.D76:044016,2007 10.1103/PhysRevD.76.044016 \n",
835
+ "... ... ... \n",
836
+ "1135476 None 10.1103/PhysRevB.55.6070 \n",
837
+ "1135477 None 10.1109/77.620744 \n",
838
+ "1135478 None 10.1103/PhysRevB.56.11989 \n",
839
+ "1135479 None 10.1143/JPSJ.65.3131 \n",
840
+ "1135480 J. Phys. Soc. Jpn. 66, 204 (1997). 10.1143/JPSJ.66.204 \n",
841
+ "\n",
842
+ " report-no categories \n",
843
+ "0 ANL-HEP-PR-07-12 hep-ph \\\n",
844
+ "1 None math.CO cs.CG \n",
845
+ "2 None physics.gen-ph \n",
846
+ "3 None cond-mat.mes-hall \n",
847
+ "4 IGPG-07/03-2 gr-qc \n",
848
+ "... ... ... \n",
849
+ "1135476 None supr-con cond-mat.supr-con \n",
850
+ "1135477 None supr-con cond-mat.supr-con \n",
851
+ "1135478 None supr-con cond-mat.supr-con \n",
852
+ "1135479 None supr-con cond-mat.supr-con \n",
853
+ "1135480 None supr-con cond-mat.supr-con \n",
854
+ "\n",
855
+ " license \n",
856
+ "0 None \\\n",
857
+ "1 http://arxiv.org/licenses/nonexclusive-distrib... \n",
858
+ "2 None \n",
859
+ "3 None \n",
860
+ "4 None \n",
861
+ "... ... \n",
862
+ "1135476 None \n",
863
+ "1135477 None \n",
864
+ "1135478 None \n",
865
+ "1135479 None \n",
866
+ "1135480 None \n",
867
+ "\n",
868
+ " abstract \n",
869
+ "0 A fully differential calculation in perturba... \\\n",
870
+ "1 We describe a new algorithm, the $(k,\\ell)$-... \n",
871
+ "2 The evolution of Earth-Moon system is descri... \n",
872
+ "3 We study the two-particle wave function of p... \n",
873
+ "4 A rather non-standard quantum representation... \n",
874
+ "... ... \n",
875
+ "1135476 We study the effect of fluctuations on the {... \n",
876
+ "1135477 The non-linear microwave surface impedance o... \n",
877
+ "1135478 The vortex contribution to the dc field (H) ... \n",
878
+ "1135479 We show that the density of states in an ani... \n",
879
+ "1135480 The Ginzburg Landau theory for d_{x^2-y^2}-w... \n",
880
+ "\n",
881
+ " versions update_date \n",
882
+ "0 [{'created': 'Mon, 2 Apr 2007 19:18:42 GMT', '... 2008-11-26 \\\n",
883
+ "1 [{'created': 'Sat, 31 Mar 2007 02:26:18 GMT', ... 2008-12-13 \n",
884
+ "2 [{'created': 'Sun, 1 Apr 2007 20:46:54 GMT', '... 2008-01-13 \n",
885
+ "3 [{'created': 'Sat, 31 Mar 2007 04:24:59 GMT', ... 2015-05-13 \n",
886
+ "4 [{'created': 'Sat, 31 Mar 2007 04:27:22 GMT', ... 2008-11-26 \n",
887
+ "... ... ... \n",
888
+ "1135476 [{'created': 'Fri, 23 Aug 1996 09:39:49 GMT', ... 2009-10-30 \n",
889
+ "1135477 [{'created': 'Sat, 31 Aug 1996 17:34:38 GMT', ... 2016-11-18 \n",
890
+ "1135478 [{'created': 'Tue, 3 Sep 1996 14:08:26 GMT', '... 2009-10-30 \n",
891
+ "1135479 [{'created': 'Wed, 18 Sep 1996 07:57:29 GMT', ... 2009-10-30 \n",
892
+ "1135480 [{'created': 'Wed, 25 Sep 1996 14:17:09 GMT', ... 2009-10-30 \n",
893
+ "\n",
894
+ " authors_parsed \n",
895
+ "0 [[Balázs, C., ], [Berger, E. L., ], [Nadolsky,... \\\n",
896
+ "1 [[Streinu, Ileana, ], [Theran, Louis, ]] \n",
897
+ "2 [[Pan, Hongjun, ]] \n",
898
+ "3 [[Pong, Y. H., ], [Law, C. K., ]] \n",
899
+ "4 [[Corichi, Alejandro, ], [Vukasinac, Tatjana, ... \n",
900
+ "... ... \n",
901
+ "1135476 [[Federici, Francesca, ], [Varlamov, Andrei A.... \n",
902
+ "1135477 [[Choudhury, Durga P., , Physics Department, N... \n",
903
+ "1135478 [[Willemsen, Balam A., , Physics Department,\\n... \n",
904
+ "1135479 [[Hasegawa, Yasumasa, , Himeji Institute of Te... \n",
905
+ "1135480 [[Enomoto, Naoki, , Okayama Univ.], [Ichioka, ... \n",
906
+ "\n",
907
+ " cleaned_abstracts len_abstract \n",
908
+ "0 fully differential calculation perturbative... 695 \n",
909
+ "1 describe new algorithm $ k,\\ell)$-pebble ga... 619 \n",
910
+ "2 evolution earth moon system describe dark m... 631 \n",
911
+ "3 study particle wave function pair atom ferm... 638 \n",
912
+ "4 non standard quantum representation canonic... 734 \n",
913
+ "... ... ... \n",
914
+ "1135476 study effect fluctuation \\bf ac conductivit... 703 \n",
915
+ "1135477 non linear microwave surface impedance patt... 468 \n",
916
+ "1135478 vortex contribution dc field h dependent mi... 841 \n",
917
+ "1135479 density state anisotropic superconductor \\n... 449 \n",
918
+ "1135480 ginzburg landau theory d_{x^2 y^2}-wave sup... 528 \n",
919
+ "\n",
920
+ "[1135481 rows x 16 columns]"
921
+ ]
922
+ },
923
+ "execution_count": 22,
924
+ "metadata": {},
925
+ "output_type": "execute_result"
926
+ }
927
+ ],
928
+ "source": [
929
+ "df"
930
+ ]
931
+ },
932
+ {
933
+ "cell_type": "code",
934
+ "execution_count": 23,
935
+ "metadata": {},
936
+ "outputs": [
937
+ {
938
+ "data": {
939
+ "image/png": "",
940
+ "text/plain": [
941
+ "<Figure size 640x480 with 1 Axes>"
942
+ ]
943
+ },
944
+ "metadata": {},
945
+ "output_type": "display_data"
946
+ }
947
+ ],
948
+ "source": [
949
+ "sns.histplot(x='len_abstract', data=df)\n",
950
+ "plt.show()"
951
+ ]
952
+ },
953
+ {
954
+ "cell_type": "code",
955
+ "execution_count": null,
956
+ "metadata": {},
957
+ "outputs": [],
958
+ "source": [
959
+ "df.to_parquet(\"/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/reduced_arxiv_papers.parquet.gzip\")"
960
+ ]
961
+ }
962
+ ],
963
+ "metadata": {
964
+ "kernelspec": {
965
+ "display_name": "Python 3.11.4 ('arxiv-env': venv)",
966
+ "language": "python",
967
+ "name": "python3"
968
+ },
969
+ "language_info": {
970
+ "codemirror_mode": {
971
+ "name": "ipython",
972
+ "version": 3
973
+ },
974
+ "file_extension": ".py",
975
+ "mimetype": "text/x-python",
976
+ "name": "python",
977
+ "nbconvert_exporter": "python",
978
+ "pygments_lexer": "ipython3",
979
+ "version": "3.11.4"
980
+ },
981
+ "orig_nbformat": 4,
982
+ "vscode": {
983
+ "interpreter": {
984
+ "hash": "aae17c2ae2f38cc6f211be9b71a2aa280701d8462782cbc1f67caa83a1603363"
985
+ }
986
+ }
987
+ },
988
+ "nbformat": 4,
989
+ "nbformat_minor": 2
990
+ }
notebooks/nlp_cleansing.ipynb ADDED
@@ -0,0 +1,388 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "2023-06-24 16:49:13.031488: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
13
+ "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
14
+ ]
15
+ }
16
+ ],
17
+ "source": [
18
+ "import pandas as pd\n",
19
+ "import numpy\n",
20
+ "from matplotlib import pyplot as plt\n",
21
+ "from typing import List, Dict\n",
22
+ "from collections import Counter\n",
23
+ "from pprint import pprint\n",
24
+ "\n",
25
+ "import seaborn as sns\n",
26
+ "sns.set_style(\"darkgrid\")\n",
27
+ "sns.set_palette(\"mako\")\n",
28
+ "\n",
29
+ "import spacy\n",
30
+ "from spacy.lang.en import English\n",
31
+ "from nltk.corpus import stopwords\n",
32
+ "\n",
33
+ "nlp = spacy.load('en_core_web_sm')\n",
34
+ "\n",
35
+ "pd.set_option('display.float_format', '{:.2f}'.format)\n"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": 32,
41
+ "metadata": {},
42
+ "outputs": [
43
+ {
44
+ "data": {
45
+ "text/plain": [
46
+ "spacy.lang.en.English"
47
+ ]
48
+ },
49
+ "execution_count": 32,
50
+ "metadata": {},
51
+ "output_type": "execute_result"
52
+ }
53
+ ],
54
+ "source": [
55
+ "type(nlp)"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": 30,
61
+ "metadata": {},
62
+ "outputs": [],
63
+ "source": [
64
+ "df_raw = pd.read_parquet(\"/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/arxiv_papers_raw.parquet.gzip\")"
65
+ ]
66
+ },
67
+ {
68
+ "cell_type": "code",
69
+ "execution_count": 31,
70
+ "metadata": {},
71
+ "outputs": [
72
+ {
73
+ "data": {
74
+ "text/html": [
75
+ "<div>\n",
76
+ "<style scoped>\n",
77
+ " .dataframe tbody tr th:only-of-type {\n",
78
+ " vertical-align: middle;\n",
79
+ " }\n",
80
+ "\n",
81
+ " .dataframe tbody tr th {\n",
82
+ " vertical-align: top;\n",
83
+ " }\n",
84
+ "\n",
85
+ " .dataframe thead th {\n",
86
+ " text-align: right;\n",
87
+ " }\n",
88
+ "</style>\n",
89
+ "<table border=\"1\" class=\"dataframe\">\n",
90
+ " <thead>\n",
91
+ " <tr style=\"text-align: right;\">\n",
92
+ " <th></th>\n",
93
+ " <th>id</th>\n",
94
+ " <th>submitter</th>\n",
95
+ " <th>authors</th>\n",
96
+ " <th>title</th>\n",
97
+ " <th>comments</th>\n",
98
+ " <th>journal-ref</th>\n",
99
+ " <th>doi</th>\n",
100
+ " <th>report-no</th>\n",
101
+ " <th>categories</th>\n",
102
+ " <th>license</th>\n",
103
+ " <th>abstract</th>\n",
104
+ " <th>versions</th>\n",
105
+ " <th>update_date</th>\n",
106
+ " <th>authors_parsed</th>\n",
107
+ " </tr>\n",
108
+ " </thead>\n",
109
+ " <tbody>\n",
110
+ " <tr>\n",
111
+ " <th>0</th>\n",
112
+ " <td>0704.0001</td>\n",
113
+ " <td>Pavel Nadolsky</td>\n",
114
+ " <td>C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-...</td>\n",
115
+ " <td>Calculation of prompt diphoton production cros...</td>\n",
116
+ " <td>37 pages, 15 figures; published version</td>\n",
117
+ " <td>Phys.Rev.D76:013009,2007</td>\n",
118
+ " <td>10.1103/PhysRevD.76.013009</td>\n",
119
+ " <td>ANL-HEP-PR-07-12</td>\n",
120
+ " <td>hep-ph</td>\n",
121
+ " <td>None</td>\n",
122
+ " <td>A fully differential calculation in perturba...</td>\n",
123
+ " <td>[{'created': 'Mon, 2 Apr 2007 19:18:42 GMT', '...</td>\n",
124
+ " <td>2008-11-26</td>\n",
125
+ " <td>[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,...</td>\n",
126
+ " </tr>\n",
127
+ " <tr>\n",
128
+ " <th>1</th>\n",
129
+ " <td>0704.0002</td>\n",
130
+ " <td>Louis Theran</td>\n",
131
+ " <td>Ileana Streinu and Louis Theran</td>\n",
132
+ " <td>Sparsity-certifying Graph Decompositions</td>\n",
133
+ " <td>To appear in Graphs and Combinatorics</td>\n",
134
+ " <td>None</td>\n",
135
+ " <td>None</td>\n",
136
+ " <td>None</td>\n",
137
+ " <td>math.CO cs.CG</td>\n",
138
+ " <td>http://arxiv.org/licenses/nonexclusive-distrib...</td>\n",
139
+ " <td>We describe a new algorithm, the $(k,\\ell)$-...</td>\n",
140
+ " <td>[{'created': 'Sat, 31 Mar 2007 02:26:18 GMT', ...</td>\n",
141
+ " <td>2008-12-13</td>\n",
142
+ " <td>[[Streinu, Ileana, ], [Theran, Louis, ]]</td>\n",
143
+ " </tr>\n",
144
+ " <tr>\n",
145
+ " <th>2</th>\n",
146
+ " <td>0704.0003</td>\n",
147
+ " <td>Hongjun Pan</td>\n",
148
+ " <td>Hongjun Pan</td>\n",
149
+ " <td>The evolution of the Earth-Moon system based o...</td>\n",
150
+ " <td>23 pages, 3 figures</td>\n",
151
+ " <td>None</td>\n",
152
+ " <td>None</td>\n",
153
+ " <td>None</td>\n",
154
+ " <td>physics.gen-ph</td>\n",
155
+ " <td>None</td>\n",
156
+ " <td>The evolution of Earth-Moon system is descri...</td>\n",
157
+ " <td>[{'created': 'Sun, 1 Apr 2007 20:46:54 GMT', '...</td>\n",
158
+ " <td>2008-01-13</td>\n",
159
+ " <td>[[Pan, Hongjun, ]]</td>\n",
160
+ " </tr>\n",
161
+ " <tr>\n",
162
+ " <th>3</th>\n",
163
+ " <td>0704.0004</td>\n",
164
+ " <td>David Callan</td>\n",
165
+ " <td>David Callan</td>\n",
166
+ " <td>A determinant of Stirling cycle numbers counts...</td>\n",
167
+ " <td>11 pages</td>\n",
168
+ " <td>None</td>\n",
169
+ " <td>None</td>\n",
170
+ " <td>None</td>\n",
171
+ " <td>math.CO</td>\n",
172
+ " <td>None</td>\n",
173
+ " <td>We show that a determinant of Stirling cycle...</td>\n",
174
+ " <td>[{'created': 'Sat, 31 Mar 2007 03:16:14 GMT', ...</td>\n",
175
+ " <td>2007-05-23</td>\n",
176
+ " <td>[[Callan, David, ]]</td>\n",
177
+ " </tr>\n",
178
+ " <tr>\n",
179
+ " <th>4</th>\n",
180
+ " <td>0704.0005</td>\n",
181
+ " <td>Alberto Torchinsky</td>\n",
182
+ " <td>Wael Abu-Shammala and Alberto Torchinsky</td>\n",
183
+ " <td>From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a...</td>\n",
184
+ " <td>None</td>\n",
185
+ " <td>Illinois J. Math. 52 (2008) no.2, 681-689</td>\n",
186
+ " <td>None</td>\n",
187
+ " <td>None</td>\n",
188
+ " <td>math.CA math.FA</td>\n",
189
+ " <td>None</td>\n",
190
+ " <td>In this paper we show how to compute the $\\L...</td>\n",
191
+ " <td>[{'created': 'Mon, 2 Apr 2007 18:09:58 GMT', '...</td>\n",
192
+ " <td>2013-10-15</td>\n",
193
+ " <td>[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]]</td>\n",
194
+ " </tr>\n",
195
+ " </tbody>\n",
196
+ "</table>\n",
197
+ "</div>"
198
+ ],
199
+ "text/plain": [
200
+ " id submitter \n",
201
+ "0 0704.0001 Pavel Nadolsky \\\n",
202
+ "1 0704.0002 Louis Theran \n",
203
+ "2 0704.0003 Hongjun Pan \n",
204
+ "3 0704.0004 David Callan \n",
205
+ "4 0704.0005 Alberto Torchinsky \n",
206
+ "\n",
207
+ " authors \n",
208
+ "0 C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-... \\\n",
209
+ "1 Ileana Streinu and Louis Theran \n",
210
+ "2 Hongjun Pan \n",
211
+ "3 David Callan \n",
212
+ "4 Wael Abu-Shammala and Alberto Torchinsky \n",
213
+ "\n",
214
+ " title \n",
215
+ "0 Calculation of prompt diphoton production cros... \\\n",
216
+ "1 Sparsity-certifying Graph Decompositions \n",
217
+ "2 The evolution of the Earth-Moon system based o... \n",
218
+ "3 A determinant of Stirling cycle numbers counts... \n",
219
+ "4 From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a... \n",
220
+ "\n",
221
+ " comments \n",
222
+ "0 37 pages, 15 figures; published version \\\n",
223
+ "1 To appear in Graphs and Combinatorics \n",
224
+ "2 23 pages, 3 figures \n",
225
+ "3 11 pages \n",
226
+ "4 None \n",
227
+ "\n",
228
+ " journal-ref doi \n",
229
+ "0 Phys.Rev.D76:013009,2007 10.1103/PhysRevD.76.013009 \\\n",
230
+ "1 None None \n",
231
+ "2 None None \n",
232
+ "3 None None \n",
233
+ "4 Illinois J. Math. 52 (2008) no.2, 681-689 None \n",
234
+ "\n",
235
+ " report-no categories \n",
236
+ "0 ANL-HEP-PR-07-12 hep-ph \\\n",
237
+ "1 None math.CO cs.CG \n",
238
+ "2 None physics.gen-ph \n",
239
+ "3 None math.CO \n",
240
+ "4 None math.CA math.FA \n",
241
+ "\n",
242
+ " license \n",
243
+ "0 None \\\n",
244
+ "1 http://arxiv.org/licenses/nonexclusive-distrib... \n",
245
+ "2 None \n",
246
+ "3 None \n",
247
+ "4 None \n",
248
+ "\n",
249
+ " abstract \n",
250
+ "0 A fully differential calculation in perturba... \\\n",
251
+ "1 We describe a new algorithm, the $(k,\\ell)$-... \n",
252
+ "2 The evolution of Earth-Moon system is descri... \n",
253
+ "3 We show that a determinant of Stirling cycle... \n",
254
+ "4 In this paper we show how to compute the $\\L... \n",
255
+ "\n",
256
+ " versions update_date \n",
257
+ "0 [{'created': 'Mon, 2 Apr 2007 19:18:42 GMT', '... 2008-11-26 \\\n",
258
+ "1 [{'created': 'Sat, 31 Mar 2007 02:26:18 GMT', ... 2008-12-13 \n",
259
+ "2 [{'created': 'Sun, 1 Apr 2007 20:46:54 GMT', '... 2008-01-13 \n",
260
+ "3 [{'created': 'Sat, 31 Mar 2007 03:16:14 GMT', ... 2007-05-23 \n",
261
+ "4 [{'created': 'Mon, 2 Apr 2007 18:09:58 GMT', '... 2013-10-15 \n",
262
+ "\n",
263
+ " authors_parsed \n",
264
+ "0 [[Balázs, C., ], [Berger, E. L., ], [Nadolsky,... \n",
265
+ "1 [[Streinu, Ileana, ], [Theran, Louis, ]] \n",
266
+ "2 [[Pan, Hongjun, ]] \n",
267
+ "3 [[Callan, David, ]] \n",
268
+ "4 [[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]] "
269
+ ]
270
+ },
271
+ "execution_count": 31,
272
+ "metadata": {},
273
+ "output_type": "execute_result"
274
+ }
275
+ ],
276
+ "source": [
277
+ "df_raw.head()"
278
+ ]
279
+ },
280
+ {
281
+ "cell_type": "code",
282
+ "execution_count": 3,
283
+ "metadata": {},
284
+ "outputs": [],
285
+ "source": [
286
+ "def cleanData(doc: pd.Series, stemming=False, nlp = spacy.load('en_core_web_sm')):\n",
287
+ " \"\"\"\n",
288
+ " TODO: Optimize NLP Object to only obtain stopwords, lemmas, and tokenize docs.\n",
289
+ " \n",
290
+ " Cleans and processes the input documents by performing various text cleaning operations.\n",
291
+ "\n",
292
+ " Args:\n",
293
+ " doc (pd.Series): The documents to be cleaned, passed in a Series object.\n",
294
+ " stemming (bool, optional): Specifies whether stemming should be applied. Defaults to False.\n",
295
+ "\n",
296
+ " Returns:\n",
297
+ " str: The cleaned and processed document as a single string.\n",
298
+ " \"\"\"\n",
299
+ " doc = doc.lower()\n",
300
+ " doc = nlp(doc)\n",
301
+ " tokens = [tokens.lower_ for tokens in doc]\n",
302
+ " tokens = [tokens for tokens in doc if (tokens.is_stop == False)]\n",
303
+ " tokens = [tokens for tokens in tokens if (tokens.is_punct == False)]\n",
304
+ " final_token = [token.lemma_ for token in tokens]\n",
305
+ " \n",
306
+ " return \" \".join(final_token)"
307
+ ]
308
+ },
309
+ {
310
+ "cell_type": "code",
311
+ "execution_count": 34,
312
+ "metadata": {},
313
+ "outputs": [
314
+ {
315
+ "name": "stdout",
316
+ "output_type": "stream",
317
+ "text": [
318
+ " In this paper we show how to compute the $\\Lambda_{\\alpha}$ norm, $\\alpha\\ge\n",
319
+ "0$, using the dyadic grid. This result is a consequence of the description of\n",
320
+ "the Hardy spaces $H^p(R^N)$ in terms of dyadic and special atoms.\n",
321
+ "\n"
322
+ ]
323
+ },
324
+ {
325
+ "data": {
326
+ "text/plain": [
327
+ "' paper compute $ \\\\lambda_{\\\\alpha}$ norm $ \\\\alpha\\\\ge \\n 0 $ dyadic grid result consequence description \\n hardy space $ h^p(r^n)$ term dyadic special atom \\n'"
328
+ ]
329
+ },
330
+ "execution_count": 34,
331
+ "metadata": {},
332
+ "output_type": "execute_result"
333
+ }
334
+ ],
335
+ "source": [
336
+ "print(df_raw['abstract'][4])\n",
337
+ "test= cleanData(df_raw['abstract'][4])\n",
338
+ "test"
339
+ ]
340
+ },
341
+ {
342
+ "cell_type": "code",
343
+ "execution_count": 35,
344
+ "metadata": {},
345
+ "outputs": [],
346
+ "source": [
347
+ "df_raw[\"cleaned_abstracts\"] = df_raw[\"abstract\"].map(lambda x: cleanData(x))"
348
+ ]
349
+ },
350
+ {
351
+ "cell_type": "code",
352
+ "execution_count": 36,
353
+ "metadata": {},
354
+ "outputs": [],
355
+ "source": [
356
+ "\n",
357
+ "df_raw.to_parquet(\"/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/arxiv_papers.parquet.gzip\")"
358
+ ]
359
+ }
360
+ ],
361
+ "metadata": {
362
+ "kernelspec": {
363
+ "display_name": "Python 3.11.4 ('arxiv-env': venv)",
364
+ "language": "python",
365
+ "name": "python3"
366
+ },
367
+ "language_info": {
368
+ "codemirror_mode": {
369
+ "name": "ipython",
370
+ "version": 3
371
+ },
372
+ "file_extension": ".py",
373
+ "mimetype": "text/x-python",
374
+ "name": "python",
375
+ "nbconvert_exporter": "python",
376
+ "pygments_lexer": "ipython3",
377
+ "version": "3.11.4"
378
+ },
379
+ "orig_nbformat": 4,
380
+ "vscode": {
381
+ "interpreter": {
382
+ "hash": "aae17c2ae2f38cc6f211be9b71a2aa280701d8462782cbc1f67caa83a1603363"
383
+ }
384
+ }
385
+ },
386
+ "nbformat": 4,
387
+ "nbformat_minor": 2
388
+ }
notebooks/nlp_eda.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/read_raw_data.ipynb ADDED
@@ -0,0 +1,875 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 7,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stdout",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "CPU times: user 13 µs, sys: 0 ns, total: 13 µs\n",
13
+ "Wall time: 20.3 µs\n"
14
+ ]
15
+ }
16
+ ],
17
+ "source": [
18
+ "%%time\n",
19
+ "import pandas as pd\n",
20
+ "import json\n",
21
+ "from tqdm import tqdm\n",
22
+ "import pyarrow as pa\n",
23
+ "import pyarrow.parquet as pq"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "code",
28
+ "execution_count": 8,
29
+ "metadata": {},
30
+ "outputs": [
31
+ {
32
+ "name": "stderr",
33
+ "output_type": "stream",
34
+ "text": [
35
+ "2268252it [04:21, 8672.99it/s] \n"
36
+ ]
37
+ }
38
+ ],
39
+ "source": [
40
+ "# df = pd.read_json(\"../data/raw/arxiv-metadata-oai-snapshot.json\")\n",
41
+ "\n",
42
+ "# Initialize an empty list to store the data from the JSON file\n",
43
+ "arxiv_data = []\n",
44
+ "\n",
45
+ "# Open the JSON file using the 'with' statement, ensuring it's closed automatically\n",
46
+ "with open(\"/Users/luis.morales/Desktop/arxiv-paper-recommender/data/raw/arxiv-metadata-oai-snapshot.json\", 'r') as file:\n",
47
+ " # Iterate over each line in the file\n",
48
+ " for line in tqdm(file):\n",
49
+ " # Load the JSON data from each line and append it to the arxiv_data list\n",
50
+ " arxiv_data.append(json.loads(line))\n",
51
+ "\n",
52
+ "# Create a DataFrame from the arxiv_data list using the pd.DataFrame.from_records() method\n",
53
+ "df = pd.DataFrame.from_records(arxiv_data)"
54
+ ]
55
+ },
56
+ {
57
+ "cell_type": "code",
58
+ "execution_count": 9,
59
+ "metadata": {},
60
+ "outputs": [
61
+ {
62
+ "data": {
63
+ "text/html": [
64
+ "<div>\n",
65
+ "<style scoped>\n",
66
+ " .dataframe tbody tr th:only-of-type {\n",
67
+ " vertical-align: middle;\n",
68
+ " }\n",
69
+ "\n",
70
+ " .dataframe tbody tr th {\n",
71
+ " vertical-align: top;\n",
72
+ " }\n",
73
+ "\n",
74
+ " .dataframe thead th {\n",
75
+ " text-align: right;\n",
76
+ " }\n",
77
+ "</style>\n",
78
+ "<table border=\"1\" class=\"dataframe\">\n",
79
+ " <thead>\n",
80
+ " <tr style=\"text-align: right;\">\n",
81
+ " <th></th>\n",
82
+ " <th>id</th>\n",
83
+ " <th>submitter</th>\n",
84
+ " <th>authors</th>\n",
85
+ " <th>title</th>\n",
86
+ " <th>comments</th>\n",
87
+ " <th>journal-ref</th>\n",
88
+ " <th>doi</th>\n",
89
+ " <th>report-no</th>\n",
90
+ " <th>categories</th>\n",
91
+ " <th>license</th>\n",
92
+ " <th>abstract</th>\n",
93
+ " <th>versions</th>\n",
94
+ " <th>update_date</th>\n",
95
+ " <th>authors_parsed</th>\n",
96
+ " </tr>\n",
97
+ " </thead>\n",
98
+ " <tbody>\n",
99
+ " <tr>\n",
100
+ " <th>0</th>\n",
101
+ " <td>0704.0001</td>\n",
102
+ " <td>Pavel Nadolsky</td>\n",
103
+ " <td>C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-...</td>\n",
104
+ " <td>Calculation of prompt diphoton production cros...</td>\n",
105
+ " <td>37 pages, 15 figures; published version</td>\n",
106
+ " <td>Phys.Rev.D76:013009,2007</td>\n",
107
+ " <td>10.1103/PhysRevD.76.013009</td>\n",
108
+ " <td>ANL-HEP-PR-07-12</td>\n",
109
+ " <td>hep-ph</td>\n",
110
+ " <td>None</td>\n",
111
+ " <td>A fully differential calculation in perturba...</td>\n",
112
+ " <td>[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...</td>\n",
113
+ " <td>2008-11-26</td>\n",
114
+ " <td>[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,...</td>\n",
115
+ " </tr>\n",
116
+ " <tr>\n",
117
+ " <th>1</th>\n",
118
+ " <td>0704.0002</td>\n",
119
+ " <td>Louis Theran</td>\n",
120
+ " <td>Ileana Streinu and Louis Theran</td>\n",
121
+ " <td>Sparsity-certifying Graph Decompositions</td>\n",
122
+ " <td>To appear in Graphs and Combinatorics</td>\n",
123
+ " <td>None</td>\n",
124
+ " <td>None</td>\n",
125
+ " <td>None</td>\n",
126
+ " <td>math.CO cs.CG</td>\n",
127
+ " <td>http://arxiv.org/licenses/nonexclusive-distrib...</td>\n",
128
+ " <td>We describe a new algorithm, the $(k,\\ell)$-...</td>\n",
129
+ " <td>[{'version': 'v1', 'created': 'Sat, 31 Mar 200...</td>\n",
130
+ " <td>2008-12-13</td>\n",
131
+ " <td>[[Streinu, Ileana, ], [Theran, Louis, ]]</td>\n",
132
+ " </tr>\n",
133
+ " <tr>\n",
134
+ " <th>2</th>\n",
135
+ " <td>0704.0003</td>\n",
136
+ " <td>Hongjun Pan</td>\n",
137
+ " <td>Hongjun Pan</td>\n",
138
+ " <td>The evolution of the Earth-Moon system based o...</td>\n",
139
+ " <td>23 pages, 3 figures</td>\n",
140
+ " <td>None</td>\n",
141
+ " <td>None</td>\n",
142
+ " <td>None</td>\n",
143
+ " <td>physics.gen-ph</td>\n",
144
+ " <td>None</td>\n",
145
+ " <td>The evolution of Earth-Moon system is descri...</td>\n",
146
+ " <td>[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...</td>\n",
147
+ " <td>2008-01-13</td>\n",
148
+ " <td>[[Pan, Hongjun, ]]</td>\n",
149
+ " </tr>\n",
150
+ " <tr>\n",
151
+ " <th>3</th>\n",
152
+ " <td>0704.0004</td>\n",
153
+ " <td>David Callan</td>\n",
154
+ " <td>David Callan</td>\n",
155
+ " <td>A determinant of Stirling cycle numbers counts...</td>\n",
156
+ " <td>11 pages</td>\n",
157
+ " <td>None</td>\n",
158
+ " <td>None</td>\n",
159
+ " <td>None</td>\n",
160
+ " <td>math.CO</td>\n",
161
+ " <td>None</td>\n",
162
+ " <td>We show that a determinant of Stirling cycle...</td>\n",
163
+ " <td>[{'version': 'v1', 'created': 'Sat, 31 Mar 200...</td>\n",
164
+ " <td>2007-05-23</td>\n",
165
+ " <td>[[Callan, David, ]]</td>\n",
166
+ " </tr>\n",
167
+ " <tr>\n",
168
+ " <th>4</th>\n",
169
+ " <td>0704.0005</td>\n",
170
+ " <td>Alberto Torchinsky</td>\n",
171
+ " <td>Wael Abu-Shammala and Alberto Torchinsky</td>\n",
172
+ " <td>From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a...</td>\n",
173
+ " <td>None</td>\n",
174
+ " <td>Illinois J. Math. 52 (2008) no.2, 681-689</td>\n",
175
+ " <td>None</td>\n",
176
+ " <td>None</td>\n",
177
+ " <td>math.CA math.FA</td>\n",
178
+ " <td>None</td>\n",
179
+ " <td>In this paper we show how to compute the $\\L...</td>\n",
180
+ " <td>[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...</td>\n",
181
+ " <td>2013-10-15</td>\n",
182
+ " <td>[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]]</td>\n",
183
+ " </tr>\n",
184
+ " <tr>\n",
185
+ " <th>...</th>\n",
186
+ " <td>...</td>\n",
187
+ " <td>...</td>\n",
188
+ " <td>...</td>\n",
189
+ " <td>...</td>\n",
190
+ " <td>...</td>\n",
191
+ " <td>...</td>\n",
192
+ " <td>...</td>\n",
193
+ " <td>...</td>\n",
194
+ " <td>...</td>\n",
195
+ " <td>...</td>\n",
196
+ " <td>...</td>\n",
197
+ " <td>...</td>\n",
198
+ " <td>...</td>\n",
199
+ " <td>...</td>\n",
200
+ " </tr>\n",
201
+ " <tr>\n",
202
+ " <th>2268247</th>\n",
203
+ " <td>supr-con/9608008</td>\n",
204
+ " <td>Ruslan Prozorov</td>\n",
205
+ " <td>R. Prozorov, M. Konczykowski, B. Schmidt, Y. Y...</td>\n",
206
+ " <td>On the origin of the irreversibility line in t...</td>\n",
207
+ " <td>19 pages, LaTex, 6 PostScript figures; Author'...</td>\n",
208
+ " <td>None</td>\n",
209
+ " <td>10.1103/PhysRevB.54.15530</td>\n",
210
+ " <td>None</td>\n",
211
+ " <td>supr-con cond-mat.supr-con</td>\n",
212
+ " <td>None</td>\n",
213
+ " <td>We report on measurements of the angular dep...</td>\n",
214
+ " <td>[{'version': 'v1', 'created': 'Mon, 26 Aug 199...</td>\n",
215
+ " <td>2009-10-30</td>\n",
216
+ " <td>[[Prozorov, R., ], [Konczykowski, M., ], [Schm...</td>\n",
217
+ " </tr>\n",
218
+ " <tr>\n",
219
+ " <th>2268248</th>\n",
220
+ " <td>supr-con/9609001</td>\n",
221
+ " <td>Durga P. Choudhury</td>\n",
222
+ " <td>Durga P. Choudhury, Balam A. Willemsen, John S...</td>\n",
223
+ " <td>Nonlinear Response of HTSC Thin Film Microwave...</td>\n",
224
+ " <td>4 pages, LaTeX type, Uses IEEE style files, 60...</td>\n",
225
+ " <td>None</td>\n",
226
+ " <td>10.1109/77.620744</td>\n",
227
+ " <td>None</td>\n",
228
+ " <td>supr-con cond-mat.supr-con</td>\n",
229
+ " <td>None</td>\n",
230
+ " <td>The non-linear microwave surface impedance o...</td>\n",
231
+ " <td>[{'version': 'v1', 'created': 'Sat, 31 Aug 199...</td>\n",
232
+ " <td>2016-11-18</td>\n",
233
+ " <td>[[Choudhury, Durga P., , Physics Department, N...</td>\n",
234
+ " </tr>\n",
235
+ " <tr>\n",
236
+ " <th>2268249</th>\n",
237
+ " <td>supr-con/9609002</td>\n",
238
+ " <td>Durga P. Choudhury</td>\n",
239
+ " <td>Balam A. Willemsen, J. S. Derov and S.Sridhar ...</td>\n",
240
+ " <td>Critical State Flux Penetration and Linear Mic...</td>\n",
241
+ " <td>20 pages, LaTeX type, Uses REVTeX style files,...</td>\n",
242
+ " <td>None</td>\n",
243
+ " <td>10.1103/PhysRevB.56.11989</td>\n",
244
+ " <td>None</td>\n",
245
+ " <td>supr-con cond-mat.supr-con</td>\n",
246
+ " <td>None</td>\n",
247
+ " <td>The vortex contribution to the dc field (H) ...</td>\n",
248
+ " <td>[{'version': 'v1', 'created': 'Tue, 3 Sep 1996...</td>\n",
249
+ " <td>2009-10-30</td>\n",
250
+ " <td>[[Willemsen, Balam A., , Physics Department,\\n...</td>\n",
251
+ " </tr>\n",
252
+ " <tr>\n",
253
+ " <th>2268250</th>\n",
254
+ " <td>supr-con/9609003</td>\n",
255
+ " <td>Hasegawa Yasumasa</td>\n",
256
+ " <td>Yasumasa Hasegawa (Himeji Institute of Technol...</td>\n",
257
+ " <td>Density of States and NMR Relaxation Rate in A...</td>\n",
258
+ " <td>7 pages, 4 PostScript Figures, LaTeX, to appea...</td>\n",
259
+ " <td>None</td>\n",
260
+ " <td>10.1143/JPSJ.65.3131</td>\n",
261
+ " <td>None</td>\n",
262
+ " <td>supr-con cond-mat.supr-con</td>\n",
263
+ " <td>None</td>\n",
264
+ " <td>We show that the density of states in an ani...</td>\n",
265
+ " <td>[{'version': 'v1', 'created': 'Wed, 18 Sep 199...</td>\n",
266
+ " <td>2009-10-30</td>\n",
267
+ " <td>[[Hasegawa, Yasumasa, , Himeji Institute of Te...</td>\n",
268
+ " </tr>\n",
269
+ " <tr>\n",
270
+ " <th>2268251</th>\n",
271
+ " <td>supr-con/9609004</td>\n",
272
+ " <td>Masanori Ichioka</td>\n",
273
+ " <td>Naoki Enomoto, Masanori Ichioka and Kazushige ...</td>\n",
274
+ " <td>Ginzburg Landau theory for d-wave pairing and ...</td>\n",
275
+ " <td>12 pages including 8 eps figs, LaTeX with jpsj...</td>\n",
276
+ " <td>J. Phys. Soc. Jpn. 66, 204 (1997).</td>\n",
277
+ " <td>10.1143/JPSJ.66.204</td>\n",
278
+ " <td>None</td>\n",
279
+ " <td>supr-con cond-mat.supr-con</td>\n",
280
+ " <td>None</td>\n",
281
+ " <td>The Ginzburg Landau theory for d_{x^2-y^2}-w...</td>\n",
282
+ " <td>[{'version': 'v1', 'created': 'Wed, 25 Sep 199...</td>\n",
283
+ " <td>2009-10-30</td>\n",
284
+ " <td>[[Enomoto, Naoki, , Okayama Univ.], [Ichioka, ...</td>\n",
285
+ " </tr>\n",
286
+ " </tbody>\n",
287
+ "</table>\n",
288
+ "<p>2268252 rows × 14 columns</p>\n",
289
+ "</div>"
290
+ ],
291
+ "text/plain": [
292
+ " id submitter \n",
293
+ "0 0704.0001 Pavel Nadolsky \\\n",
294
+ "1 0704.0002 Louis Theran \n",
295
+ "2 0704.0003 Hongjun Pan \n",
296
+ "3 0704.0004 David Callan \n",
297
+ "4 0704.0005 Alberto Torchinsky \n",
298
+ "... ... ... \n",
299
+ "2268247 supr-con/9608008 Ruslan Prozorov \n",
300
+ "2268248 supr-con/9609001 Durga P. Choudhury \n",
301
+ "2268249 supr-con/9609002 Durga P. Choudhury \n",
302
+ "2268250 supr-con/9609003 Hasegawa Yasumasa \n",
303
+ "2268251 supr-con/9609004 Masanori Ichioka \n",
304
+ "\n",
305
+ " authors \n",
306
+ "0 C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-... \\\n",
307
+ "1 Ileana Streinu and Louis Theran \n",
308
+ "2 Hongjun Pan \n",
309
+ "3 David Callan \n",
310
+ "4 Wael Abu-Shammala and Alberto Torchinsky \n",
311
+ "... ... \n",
312
+ "2268247 R. Prozorov, M. Konczykowski, B. Schmidt, Y. Y... \n",
313
+ "2268248 Durga P. Choudhury, Balam A. Willemsen, John S... \n",
314
+ "2268249 Balam A. Willemsen, J. S. Derov and S.Sridhar ... \n",
315
+ "2268250 Yasumasa Hasegawa (Himeji Institute of Technol... \n",
316
+ "2268251 Naoki Enomoto, Masanori Ichioka and Kazushige ... \n",
317
+ "\n",
318
+ " title \n",
319
+ "0 Calculation of prompt diphoton production cros... \\\n",
320
+ "1 Sparsity-certifying Graph Decompositions \n",
321
+ "2 The evolution of the Earth-Moon system based o... \n",
322
+ "3 A determinant of Stirling cycle numbers counts... \n",
323
+ "4 From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a... \n",
324
+ "... ... \n",
325
+ "2268247 On the origin of the irreversibility line in t... \n",
326
+ "2268248 Nonlinear Response of HTSC Thin Film Microwave... \n",
327
+ "2268249 Critical State Flux Penetration and Linear Mic... \n",
328
+ "2268250 Density of States and NMR Relaxation Rate in A... \n",
329
+ "2268251 Ginzburg Landau theory for d-wave pairing and ... \n",
330
+ "\n",
331
+ " comments \n",
332
+ "0 37 pages, 15 figures; published version \\\n",
333
+ "1 To appear in Graphs and Combinatorics \n",
334
+ "2 23 pages, 3 figures \n",
335
+ "3 11 pages \n",
336
+ "4 None \n",
337
+ "... ... \n",
338
+ "2268247 19 pages, LaTex, 6 PostScript figures; Author'... \n",
339
+ "2268248 4 pages, LaTeX type, Uses IEEE style files, 60... \n",
340
+ "2268249 20 pages, LaTeX type, Uses REVTeX style files,... \n",
341
+ "2268250 7 pages, 4 PostScript Figures, LaTeX, to appea... \n",
342
+ "2268251 12 pages including 8 eps figs, LaTeX with jpsj... \n",
343
+ "\n",
344
+ " journal-ref \n",
345
+ "0 Phys.Rev.D76:013009,2007 \\\n",
346
+ "1 None \n",
347
+ "2 None \n",
348
+ "3 None \n",
349
+ "4 Illinois J. Math. 52 (2008) no.2, 681-689 \n",
350
+ "... ... \n",
351
+ "2268247 None \n",
352
+ "2268248 None \n",
353
+ "2268249 None \n",
354
+ "2268250 None \n",
355
+ "2268251 J. Phys. Soc. Jpn. 66, 204 (1997). \n",
356
+ "\n",
357
+ " doi report-no \n",
358
+ "0 10.1103/PhysRevD.76.013009 ANL-HEP-PR-07-12 \\\n",
359
+ "1 None None \n",
360
+ "2 None None \n",
361
+ "3 None None \n",
362
+ "4 None None \n",
363
+ "... ... ... \n",
364
+ "2268247 10.1103/PhysRevB.54.15530 None \n",
365
+ "2268248 10.1109/77.620744 None \n",
366
+ "2268249 10.1103/PhysRevB.56.11989 None \n",
367
+ "2268250 10.1143/JPSJ.65.3131 None \n",
368
+ "2268251 10.1143/JPSJ.66.204 None \n",
369
+ "\n",
370
+ " categories \n",
371
+ "0 hep-ph \\\n",
372
+ "1 math.CO cs.CG \n",
373
+ "2 physics.gen-ph \n",
374
+ "3 math.CO \n",
375
+ "4 math.CA math.FA \n",
376
+ "... ... \n",
377
+ "2268247 supr-con cond-mat.supr-con \n",
378
+ "2268248 supr-con cond-mat.supr-con \n",
379
+ "2268249 supr-con cond-mat.supr-con \n",
380
+ "2268250 supr-con cond-mat.supr-con \n",
381
+ "2268251 supr-con cond-mat.supr-con \n",
382
+ "\n",
383
+ " license \n",
384
+ "0 None \\\n",
385
+ "1 http://arxiv.org/licenses/nonexclusive-distrib... \n",
386
+ "2 None \n",
387
+ "3 None \n",
388
+ "4 None \n",
389
+ "... ... \n",
390
+ "2268247 None \n",
391
+ "2268248 None \n",
392
+ "2268249 None \n",
393
+ "2268250 None \n",
394
+ "2268251 None \n",
395
+ "\n",
396
+ " abstract \n",
397
+ "0 A fully differential calculation in perturba... \\\n",
398
+ "1 We describe a new algorithm, the $(k,\\ell)$-... \n",
399
+ "2 The evolution of Earth-Moon system is descri... \n",
400
+ "3 We show that a determinant of Stirling cycle... \n",
401
+ "4 In this paper we show how to compute the $\\L... \n",
402
+ "... ... \n",
403
+ "2268247 We report on measurements of the angular dep... \n",
404
+ "2268248 The non-linear microwave surface impedance o... \n",
405
+ "2268249 The vortex contribution to the dc field (H) ... \n",
406
+ "2268250 We show that the density of states in an ani... \n",
407
+ "2268251 The Ginzburg Landau theory for d_{x^2-y^2}-w... \n",
408
+ "\n",
409
+ " versions update_date \n",
410
+ "0 [{'version': 'v1', 'created': 'Mon, 2 Apr 2007... 2008-11-26 \\\n",
411
+ "1 [{'version': 'v1', 'created': 'Sat, 31 Mar 200... 2008-12-13 \n",
412
+ "2 [{'version': 'v1', 'created': 'Sun, 1 Apr 2007... 2008-01-13 \n",
413
+ "3 [{'version': 'v1', 'created': 'Sat, 31 Mar 200... 2007-05-23 \n",
414
+ "4 [{'version': 'v1', 'created': 'Mon, 2 Apr 2007... 2013-10-15 \n",
415
+ "... ... ... \n",
416
+ "2268247 [{'version': 'v1', 'created': 'Mon, 26 Aug 199... 2009-10-30 \n",
417
+ "2268248 [{'version': 'v1', 'created': 'Sat, 31 Aug 199... 2016-11-18 \n",
418
+ "2268249 [{'version': 'v1', 'created': 'Tue, 3 Sep 1996... 2009-10-30 \n",
419
+ "2268250 [{'version': 'v1', 'created': 'Wed, 18 Sep 199... 2009-10-30 \n",
420
+ "2268251 [{'version': 'v1', 'created': 'Wed, 25 Sep 199... 2009-10-30 \n",
421
+ "\n",
422
+ " authors_parsed \n",
423
+ "0 [[Balázs, C., ], [Berger, E. L., ], [Nadolsky,... \n",
424
+ "1 [[Streinu, Ileana, ], [Theran, Louis, ]] \n",
425
+ "2 [[Pan, Hongjun, ]] \n",
426
+ "3 [[Callan, David, ]] \n",
427
+ "4 [[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]] \n",
428
+ "... ... \n",
429
+ "2268247 [[Prozorov, R., ], [Konczykowski, M., ], [Schm... \n",
430
+ "2268248 [[Choudhury, Durga P., , Physics Department, N... \n",
431
+ "2268249 [[Willemsen, Balam A., , Physics Department,\\n... \n",
432
+ "2268250 [[Hasegawa, Yasumasa, , Himeji Institute of Te... \n",
433
+ "2268251 [[Enomoto, Naoki, , Okayama Univ.], [Ichioka, ... \n",
434
+ "\n",
435
+ "[2268252 rows x 14 columns]"
436
+ ]
437
+ },
438
+ "execution_count": 9,
439
+ "metadata": {},
440
+ "output_type": "execute_result"
441
+ }
442
+ ],
443
+ "source": [
444
+ "df"
445
+ ]
446
+ },
447
+ {
448
+ "cell_type": "code",
449
+ "execution_count": 10,
450
+ "metadata": {},
451
+ "outputs": [],
452
+ "source": [
453
+ "df.to_parquet('/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/arxiv_papers_raw.parquet.gzip', compression='gzip')"
454
+ ]
455
+ },
456
+ {
457
+ "cell_type": "code",
458
+ "execution_count": 13,
459
+ "metadata": {},
460
+ "outputs": [
461
+ {
462
+ "data": {
463
+ "text/html": [
464
+ "<div>\n",
465
+ "<style scoped>\n",
466
+ " .dataframe tbody tr th:only-of-type {\n",
467
+ " vertical-align: middle;\n",
468
+ " }\n",
469
+ "\n",
470
+ " .dataframe tbody tr th {\n",
471
+ " vertical-align: top;\n",
472
+ " }\n",
473
+ "\n",
474
+ " .dataframe thead th {\n",
475
+ " text-align: right;\n",
476
+ " }\n",
477
+ "</style>\n",
478
+ "<table border=\"1\" class=\"dataframe\">\n",
479
+ " <thead>\n",
480
+ " <tr style=\"text-align: right;\">\n",
481
+ " <th></th>\n",
482
+ " <th>id</th>\n",
483
+ " <th>submitter</th>\n",
484
+ " <th>authors</th>\n",
485
+ " <th>title</th>\n",
486
+ " <th>comments</th>\n",
487
+ " <th>journal-ref</th>\n",
488
+ " <th>doi</th>\n",
489
+ " <th>report-no</th>\n",
490
+ " <th>categories</th>\n",
491
+ " <th>license</th>\n",
492
+ " <th>abstract</th>\n",
493
+ " <th>versions</th>\n",
494
+ " <th>update_date</th>\n",
495
+ " <th>authors_parsed</th>\n",
496
+ " </tr>\n",
497
+ " </thead>\n",
498
+ " <tbody>\n",
499
+ " <tr>\n",
500
+ " <th>0</th>\n",
501
+ " <td>0704.0001</td>\n",
502
+ " <td>Pavel Nadolsky</td>\n",
503
+ " <td>C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-...</td>\n",
504
+ " <td>Calculation of prompt diphoton production cros...</td>\n",
505
+ " <td>37 pages, 15 figures; published version</td>\n",
506
+ " <td>Phys.Rev.D76:013009,2007</td>\n",
507
+ " <td>10.1103/PhysRevD.76.013009</td>\n",
508
+ " <td>ANL-HEP-PR-07-12</td>\n",
509
+ " <td>hep-ph</td>\n",
510
+ " <td>None</td>\n",
511
+ " <td>A fully differential calculation in perturba...</td>\n",
512
+ " <td>[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...</td>\n",
513
+ " <td>2008-11-26</td>\n",
514
+ " <td>[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,...</td>\n",
515
+ " </tr>\n",
516
+ " <tr>\n",
517
+ " <th>1</th>\n",
518
+ " <td>0704.0002</td>\n",
519
+ " <td>Louis Theran</td>\n",
520
+ " <td>Ileana Streinu and Louis Theran</td>\n",
521
+ " <td>Sparsity-certifying Graph Decompositions</td>\n",
522
+ " <td>To appear in Graphs and Combinatorics</td>\n",
523
+ " <td>None</td>\n",
524
+ " <td>None</td>\n",
525
+ " <td>None</td>\n",
526
+ " <td>math.CO cs.CG</td>\n",
527
+ " <td>http://arxiv.org/licenses/nonexclusive-distrib...</td>\n",
528
+ " <td>We describe a new algorithm, the $(k,\\ell)$-...</td>\n",
529
+ " <td>[{'version': 'v1', 'created': 'Sat, 31 Mar 200...</td>\n",
530
+ " <td>2008-12-13</td>\n",
531
+ " <td>[[Streinu, Ileana, ], [Theran, Louis, ]]</td>\n",
532
+ " </tr>\n",
533
+ " <tr>\n",
534
+ " <th>2</th>\n",
535
+ " <td>0704.0003</td>\n",
536
+ " <td>Hongjun Pan</td>\n",
537
+ " <td>Hongjun Pan</td>\n",
538
+ " <td>The evolution of the Earth-Moon system based o...</td>\n",
539
+ " <td>23 pages, 3 figures</td>\n",
540
+ " <td>None</td>\n",
541
+ " <td>None</td>\n",
542
+ " <td>None</td>\n",
543
+ " <td>physics.gen-ph</td>\n",
544
+ " <td>None</td>\n",
545
+ " <td>The evolution of Earth-Moon system is descri...</td>\n",
546
+ " <td>[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...</td>\n",
547
+ " <td>2008-01-13</td>\n",
548
+ " <td>[[Pan, Hongjun, ]]</td>\n",
549
+ " </tr>\n",
550
+ " <tr>\n",
551
+ " <th>3</th>\n",
552
+ " <td>0704.0004</td>\n",
553
+ " <td>David Callan</td>\n",
554
+ " <td>David Callan</td>\n",
555
+ " <td>A determinant of Stirling cycle numbers counts...</td>\n",
556
+ " <td>11 pages</td>\n",
557
+ " <td>None</td>\n",
558
+ " <td>None</td>\n",
559
+ " <td>None</td>\n",
560
+ " <td>math.CO</td>\n",
561
+ " <td>None</td>\n",
562
+ " <td>We show that a determinant of Stirling cycle...</td>\n",
563
+ " <td>[{'version': 'v1', 'created': 'Sat, 31 Mar 200...</td>\n",
564
+ " <td>2007-05-23</td>\n",
565
+ " <td>[[Callan, David, ]]</td>\n",
566
+ " </tr>\n",
567
+ " <tr>\n",
568
+ " <th>4</th>\n",
569
+ " <td>0704.0005</td>\n",
570
+ " <td>Alberto Torchinsky</td>\n",
571
+ " <td>Wael Abu-Shammala and Alberto Torchinsky</td>\n",
572
+ " <td>From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a...</td>\n",
573
+ " <td>None</td>\n",
574
+ " <td>Illinois J. Math. 52 (2008) no.2, 681-689</td>\n",
575
+ " <td>None</td>\n",
576
+ " <td>None</td>\n",
577
+ " <td>math.CA math.FA</td>\n",
578
+ " <td>None</td>\n",
579
+ " <td>In this paper we show how to compute the $\\L...</td>\n",
580
+ " <td>[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...</td>\n",
581
+ " <td>2013-10-15</td>\n",
582
+ " <td>[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]]</td>\n",
583
+ " </tr>\n",
584
+ " <tr>\n",
585
+ " <th>...</th>\n",
586
+ " <td>...</td>\n",
587
+ " <td>...</td>\n",
588
+ " <td>...</td>\n",
589
+ " <td>...</td>\n",
590
+ " <td>...</td>\n",
591
+ " <td>...</td>\n",
592
+ " <td>...</td>\n",
593
+ " <td>...</td>\n",
594
+ " <td>...</td>\n",
595
+ " <td>...</td>\n",
596
+ " <td>...</td>\n",
597
+ " <td>...</td>\n",
598
+ " <td>...</td>\n",
599
+ " <td>...</td>\n",
600
+ " </tr>\n",
601
+ " <tr>\n",
602
+ " <th>2268247</th>\n",
603
+ " <td>supr-con/9608008</td>\n",
604
+ " <td>Ruslan Prozorov</td>\n",
605
+ " <td>R. Prozorov, M. Konczykowski, B. Schmidt, Y. Y...</td>\n",
606
+ " <td>On the origin of the irreversibility line in t...</td>\n",
607
+ " <td>19 pages, LaTex, 6 PostScript figures; Author'...</td>\n",
608
+ " <td>None</td>\n",
609
+ " <td>10.1103/PhysRevB.54.15530</td>\n",
610
+ " <td>None</td>\n",
611
+ " <td>supr-con cond-mat.supr-con</td>\n",
612
+ " <td>None</td>\n",
613
+ " <td>We report on measurements of the angular dep...</td>\n",
614
+ " <td>[{'version': 'v1', 'created': 'Mon, 26 Aug 199...</td>\n",
615
+ " <td>2009-10-30</td>\n",
616
+ " <td>[[Prozorov, R., ], [Konczykowski, M., ], [Schm...</td>\n",
617
+ " </tr>\n",
618
+ " <tr>\n",
619
+ " <th>2268248</th>\n",
620
+ " <td>supr-con/9609001</td>\n",
621
+ " <td>Durga P. Choudhury</td>\n",
622
+ " <td>Durga P. Choudhury, Balam A. Willemsen, John S...</td>\n",
623
+ " <td>Nonlinear Response of HTSC Thin Film Microwave...</td>\n",
624
+ " <td>4 pages, LaTeX type, Uses IEEE style files, 60...</td>\n",
625
+ " <td>None</td>\n",
626
+ " <td>10.1109/77.620744</td>\n",
627
+ " <td>None</td>\n",
628
+ " <td>supr-con cond-mat.supr-con</td>\n",
629
+ " <td>None</td>\n",
630
+ " <td>The non-linear microwave surface impedance o...</td>\n",
631
+ " <td>[{'version': 'v1', 'created': 'Sat, 31 Aug 199...</td>\n",
632
+ " <td>2016-11-18</td>\n",
633
+ " <td>[[Choudhury, Durga P., , Physics Department, N...</td>\n",
634
+ " </tr>\n",
635
+ " <tr>\n",
636
+ " <th>2268249</th>\n",
637
+ " <td>supr-con/9609002</td>\n",
638
+ " <td>Durga P. Choudhury</td>\n",
639
+ " <td>Balam A. Willemsen, J. S. Derov and S.Sridhar ...</td>\n",
640
+ " <td>Critical State Flux Penetration and Linear Mic...</td>\n",
641
+ " <td>20 pages, LaTeX type, Uses REVTeX style files,...</td>\n",
642
+ " <td>None</td>\n",
643
+ " <td>10.1103/PhysRevB.56.11989</td>\n",
644
+ " <td>None</td>\n",
645
+ " <td>supr-con cond-mat.supr-con</td>\n",
646
+ " <td>None</td>\n",
647
+ " <td>The vortex contribution to the dc field (H) ...</td>\n",
648
+ " <td>[{'version': 'v1', 'created': 'Tue, 3 Sep 1996...</td>\n",
649
+ " <td>2009-10-30</td>\n",
650
+ " <td>[[Willemsen, Balam A., , Physics Department,\\n...</td>\n",
651
+ " </tr>\n",
652
+ " <tr>\n",
653
+ " <th>2268250</th>\n",
654
+ " <td>supr-con/9609003</td>\n",
655
+ " <td>Hasegawa Yasumasa</td>\n",
656
+ " <td>Yasumasa Hasegawa (Himeji Institute of Technol...</td>\n",
657
+ " <td>Density of States and NMR Relaxation Rate in A...</td>\n",
658
+ " <td>7 pages, 4 PostScript Figures, LaTeX, to appea...</td>\n",
659
+ " <td>None</td>\n",
660
+ " <td>10.1143/JPSJ.65.3131</td>\n",
661
+ " <td>None</td>\n",
662
+ " <td>supr-con cond-mat.supr-con</td>\n",
663
+ " <td>None</td>\n",
664
+ " <td>We show that the density of states in an ani...</td>\n",
665
+ " <td>[{'version': 'v1', 'created': 'Wed, 18 Sep 199...</td>\n",
666
+ " <td>2009-10-30</td>\n",
667
+ " <td>[[Hasegawa, Yasumasa, , Himeji Institute of Te...</td>\n",
668
+ " </tr>\n",
669
+ " <tr>\n",
670
+ " <th>2268251</th>\n",
671
+ " <td>supr-con/9609004</td>\n",
672
+ " <td>Masanori Ichioka</td>\n",
673
+ " <td>Naoki Enomoto, Masanori Ichioka and Kazushige ...</td>\n",
674
+ " <td>Ginzburg Landau theory for d-wave pairing and ...</td>\n",
675
+ " <td>12 pages including 8 eps figs, LaTeX with jpsj...</td>\n",
676
+ " <td>J. Phys. Soc. Jpn. 66, 204 (1997).</td>\n",
677
+ " <td>10.1143/JPSJ.66.204</td>\n",
678
+ " <td>None</td>\n",
679
+ " <td>supr-con cond-mat.supr-con</td>\n",
680
+ " <td>None</td>\n",
681
+ " <td>The Ginzburg Landau theory for d_{x^2-y^2}-w...</td>\n",
682
+ " <td>[{'version': 'v1', 'created': 'Wed, 25 Sep 199...</td>\n",
683
+ " <td>2009-10-30</td>\n",
684
+ " <td>[[Enomoto, Naoki, , Okayama Univ.], [Ichioka, ...</td>\n",
685
+ " </tr>\n",
686
+ " </tbody>\n",
687
+ "</table>\n",
688
+ "<p>2268252 rows × 14 columns</p>\n",
689
+ "</div>"
690
+ ],
691
+ "text/plain": [
692
+ " id submitter \n",
693
+ "0 0704.0001 Pavel Nadolsky \\\n",
694
+ "1 0704.0002 Louis Theran \n",
695
+ "2 0704.0003 Hongjun Pan \n",
696
+ "3 0704.0004 David Callan \n",
697
+ "4 0704.0005 Alberto Torchinsky \n",
698
+ "... ... ... \n",
699
+ "2268247 supr-con/9608008 Ruslan Prozorov \n",
700
+ "2268248 supr-con/9609001 Durga P. Choudhury \n",
701
+ "2268249 supr-con/9609002 Durga P. Choudhury \n",
702
+ "2268250 supr-con/9609003 Hasegawa Yasumasa \n",
703
+ "2268251 supr-con/9609004 Masanori Ichioka \n",
704
+ "\n",
705
+ " authors \n",
706
+ "0 C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-... \\\n",
707
+ "1 Ileana Streinu and Louis Theran \n",
708
+ "2 Hongjun Pan \n",
709
+ "3 David Callan \n",
710
+ "4 Wael Abu-Shammala and Alberto Torchinsky \n",
711
+ "... ... \n",
712
+ "2268247 R. Prozorov, M. Konczykowski, B. Schmidt, Y. Y... \n",
713
+ "2268248 Durga P. Choudhury, Balam A. Willemsen, John S... \n",
714
+ "2268249 Balam A. Willemsen, J. S. Derov and S.Sridhar ... \n",
715
+ "2268250 Yasumasa Hasegawa (Himeji Institute of Technol... \n",
716
+ "2268251 Naoki Enomoto, Masanori Ichioka and Kazushige ... \n",
717
+ "\n",
718
+ " title \n",
719
+ "0 Calculation of prompt diphoton production cros... \\\n",
720
+ "1 Sparsity-certifying Graph Decompositions \n",
721
+ "2 The evolution of the Earth-Moon system based o... \n",
722
+ "3 A determinant of Stirling cycle numbers counts... \n",
723
+ "4 From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a... \n",
724
+ "... ... \n",
725
+ "2268247 On the origin of the irreversibility line in t... \n",
726
+ "2268248 Nonlinear Response of HTSC Thin Film Microwave... \n",
727
+ "2268249 Critical State Flux Penetration and Linear Mic... \n",
728
+ "2268250 Density of States and NMR Relaxation Rate in A... \n",
729
+ "2268251 Ginzburg Landau theory for d-wave pairing and ... \n",
730
+ "\n",
731
+ " comments \n",
732
+ "0 37 pages, 15 figures; published version \\\n",
733
+ "1 To appear in Graphs and Combinatorics \n",
734
+ "2 23 pages, 3 figures \n",
735
+ "3 11 pages \n",
736
+ "4 None \n",
737
+ "... ... \n",
738
+ "2268247 19 pages, LaTex, 6 PostScript figures; Author'... \n",
739
+ "2268248 4 pages, LaTeX type, Uses IEEE style files, 60... \n",
740
+ "2268249 20 pages, LaTeX type, Uses REVTeX style files,... \n",
741
+ "2268250 7 pages, 4 PostScript Figures, LaTeX, to appea... \n",
742
+ "2268251 12 pages including 8 eps figs, LaTeX with jpsj... \n",
743
+ "\n",
744
+ " journal-ref \n",
745
+ "0 Phys.Rev.D76:013009,2007 \\\n",
746
+ "1 None \n",
747
+ "2 None \n",
748
+ "3 None \n",
749
+ "4 Illinois J. Math. 52 (2008) no.2, 681-689 \n",
750
+ "... ... \n",
751
+ "2268247 None \n",
752
+ "2268248 None \n",
753
+ "2268249 None \n",
754
+ "2268250 None \n",
755
+ "2268251 J. Phys. Soc. Jpn. 66, 204 (1997). \n",
756
+ "\n",
757
+ " doi report-no \n",
758
+ "0 10.1103/PhysRevD.76.013009 ANL-HEP-PR-07-12 \\\n",
759
+ "1 None None \n",
760
+ "2 None None \n",
761
+ "3 None None \n",
762
+ "4 None None \n",
763
+ "... ... ... \n",
764
+ "2268247 10.1103/PhysRevB.54.15530 None \n",
765
+ "2268248 10.1109/77.620744 None \n",
766
+ "2268249 10.1103/PhysRevB.56.11989 None \n",
767
+ "2268250 10.1143/JPSJ.65.3131 None \n",
768
+ "2268251 10.1143/JPSJ.66.204 None \n",
769
+ "\n",
770
+ " categories \n",
771
+ "0 hep-ph \\\n",
772
+ "1 math.CO cs.CG \n",
773
+ "2 physics.gen-ph \n",
774
+ "3 math.CO \n",
775
+ "4 math.CA math.FA \n",
776
+ "... ... \n",
777
+ "2268247 supr-con cond-mat.supr-con \n",
778
+ "2268248 supr-con cond-mat.supr-con \n",
779
+ "2268249 supr-con cond-mat.supr-con \n",
780
+ "2268250 supr-con cond-mat.supr-con \n",
781
+ "2268251 supr-con cond-mat.supr-con \n",
782
+ "\n",
783
+ " license \n",
784
+ "0 None \\\n",
785
+ "1 http://arxiv.org/licenses/nonexclusive-distrib... \n",
786
+ "2 None \n",
787
+ "3 None \n",
788
+ "4 None \n",
789
+ "... ... \n",
790
+ "2268247 None \n",
791
+ "2268248 None \n",
792
+ "2268249 None \n",
793
+ "2268250 None \n",
794
+ "2268251 None \n",
795
+ "\n",
796
+ " abstract \n",
797
+ "0 A fully differential calculation in perturba... \\\n",
798
+ "1 We describe a new algorithm, the $(k,\\ell)$-... \n",
799
+ "2 The evolution of Earth-Moon system is descri... \n",
800
+ "3 We show that a determinant of Stirling cycle... \n",
801
+ "4 In this paper we show how to compute the $\\L... \n",
802
+ "... ... \n",
803
+ "2268247 We report on measurements of the angular dep... \n",
804
+ "2268248 The non-linear microwave surface impedance o... \n",
805
+ "2268249 The vortex contribution to the dc field (H) ... \n",
806
+ "2268250 We show that the density of states in an ani... \n",
807
+ "2268251 The Ginzburg Landau theory for d_{x^2-y^2}-w... \n",
808
+ "\n",
809
+ " versions update_date \n",
810
+ "0 [{'version': 'v1', 'created': 'Mon, 2 Apr 2007... 2008-11-26 \\\n",
811
+ "1 [{'version': 'v1', 'created': 'Sat, 31 Mar 200... 2008-12-13 \n",
812
+ "2 [{'version': 'v1', 'created': 'Sun, 1 Apr 2007... 2008-01-13 \n",
813
+ "3 [{'version': 'v1', 'created': 'Sat, 31 Mar 200... 2007-05-23 \n",
814
+ "4 [{'version': 'v1', 'created': 'Mon, 2 Apr 2007... 2013-10-15 \n",
815
+ "... ... ... \n",
816
+ "2268247 [{'version': 'v1', 'created': 'Mon, 26 Aug 199... 2009-10-30 \n",
817
+ "2268248 [{'version': 'v1', 'created': 'Sat, 31 Aug 199... 2016-11-18 \n",
818
+ "2268249 [{'version': 'v1', 'created': 'Tue, 3 Sep 1996... 2009-10-30 \n",
819
+ "2268250 [{'version': 'v1', 'created': 'Wed, 18 Sep 199... 2009-10-30 \n",
820
+ "2268251 [{'version': 'v1', 'created': 'Wed, 25 Sep 199... 2009-10-30 \n",
821
+ "\n",
822
+ " authors_parsed \n",
823
+ "0 [[Balázs, C., ], [Berger, E. L., ], [Nadolsky,... \n",
824
+ "1 [[Streinu, Ileana, ], [Theran, Louis, ]] \n",
825
+ "2 [[Pan, Hongjun, ]] \n",
826
+ "3 [[Callan, David, ]] \n",
827
+ "4 [[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]] \n",
828
+ "... ... \n",
829
+ "2268247 [[Prozorov, R., ], [Konczykowski, M., ], [Schm... \n",
830
+ "2268248 [[Choudhury, Durga P., , Physics Department, N... \n",
831
+ "2268249 [[Willemsen, Balam A., , Physics Department,\\n... \n",
832
+ "2268250 [[Hasegawa, Yasumasa, , Himeji Institute of Te... \n",
833
+ "2268251 [[Enomoto, Naoki, , Okayama Univ.], [Ichioka, ... \n",
834
+ "\n",
835
+ "[2268252 rows x 14 columns]"
836
+ ]
837
+ },
838
+ "execution_count": 13,
839
+ "metadata": {},
840
+ "output_type": "execute_result"
841
+ }
842
+ ],
843
+ "source": [
844
+ "df"
845
+ ]
846
+ }
847
+ ],
848
+ "metadata": {
849
+ "kernelspec": {
850
+ "display_name": "Python 3.11.3 ('arxiv-env': venv)",
851
+ "language": "python",
852
+ "name": "python3"
853
+ },
854
+ "language_info": {
855
+ "codemirror_mode": {
856
+ "name": "ipython",
857
+ "version": 3
858
+ },
859
+ "file_extension": ".py",
860
+ "mimetype": "text/x-python",
861
+ "name": "python",
862
+ "nbconvert_exporter": "python",
863
+ "pygments_lexer": "ipython3",
864
+ "version": "3.11.4"
865
+ },
866
+ "orig_nbformat": 4,
867
+ "vscode": {
868
+ "interpreter": {
869
+ "hash": "aae17c2ae2f38cc6f211be9b71a2aa280701d8462782cbc1f67caa83a1603363"
870
+ }
871
+ }
872
+ },
873
+ "nbformat": 4,
874
+ "nbformat_minor": 2
875
+ }
reports/.gitkeep ADDED
File without changes
reports/Visualizations/DisciplinasMasPopulares.png ADDED
reports/Visualizations/DisciplinasMenosPopulares.png ADDED
reports/Visualizations/FrequenciaPalabras.png ADDED
reports/Visualizations/FrequenciaPalabrasMenosFreq.png ADDED
reports/Visualizations/HsitogramAbstractsLen.png ADDED
reports/Visualizations/Papes_release_year_by_Computer Science.png ADDED
reports/Visualizations/Papes_release_year_by_Economics.png ADDED
Systems Science.png RENAMED
File without changes
reports/Visualizations/Papes_release_year_by_Mathematics.png ADDED
reports/Visualizations/Papes_release_year_by_Physics.png ADDED
reports/Visualizations/Papes_release_year_by_Quantitative Biology.png ADDED
reports/Visualizations/Papes_release_year_by_Quantitative Finance.png ADDED
reports/Visualizations/Papes_release_year_by_Statistics.png ADDED
reports/Visualizations/PublicacionPapersAnno.png ADDED
reports/Visualizations/TopCatsPapers.png ADDED
reports/figures/.gitkeep ADDED
File without changes
reports/figures/arxiv-logo.jpg ADDED
reports/figures/profile.jpeg ADDED
requirements.txt ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ altair==5.0.1
2
+ attrs==23.1.0
3
+ beautifulsoup4==4.12.2
4
+ blinker==1.6.2
5
+ cachetools==5.3.1
6
+ certifi==2023.5.7
7
+ charset-normalizer==3.2.0
8
+ click==8.1.4
9
+ contourpy==1.1.0
10
+ cramjam==2.6.2
11
+ cycler==0.11.0
12
+ decorator==5.1.1
13
+ Faker==19.1.0
14
+ fastparquet==2023.7.0
15
+ favicon==0.7.0
16
+ fonttools==4.41.0
17
+ fsspec==2023.6.0
18
+ gensim==4.3.1
19
+ gitdb==4.0.10
20
+ GitPython==3.1.32
21
+ htbuilder==0.6.1
22
+ idna==3.4
23
+ importlib-metadata==6.8.0
24
+ importlib-resources==6.0.0
25
+ Jinja2==3.1.2
26
+ jsonschema==4.18.2
27
+ jsonschema-specifications==2023.6.1
28
+ kiwisolver==1.4.4
29
+ lxml==4.9.3
30
+ Markdown==3.4.3
31
+ markdown-it-py==3.0.0
32
+ markdownlit==0.0.7
33
+ MarkupSafe==2.1.3
34
+ matplotlib==3.7.2
35
+ mdurl==0.1.2
36
+ more-itertools==9.1.0
37
+ numpy==1.25.1
38
+ packaging==23.1
39
+ pandas==2.0.3
40
+ Pillow==9.5.0
41
+ protobuf==4.23.4
42
+ pyarrow==12.0.1
43
+ pydeck==0.8.1b0
44
+ Pygments==2.15.1
45
+ pymdown-extensions==10.0.1
46
+ Pympler==1.0.1
47
+ pyparsing==3.0.9
48
+ python-dateutil==2.8.2
49
+ pytz==2023.3
50
+ pytz-deprecation-shim==0.1.0.post0
51
+ PyYAML==6.0
52
+ referencing==0.29.1
53
+ requests==2.31.0
54
+ rich==13.4.2
55
+ rpds-py==0.8.10
56
+ scipy==1.11.1
57
+ six==1.16.0
58
+ smart-open==6.3.0
59
+ smmap==5.0.0
60
+ soupsieve==2.4.1
61
+ st-annotated-text==4.0.0
62
+ streamlit==1.24.1
63
+ streamlit-camera-input-live==0.2.0
64
+ streamlit-card==0.0.61
65
+ streamlit-embedcode==0.1.2
66
+ streamlit-extras==0.2.7
67
+ streamlit-faker==0.0.2
68
+ streamlit-image-coordinates==0.1.5
69
+ streamlit-keyup==0.2.0
70
+ streamlit-toggle-switch==1.0.2
71
+ streamlit-vertical-slider==1.0.2
72
+ tenacity==8.2.2
73
+ toml==0.10.2
74
+ toolz==0.12.0
75
+ tornado==6.3.2
76
+ typing_extensions==4.7.1
77
+ tzdata==2023.3
78
+ tzlocal==4.3.1
79
+ urllib3==2.0.3
80
+ validators==0.20.0
81
+ zipp==3.16.0
src/app.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from streamlit_extras.no_default_selectbox import selectbox
3
+ import pandas as pd
4
+ from PIL import Image
5
+ from random import choices
6
+ import zipfile
7
+ import os
8
+
9
+ from gensim.corpora import Dictionary
10
+ from gensim.models import TfidfModel
11
+ from gensim.similarities import SparseMatrixSimilarity
12
+
13
+ from models.utils.constants import user_requests_tests, TEST_INPUTS
14
+ from models.utils.mlutilities import gensim_tokenizer, get_recomendations_metadata
15
+
16
+
17
+ st.set_page_config(page_title="Papers Recomendation App")
18
+
19
+ model_name = "GrammarGuru"
20
+
21
def folder_exists(folder_path: str) -> bool:
    """Return True when *folder_path* names an existing directory.

    ``os.path.isdir`` already returns False for paths that do not exist,
    so no separate existence check is required.

    Args:
        folder_path: Path to test.

    Returns:
        True if the path exists and is a directory, otherwise False.
    """
    return os.path.isdir(folder_path)
26
+
27
+
28
+
29
def unzip_file(zip_file_path: str, modelname: str = model_name):
    """Extract the model archive into ``models/`` unless already unpacked.

    The extraction is skipped when ``models/<modelname>`` is already a
    directory. On success a confirmation message is written to the page.

    Args:
        zip_file_path: Path of the zip archive holding the model binaries.
        modelname: Name of the folder under ``models/`` whose presence
            means the archive was already extracted.

    Raises:
        FileNotFoundError: If *zip_file_path* does not exist.
        zipfile.BadZipFile: If *zip_file_path* is not a valid zip archive.
    """
    if folder_exists(f"models/{modelname}"):
        return

    try:
        with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
            zip_ref.extractall("models/")
        st.write("Model Zip file Extraction completed!.")
    except FileNotFoundError as err:
        # Raising a bare string is itself a TypeError; raise a real
        # exception instance and keep the original cause attached.
        raise FileNotFoundError(
            "Error: The specified zip file was not found."
        ) from err
    except zipfile.BadZipFile as err:
        raise zipfile.BadZipFile(
            "Error: The specified file is not a valid zip file."
        ) from err
39
+
40
+
41
+ hide_default_format = """
42
+ <style>
43
+ #MainMenu {visibility: hidden; }
44
+ footer {visibility: hidden;}
45
+ </style>
46
+ """
47
+ st.markdown(hide_default_format, unsafe_allow_html=True)
48
+
49
+ image = Image.open('reports/figures/arxiv-logo.jpg')
50
+
51
+ st.sidebar.image(image , caption="Arxiv Papers Recomendation System",width = 256)
52
+ app_mode = st.sidebar.selectbox("Choose app mode", ["Generate Recomendations", "About this Project", "About Me"])
53
+
54
+ st.title("ResearchRadar")
55
+
56
+
57
@st.cache_data
def load_papers_corpus(path: str):
    """Read the papers corpus stored as a parquet file at *path*."""
    papers = pd.read_parquet(path)
    return papers
60
+
61
@st.cache_resource
def load_dict(path: str):
    """Load the gensim ``Dictionary`` saved at *path*."""
    return Dictionary.load(path)
65
+
66
@st.cache_resource
def load_model(path: str):
    """Load the gensim ``TfidfModel`` saved at *path*."""
    return TfidfModel.load(path)
70
+
71
@st.cache_resource
def load_sparse_matrix(path: str):
    """Load the gensim ``SparseMatrixSimilarity`` index saved at *path*."""
    return SparseMatrixSimilarity.load(path)
75
+
76
+
77
+ if app_mode == "Generate Recomendations":
78
+ welcome_text = """
79
+ <div style="text-align: justify">Welcome to my paper recommendation project! This App is here to simplify your search for relevant scientific and academic papers. Our intelligent recommendation system, powered by <strong>Machine Learning and natural language processing</strong>, analyzes keywords, abstracts, titles, authors, and more to provide personalized suggestions based on your interests. Say goodbye to information overload and let us guide you towards new horizons in your quest for knowledge.
80
+ """
81
+ subjects = """
82
+ Our model is trained to recommend papers in various domains, including:
83
+ - Mathematics
84
+ - Statistics
85
+ - Electrical Engineering
86
+ - Quantitative Biology
87
+ - Economics
88
+
89
+ Say goodbye to information overload and let us guide you towards **new horizons** in your quest for knowledge. Join us and discover a streamlined way to **explore, learn, and stay ahead** in your field. Welcome aboard!
90
+ """
91
+ st.markdown(welcome_text, unsafe_allow_html=True)
92
+ st.markdown(subjects)
93
+ st.divider()
94
+
95
+
96
+ with st.container():
97
+ examples = """
98
+ ### Examples of prompts
99
+ - "Can you recommend papers that explore the application of deep learning in computer vision for object detection, image segmentation, and video analysis?"
100
+ - "Can you recommend papers that explore the use of deep reinforcement learning for autonomous driving, including perception, planning, and control?"
101
+ - "Could you provide papers on image and video compression algorithms based on the latest video coding standards, such as HEVC and AV1?"
102
+ - "Can you suggest recent papers on behavioral economics that investigate the role of emotions and biases in decision-making under uncertainty, particularly in the context of financial markets?"
103
+
104
+ """
105
+ st.markdown(examples)
106
+ st.divider()
107
+
108
+
109
+ with st.spinner('The model binaries are unziping ...'):
110
+ zip_file_path = "models/GrammarGuru.zip"
111
+ unzip_file(zip_file_path)
112
+
113
+ with st.spinner('The model binaries are loading, please wait...'):
114
+
115
+ df = load_papers_corpus("models/GrammarGuru/data/GrammarGuru.parquet.gzip")
116
+ dictionary = load_dict("models/GrammarGuru/dictionaries/GrammarGuru.dict")
117
+ model = load_model("models/GrammarGuru/tdidf/GrammarGuru.model")
118
+ matrix = load_sparse_matrix("models/GrammarGuru/similarities_matrix/GrammarGuru")
119
+ st.success('Models Loaded, yei!', icon="🚀")
120
+
121
+ st.markdown("#### Generate Recommendations")
122
+ # recs_number = st.slider("Enter the number of papers you need", min_value=1, max_value=10, value=3)
123
+ query = st.text_input("Enter the description of the Paper you need (the more descriptive, the better)", value="")
124
+
125
+ if query != "":
126
+ cleaned_prompt = gensim_tokenizer(query)
127
+
128
+ with st.spinner('Generating Recommendations ... '):
129
+ results_df = get_recomendations_metadata(query=query, df=df, n=3, dictionary=dictionary, index=matrix, tfidf_model=model)
130
+
131
+ ids = results_df['id'].to_list()
132
+ titles = results_df['title'].to_list()
133
+ authors = results_df['authors'].to_list()
134
+ categories = results_df['categories'].to_list()
135
+ abstracts = results_df['abstract'].to_list()
136
+ release_date = results_df['update_date'].to_list()
137
+
138
+ results = list(zip(ids, titles, authors, categories, abstracts, release_date))
139
+
140
+ st.write("Your top 3 papers:")
141
+ for result in results:
142
+ with st.container():
143
+ col1, col2 = st.columns([1,3])
144
+
145
+ with col1:
146
+ st.markdown(f"**Title:**")
147
+ st.markdown(f"**Author:**")
148
+ st.markdown(f"**Categories:**")
149
+ st.markdown(f"**release_date:**")
150
+ st.markdown(f"**Abstract:**")
151
+
152
+
153
+ with col2:
154
+ st.write(f"Title: {result[1]}")
155
+ st.write(f"Author: {result[2]}")
156
+ st.write(f"Categories: {result[3]}")
157
+ st.write(f"release_date: {result[5]}")
158
+ st.write(f"Abstract: {result[4]}")
159
+ st.markdown(f"""[Paper Link](https://arxiv.org/abs/{result[0]})""")
160
+ st.divider()
161
+ st.balloons()
162
+
163
+ else:
164
+ st.write("Please enter your prompt :)")
165
+
166
+
167
+
168
+
169
+
170
+ elif app_mode == "About this Project":
171
+ intro_text = """
172
+ Welcome to my paper recommendation project! This application aims to simplify and speed up the process of finding relevant scientific and academic papers. It utilizes Machine Learning techniques and natural language processing to provide an effective solution for students, researchers, and general users.
173
+
174
+ ### Key Features
175
+
176
+ - **Intelligent Recommendation System:** The application uses advanced algorithms to analyze keywords, abstracts, titles, authors, and other metadata associated with each paper.
177
+ - **Efficient Discovery Process:** By leveraging machine learning, the system identifies and suggests the most relevant papers based on the user's interests and areas of study.
178
+ - **Comprehensive Analysis:** The recommendation system performs an exhaustive analysis of various aspects of each paper to ensure accurate and targeted recommendations.
179
+ - **Time-saving Solution:** Instead of manually searching through vast amounts of information, users can rely on this application to streamline the paper discovery process.
180
+
181
+ ### Available Models
182
+
183
+ - SemanticSherlock: trained on 100% of the data
184
+ - LanguageLiberator: trained on 75% of the data
185
+ - TextualTango: trained on 50% of the data
186
+ - GrammarGuru: trained on 25% of the data **(Deployed Version)**
187
+
188
+ **Note:** Due to resource limitations on the free tier of Streamlit, only the GrammarGuru version of the model is available for deployment.
189
+
190
+
191
+ ### Benefits
192
+
193
+ - **Saves Time and Effort:** With the application's intelligent algorithms, users can avoid the challenges and time-consuming nature of searching for papers on their own.
194
+ - **Increased Relevance:** By considering keywords, abstracts, titles, authors, and other metadata, the recommendation system provides users with highly relevant paper suggestions.
195
+ - **Tailored to User Interests:** The system takes into account each user's interests and areas of study, ensuring that the recommended papers align with their specific needs.
196
+ - **Accessible to All Users:** Whether you are a student, researcher, or general user, this application is designed to cater to a wide range of users' needs.
197
+
198
+ ### Get Started
199
+
200
+ Explore, discover, and reach new horizons in your search for knowledge with our paper recommendation application. Simplify your journey to finding relevant papers and stay ahead in your field.
201
+
202
+ Take a look to this proyect in my [GitHub Repo](https://github.com/LewisPons/arxiv-paper-recommender-system)
203
+ """
204
+
205
+
206
+
207
+ st.markdown(intro_text)
208
+
209
+
210
+
211
+ elif app_mode == "About Me":
212
+ st.title('About Me')
213
+ mkdn = """
214
+ <p style="text-align: justify;">Hey there! I'm <strong>Luis Morales</strong>, a passionate data professional with a background in Actuarial Sciences and expertise in Data Engineering and Machine Learning. I love diving into complex data projects and helping organizations unlock the power of their data. From designing robust data pipelines to building powerful ML models, I enjoy the thrill of turning raw data into actionable insights. With my coding skills in Python and R, I'm always up for tackling challenging projects and learning new technologies along the way.
215
+ Thank you for taking the time to learn a little bit about me!</p>
216
+ """
217
+ st.markdown(mkdn, unsafe_allow_html=True)
218
+ st.success("Feel free to contact me here 👇 ")
219
+
220
+ col1,col2,col3,col4 = st.columns((2,1,2,1))
221
+ col1.markdown('* [LinkedIn](https://www.linkedin.com/in/luis-morales-ponce/)')
222
+ col1.markdown('* [GitHub](https://github.com/LewisPons)')
223
+ image2 = Image.open('reports/figures/profile.jpeg')
224
+ st.image(image2, width=400)
225
+
226
+
src/data/.gitkeep ADDED
File without changes
src/data/__init__.py ADDED
File without changes
src/data/make_dataset.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import click
3
+ import logging
4
+ from pathlib import Path
5
+ from dotenv import find_dotenv, load_dotenv
6
+
7
+
8
@click.command()
@click.argument('input_filepath', type=click.Path(exists=True))
@click.argument('output_filepath', type=click.Path())
def main(input_filepath, output_filepath):
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).
    """
    # Stub implementation: neither path argument is used yet; the command
    # only records that it was invoked.
    logging.getLogger(__name__).info('making final data set from raw data')
17
+
18
+
19
if __name__ == '__main__':
    # Configure root logging at INFO level with a
    # "timestamp - logger name - level - message" layout.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    )

    # Directory two levels above the one containing this file. Not used by
    # this stub, but kept as a handy anchor for locating project files.
    project_dir = Path(__file__).resolve().parents[2]

    # Locate a .env file by searching upward through parent directories and
    # export its entries as environment variables before running the command.
    load_dotenv(find_dotenv())

    main()
src/data/transform_raw_data.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This is a utility script for use in sagemaker
3
+ """
4
+
5
+ import json
6
+ import pandas as pd
7
+ import pyarrow as pa
8
+ import pyarrow.parquet as pq
9
+ import os
10
+ from tqdm import tqdm
11
+
12
# File paths
json_file_path = "/home/studio-lab-user/arxiv-paper-recommender-system/arxiv-metadata-oai-snapshot.json"
parquet_file_path = "/home/studio-lab-user/arxiv-paper-recommender-system/data/processed/arxiv_papers_raw.parquet.gzip"

# Number of JSON records accumulated in memory before they are flushed to disk
batch_size = 10000


def _write_batch(records, root_path):
    """Write one batch of parsed JSON records into the parquet dataset.

    Args:
        records: List of dicts, one per line parsed from the JSON file.
        root_path: Root path handed to ``pq.write_to_dataset``.
    """
    # write_to_dataset expects a pyarrow Table as its first argument, so the
    # records are converted records -> DataFrame -> Table before writing.
    table = pa.Table.from_pandas(pd.DataFrame.from_records(records))
    pq.write_to_dataset(table, root_path=root_path)


# Create the parent directory if it doesn't exist
parent_dir = os.path.dirname(parquet_file_path)
os.makedirs(parent_dir, exist_ok=True)

# Stream the file line by line (one JSON document per line) so the whole
# snapshot never has to be held in memory at once.
# Explicit encoding: do not depend on the platform's default text encoding.
with open(json_file_path, 'r', encoding='utf-8') as file:
    # Records waiting to be flushed, and the running count of lines read
    arxiv_data = []
    processed_count = 0

    for line in tqdm(file):
        arxiv_data.append(json.loads(line))
        processed_count += 1

        # Flush every full batch, then start a fresh buffer
        if processed_count % batch_size == 0:
            _write_batch(arxiv_data, parquet_file_path)
            arxiv_data = []

# Flush the final, partial batch (if any). The original code passed the output
# path string in place of the table here, so these records were never
# converted or written correctly.
if arxiv_data:
    _write_batch(arxiv_data, parquet_file_path)
src/features/.gitkeep ADDED
File without changes
src/features/__init__.py ADDED
File without changes
src/features/build_features.py ADDED
File without changes
src/models/.gitkeep ADDED
File without changes
src/models/__init__.py ADDED
File without changes
src/models/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (182 Bytes). View file
 
src/models/configs.yaml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ GensimConfig:
3
+ Large:
4
+ ModelName: SemanticSherlock
5
+ DataSetFracSplit: null
6
+ RandomSeedSplit: null
7
+ SubLarge:
8
+ ModelName: LanguageLiberator
9
+ DataSetFracSplit: 0.75
10
+ RandomSeedSplit: 98765
11
+ Medium:
12
+ ModelName: TextualTango
13
+ DataSetFracSplit: 0.5
14
+ RandomSeedSplit: 42355
15
+ Small:
16
+ ModelName: GrammarGuru
17
+ DataSetFracSplit: 0.25
18
+ RandomSeedSplit: 76438
src/models/gensim_tfidf.ipynb ADDED
@@ -0,0 +1,862 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "2023-07-07 17:13:01.457105: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
13
+ "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
14
+ ]
15
+ }
16
+ ],
17
+ "source": [
18
+ "import pandas as pd\n",
19
+ "import gensim\n",
20
+ "import pprint\n",
21
+ "from gensim import corpora\n",
22
+ "from gensim.utils import simple_preprocess\n",
23
+ "from gensim.models import TfidfModel\n",
24
+ "from gensim.parsing import strip_tags, strip_numeric, \\\n",
25
+ " strip_multiple_whitespaces, stem_text, strip_punctuation, \\\n",
26
+ " remove_stopwords, preprocess_string\n",
27
+ "import re\n",
28
+ "import os\n",
29
+ "\n",
30
+ "from typing import List\n",
31
+ "import spacy"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "code",
36
+ "execution_count": 5,
37
+ "metadata": {},
38
+ "outputs": [],
39
+ "source": [
40
+ "transform_to_lower = lambda s: s.lower()\n",
41
+ "remove_single_char = lambda s: re.sub(r'\\s+\\w{1}\\s+', '', s)\n",
42
+ "\n",
43
+ "cleaning_filters = [\n",
44
+ " strip_tags,\n",
45
+ " strip_numeric,\n",
46
+ " strip_punctuation, \n",
47
+ " strip_multiple_whitespaces, \n",
48
+ " transform_to_lower,\n",
49
+ " remove_stopwords,\n",
50
+ " remove_single_char\n",
51
+ "]"
52
+ ]
53
+ },
54
+ {
55
+ "cell_type": "code",
56
+ "execution_count": 95,
57
+ "metadata": {},
58
+ "outputs": [],
59
+ "source": [
60
+ "df = pd.read_parquet(\"/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/reduced_arxiv_papers.parquet.gzip\")"
61
+ ]
62
+ },
63
+ {
64
+ "cell_type": "code",
65
+ "execution_count": 94,
66
+ "metadata": {},
67
+ "outputs": [
68
+ {
69
+ "data": {
70
+ "text/plain": [
71
+ "638707"
72
+ ]
73
+ },
74
+ "execution_count": 94,
75
+ "metadata": {},
76
+ "output_type": "execute_result"
77
+ }
78
+ ],
79
+ "source": [
80
+ "int(df.shape[0] * 0.75) "
81
+ ]
82
+ },
83
+ {
84
+ "cell_type": "code",
85
+ "execution_count": null,
86
+ "metadata": {},
87
+ "outputs": [],
88
+ "source": []
89
+ },
90
+ {
91
+ "cell_type": "code",
92
+ "execution_count": 73,
93
+ "metadata": {},
94
+ "outputs": [],
95
+ "source": [
96
+ "df = pd.read_parquet(\"/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/reduced_arxiv_papers.parquet.gzip\").sample().reset_index(drop=True)"
97
+ ]
98
+ },
99
+ {
100
+ "cell_type": "code",
101
+ "execution_count": 74,
102
+ "metadata": {},
103
+ "outputs": [
104
+ {
105
+ "data": {
106
+ "text/html": [
107
+ "<div>\n",
108
+ "<style scoped>\n",
109
+ " .dataframe tbody tr th:only-of-type {\n",
110
+ " vertical-align: middle;\n",
111
+ " }\n",
112
+ "\n",
113
+ " .dataframe tbody tr th {\n",
114
+ " vertical-align: top;\n",
115
+ " }\n",
116
+ "\n",
117
+ " .dataframe thead th {\n",
118
+ " text-align: right;\n",
119
+ " }\n",
120
+ "</style>\n",
121
+ "<table border=\"1\" class=\"dataframe\">\n",
122
+ " <thead>\n",
123
+ " <tr style=\"text-align: right;\">\n",
124
+ " <th></th>\n",
125
+ " <th>id</th>\n",
126
+ " <th>submitter</th>\n",
127
+ " <th>authors</th>\n",
128
+ " <th>title</th>\n",
129
+ " <th>comments</th>\n",
130
+ " <th>journal-ref</th>\n",
131
+ " <th>doi</th>\n",
132
+ " <th>report-no</th>\n",
133
+ " <th>categories</th>\n",
134
+ " <th>license</th>\n",
135
+ " <th>abstract</th>\n",
136
+ " <th>versions</th>\n",
137
+ " <th>update_date</th>\n",
138
+ " <th>authors_parsed</th>\n",
139
+ " <th>cleaned_abstracts</th>\n",
140
+ " <th>len_abstract</th>\n",
141
+ " </tr>\n",
142
+ " </thead>\n",
143
+ " <tbody>\n",
144
+ " <tr>\n",
145
+ " <th>0</th>\n",
146
+ " <td>2007.00905</td>\n",
147
+ " <td>Song Qingheng</td>\n",
148
+ " <td>Qingheng Song, Yong Zeng, Jie Xu, and Shi Jin</td>\n",
149
+ " <td>A Survey of Prototype and Experiment for UAV C...</td>\n",
150
+ " <td>24 pages, 6 figures</td>\n",
151
+ " <td>None</td>\n",
152
+ " <td>None</td>\n",
153
+ " <td>None</td>\n",
154
+ " <td>cs.IT eess.SP math.IT</td>\n",
155
+ " <td>http://creativecommons.org/licenses/by-nc-sa/4.0/</td>\n",
156
+ " <td>Unmanned aerial vehicle (UAV) communications...</td>\n",
157
+ " <td>[{'created': 'Thu, 2 Jul 2020 06:26:20 GMT', '...</td>\n",
158
+ " <td>2020-07-03</td>\n",
159
+ " <td>[[Song, Qingheng, ], [Zeng, Yong, ], [Xu, Jie,...</td>\n",
160
+ " <td>unmanned aerial vehicle uav communication a...</td>\n",
161
+ " <td>865</td>\n",
162
+ " </tr>\n",
163
+ " <tr>\n",
164
+ " <th>1</th>\n",
165
+ " <td>2102.04209</td>\n",
166
+ " <td>Michael Stuart</td>\n",
167
+ " <td>Michael T. Stuart and Markus Kneer</td>\n",
168
+ " <td>Guilty Artificial Minds</td>\n",
169
+ " <td>20 pages, 4 figures, 1 table</td>\n",
170
+ " <td>None</td>\n",
171
+ " <td>None</td>\n",
172
+ " <td>None</td>\n",
173
+ " <td>cs.CY cs.AI cs.HC</td>\n",
174
+ " <td>http://creativecommons.org/licenses/by/4.0/</td>\n",
175
+ " <td>The concepts of blameworthiness and wrongnes...</td>\n",
176
+ " <td>[{'created': 'Sun, 24 Jan 2021 21:37:35 GMT', ...</td>\n",
177
+ " <td>2021-02-09</td>\n",
178
+ " <td>[[Stuart, Michael T., ], [Kneer, Markus, ]]</td>\n",
179
+ " <td>concept blameworthiness wrongness fundament...</td>\n",
180
+ " <td>739</td>\n",
181
+ " </tr>\n",
182
+ " <tr>\n",
183
+ " <th>2</th>\n",
184
+ " <td>1201.5796</td>\n",
185
+ " <td>Denis Jerome</td>\n",
186
+ " <td>Denis Jerome</td>\n",
187
+ " <td>Organic Superconductors: when correlations and...</td>\n",
188
+ " <td>41 pages, 21 figures to be published in Journa...</td>\n",
189
+ " <td>None</td>\n",
190
+ " <td>10.1007/s10948-012-1475-7</td>\n",
191
+ " <td>None</td>\n",
192
+ " <td>cond-mat.supr-con</td>\n",
193
+ " <td>http://arxiv.org/licenses/nonexclusive-distrib...</td>\n",
194
+ " <td>This survey provides a brief account for the...</td>\n",
195
+ " <td>[{'created': 'Fri, 27 Jan 2012 15:24:46 GMT', ...</td>\n",
196
+ " <td>2012-02-21</td>\n",
197
+ " <td>[[Jerome, Denis, ]]</td>\n",
198
+ " <td>survey provide brief account start organic ...</td>\n",
199
+ " <td>649</td>\n",
200
+ " </tr>\n",
201
+ " <tr>\n",
202
+ " <th>3</th>\n",
203
+ " <td>1511.03076</td>\n",
204
+ " <td>Emma Platts Miss</td>\n",
205
+ " <td>George F.R. Ellis, Emma Platts, David Sloan an...</td>\n",
206
+ " <td>Current observations with a decaying cosmologi...</td>\n",
207
+ " <td>23 pages, 11 figures</td>\n",
208
+ " <td>None</td>\n",
209
+ " <td>10.1088/1475-7516/2016/04/026</td>\n",
210
+ " <td>None</td>\n",
211
+ " <td>astro-ph.CO gr-qc hep-th</td>\n",
212
+ " <td>http://arxiv.org/licenses/nonexclusive-distrib...</td>\n",
213
+ " <td>We use the phase plane analysis technique of...</td>\n",
214
+ " <td>[{'created': 'Tue, 10 Nov 2015 12:08:23 GMT', ...</td>\n",
215
+ " <td>2016-04-27</td>\n",
216
+ " <td>[[Ellis, George F. R., ], [Platts, Emma, ], [S...</td>\n",
217
+ " <td>use phase plane analysis technique madsen e...</td>\n",
218
+ " <td>554</td>\n",
219
+ " </tr>\n",
220
+ " <tr>\n",
221
+ " <th>4</th>\n",
222
+ " <td>1710.02954</td>\n",
223
+ " <td>Kirk Bansak</td>\n",
224
+ " <td>Kirk Bansak</td>\n",
225
+ " <td>Estimating Causal Moderation Effects with Rand...</td>\n",
226
+ " <td>Forthcoming, Journal of the Royal Statistical ...</td>\n",
227
+ " <td>None</td>\n",
228
+ " <td>None</td>\n",
229
+ " <td>None</td>\n",
230
+ " <td>stat.ME</td>\n",
231
+ " <td>http://arxiv.org/licenses/nonexclusive-distrib...</td>\n",
232
+ " <td>Researchers are often interested in analyzin...</td>\n",
233
+ " <td>[{'created': 'Mon, 9 Oct 2017 06:34:01 GMT', '...</td>\n",
234
+ " <td>2020-08-25</td>\n",
235
+ " <td>[[Bansak, Kirk, ]]</td>\n",
236
+ " <td>researcher interested analyze conditional t...</td>\n",
237
+ " <td>799</td>\n",
238
+ " </tr>\n",
239
+ " <tr>\n",
240
+ " <th>...</th>\n",
241
+ " <td>...</td>\n",
242
+ " <td>...</td>\n",
243
+ " <td>...</td>\n",
244
+ " <td>...</td>\n",
245
+ " <td>...</td>\n",
246
+ " <td>...</td>\n",
247
+ " <td>...</td>\n",
248
+ " <td>...</td>\n",
249
+ " <td>...</td>\n",
250
+ " <td>...</td>\n",
251
+ " <td>...</td>\n",
252
+ " <td>...</td>\n",
253
+ " <td>...</td>\n",
254
+ " <td>...</td>\n",
255
+ " <td>...</td>\n",
256
+ " <td>...</td>\n",
257
+ " </tr>\n",
258
+ " <tr>\n",
259
+ " <th>851605</th>\n",
260
+ " <td>1301.0707</td>\n",
261
+ " <td>Sebastian Klein</td>\n",
262
+ " <td>Sebastian Klein</td>\n",
263
+ " <td>Chow groups of tensor triangulated categories</td>\n",
264
+ " <td>40 pages. The presentation of the article has ...</td>\n",
265
+ " <td>None</td>\n",
266
+ " <td>None</td>\n",
267
+ " <td>None</td>\n",
268
+ " <td>math.AG math.CT math.RT</td>\n",
269
+ " <td>http://arxiv.org/licenses/nonexclusive-distrib...</td>\n",
270
+ " <td>We recall P. Balmer's definition of tensor t...</td>\n",
271
+ " <td>[{'created': 'Fri, 4 Jan 2013 11:06:40 GMT', '...</td>\n",
272
+ " <td>2015-10-02</td>\n",
273
+ " <td>[[Klein, Sebastian, ]]</td>\n",
274
+ " <td>recall p. balmer definition tensor triangul...</td>\n",
275
+ " <td>787</td>\n",
276
+ " </tr>\n",
277
+ " <tr>\n",
278
+ " <th>851606</th>\n",
279
+ " <td>1707.00341</td>\n",
280
+ " <td>Giorgos Anastasiou</td>\n",
281
+ " <td>Giorgos Anastasiou, Rodrigo Olea, David Rivera...</td>\n",
282
+ " <td>Noether-Wald energy in Critical Gravity</td>\n",
283
+ " <td>7 pages, no figures, Final version for PLB</td>\n",
284
+ " <td>None</td>\n",
285
+ " <td>10.1016/j.physletb.2018.11.021</td>\n",
286
+ " <td>None</td>\n",
287
+ " <td>hep-th gr-qc</td>\n",
288
+ " <td>http://arxiv.org/licenses/nonexclusive-distrib...</td>\n",
289
+ " <td>Criticality represents a specific point in t...</td>\n",
290
+ " <td>[{'created': 'Sun, 2 Jul 2017 19:52:32 GMT', '...</td>\n",
291
+ " <td>2018-11-21</td>\n",
292
+ " <td>[[Anastasiou, Giorgos, ], [Olea, Rodrigo, ], [...</td>\n",
293
+ " <td>criticality represent specific point parame...</td>\n",
294
+ " <td>631</td>\n",
295
+ " </tr>\n",
296
+ " <tr>\n",
297
+ " <th>851607</th>\n",
298
+ " <td>1610.08526</td>\n",
299
+ " <td>Blagoje Oblak</td>\n",
300
+ " <td>Blagoje Oblak</td>\n",
301
+ " <td>BMS Particles in Three Dimensions</td>\n",
302
+ " <td>437 pages (including index), 33 figures. Appen...</td>\n",
303
+ " <td>None</td>\n",
304
+ " <td>10.1007/978-3-319-61878-4</td>\n",
305
+ " <td>None</td>\n",
306
+ " <td>hep-th gr-qc math-ph math.GR math.MP math.RT</td>\n",
307
+ " <td>http://arxiv.org/licenses/nonexclusive-distrib...</td>\n",
308
+ " <td>This thesis is devoted to the group-theoreti...</td>\n",
309
+ " <td>[{'created': 'Wed, 26 Oct 2016 20:00:16 GMT', ...</td>\n",
310
+ " <td>2018-01-29</td>\n",
311
+ " <td>[[Oblak, Blagoje, ]]</td>\n",
312
+ " <td>thesis devoted group theoretic aspect dimen...</td>\n",
313
+ " <td>542</td>\n",
314
+ " </tr>\n",
315
+ " <tr>\n",
316
+ " <th>851608</th>\n",
317
+ " <td>1211.6629</td>\n",
318
+ " <td>Philippe Joyez</td>\n",
319
+ " <td>Philippe Joyez</td>\n",
320
+ " <td>Self-consistent dynamics of a Josephson juncti...</td>\n",
321
+ " <td>7 pages, 1 figure</td>\n",
322
+ " <td>None</td>\n",
323
+ " <td>10.1103/PhysRevLett.110.217003</td>\n",
324
+ " <td>None</td>\n",
325
+ " <td>cond-mat.supr-con cond-mat.mes-hall</td>\n",
326
+ " <td>http://arxiv.org/licenses/nonexclusive-distrib...</td>\n",
327
+ " <td>We derive microscopically the dynamics assoc...</td>\n",
328
+ " <td>[{'created': 'Tue, 27 Nov 2012 17:29:04 GMT', ...</td>\n",
329
+ " <td>2013-05-29</td>\n",
330
+ " <td>[[Joyez, Philippe, ]]</td>\n",
331
+ " <td>derive microscopically dynamic associate d....</td>\n",
332
+ " <td>558</td>\n",
333
+ " </tr>\n",
334
+ " <tr>\n",
335
+ " <th>851609</th>\n",
336
+ " <td>0705.2878</td>\n",
337
+ " <td>Benoit Perthame</td>\n",
338
+ " <td>Benoit Perthame (DMA), Panagiotis E. Souganidis</td>\n",
339
+ " <td>Asymmetric potentials and motor effect: a larg...</td>\n",
340
+ " <td>None</td>\n",
341
+ " <td>None</td>\n",
342
+ " <td>None</td>\n",
343
+ " <td>None</td>\n",
344
+ " <td>math.AP</td>\n",
345
+ " <td>None</td>\n",
346
+ " <td>We provide a mathematical analysis of appear...</td>\n",
347
+ " <td>[{'created': 'Sun, 20 May 2007 17:43:39 GMT', ...</td>\n",
348
+ " <td>2007-05-23</td>\n",
349
+ " <td>[[Perthame, Benoit, , DMA], [Souganidis, Panag...</td>\n",
350
+ " <td>provide mathematical analysis appearance co...</td>\n",
351
+ " <td>518</td>\n",
352
+ " </tr>\n",
353
+ " </tbody>\n",
354
+ "</table>\n",
355
+ "<p>851610 rows × 16 columns</p>\n",
356
+ "</div>"
357
+ ],
358
+ "text/plain": [
359
+ " id submitter \n",
360
+ "0 2007.00905 Song Qingheng \\\n",
361
+ "1 2102.04209 Michael Stuart \n",
362
+ "2 1201.5796 Denis Jerome \n",
363
+ "3 1511.03076 Emma Platts Miss \n",
364
+ "4 1710.02954 Kirk Bansak \n",
365
+ "... ... ... \n",
366
+ "851605 1301.0707 Sebastian Klein \n",
367
+ "851606 1707.00341 Giorgos Anastasiou \n",
368
+ "851607 1610.08526 Blagoje Oblak \n",
369
+ "851608 1211.6629 Philippe Joyez \n",
370
+ "851609 0705.2878 Benoit Perthame \n",
371
+ "\n",
372
+ " authors \n",
373
+ "0 Qingheng Song, Yong Zeng, Jie Xu, and Shi Jin \\\n",
374
+ "1 Michael T. Stuart and Markus Kneer \n",
375
+ "2 Denis Jerome \n",
376
+ "3 George F.R. Ellis, Emma Platts, David Sloan an... \n",
377
+ "4 Kirk Bansak \n",
378
+ "... ... \n",
379
+ "851605 Sebastian Klein \n",
380
+ "851606 Giorgos Anastasiou, Rodrigo Olea, David Rivera... \n",
381
+ "851607 Blagoje Oblak \n",
382
+ "851608 Philippe Joyez \n",
383
+ "851609 Benoit Perthame (DMA), Panagiotis E. Souganidis \n",
384
+ "\n",
385
+ " title \n",
386
+ "0 A Survey of Prototype and Experiment for UAV C... \\\n",
387
+ "1 Guilty Artificial Minds \n",
388
+ "2 Organic Superconductors: when correlations and... \n",
389
+ "3 Current observations with a decaying cosmologi... \n",
390
+ "4 Estimating Causal Moderation Effects with Rand... \n",
391
+ "... ... \n",
392
+ "851605 Chow groups of tensor triangulated categories \n",
393
+ "851606 Noether-Wald energy in Critical Gravity \n",
394
+ "851607 BMS Particles in Three Dimensions \n",
395
+ "851608 Self-consistent dynamics of a Josephson juncti... \n",
396
+ "851609 Asymmetric potentials and motor effect: a larg... \n",
397
+ "\n",
398
+ " comments journal-ref \n",
399
+ "0 24 pages, 6 figures None \\\n",
400
+ "1 20 pages, 4 figures, 1 table None \n",
401
+ "2 41 pages, 21 figures to be published in Journa... None \n",
402
+ "3 23 pages, 11 figures None \n",
403
+ "4 Forthcoming, Journal of the Royal Statistical ... None \n",
404
+ "... ... ... \n",
405
+ "851605 40 pages. The presentation of the article has ... None \n",
406
+ "851606 7 pages, no figures, Final version for PLB None \n",
407
+ "851607 437 pages (including index), 33 figures. Appen... None \n",
408
+ "851608 7 pages, 1 figure None \n",
409
+ "851609 None None \n",
410
+ "\n",
411
+ " doi report-no \n",
412
+ "0 None None \\\n",
413
+ "1 None None \n",
414
+ "2 10.1007/s10948-012-1475-7 None \n",
415
+ "3 10.1088/1475-7516/2016/04/026 None \n",
416
+ "4 None None \n",
417
+ "... ... ... \n",
418
+ "851605 None None \n",
419
+ "851606 10.1016/j.physletb.2018.11.021 None \n",
420
+ "851607 10.1007/978-3-319-61878-4 None \n",
421
+ "851608 10.1103/PhysRevLett.110.217003 None \n",
422
+ "851609 None None \n",
423
+ "\n",
424
+ " categories \n",
425
+ "0 cs.IT eess.SP math.IT \\\n",
426
+ "1 cs.CY cs.AI cs.HC \n",
427
+ "2 cond-mat.supr-con \n",
428
+ "3 astro-ph.CO gr-qc hep-th \n",
429
+ "4 stat.ME \n",
430
+ "... ... \n",
431
+ "851605 math.AG math.CT math.RT \n",
432
+ "851606 hep-th gr-qc \n",
433
+ "851607 hep-th gr-qc math-ph math.GR math.MP math.RT \n",
434
+ "851608 cond-mat.supr-con cond-mat.mes-hall \n",
435
+ "851609 math.AP \n",
436
+ "\n",
437
+ " license \n",
438
+ "0 http://creativecommons.org/licenses/by-nc-sa/4.0/ \\\n",
439
+ "1 http://creativecommons.org/licenses/by/4.0/ \n",
440
+ "2 http://arxiv.org/licenses/nonexclusive-distrib... \n",
441
+ "3 http://arxiv.org/licenses/nonexclusive-distrib... \n",
442
+ "4 http://arxiv.org/licenses/nonexclusive-distrib... \n",
443
+ "... ... \n",
444
+ "851605 http://arxiv.org/licenses/nonexclusive-distrib... \n",
445
+ "851606 http://arxiv.org/licenses/nonexclusive-distrib... \n",
446
+ "851607 http://arxiv.org/licenses/nonexclusive-distrib... \n",
447
+ "851608 http://arxiv.org/licenses/nonexclusive-distrib... \n",
448
+ "851609 None \n",
449
+ "\n",
450
+ " abstract \n",
451
+ "0 Unmanned aerial vehicle (UAV) communications... \\\n",
452
+ "1 The concepts of blameworthiness and wrongnes... \n",
453
+ "2 This survey provides a brief account for the... \n",
454
+ "3 We use the phase plane analysis technique of... \n",
455
+ "4 Researchers are often interested in analyzin... \n",
456
+ "... ... \n",
457
+ "851605 We recall P. Balmer's definition of tensor t... \n",
458
+ "851606 Criticality represents a specific point in t... \n",
459
+ "851607 This thesis is devoted to the group-theoreti... \n",
460
+ "851608 We derive microscopically the dynamics assoc... \n",
461
+ "851609 We provide a mathematical analysis of appear... \n",
462
+ "\n",
463
+ " versions update_date \n",
464
+ "0 [{'created': 'Thu, 2 Jul 2020 06:26:20 GMT', '... 2020-07-03 \\\n",
465
+ "1 [{'created': 'Sun, 24 Jan 2021 21:37:35 GMT', ... 2021-02-09 \n",
466
+ "2 [{'created': 'Fri, 27 Jan 2012 15:24:46 GMT', ... 2012-02-21 \n",
467
+ "3 [{'created': 'Tue, 10 Nov 2015 12:08:23 GMT', ... 2016-04-27 \n",
468
+ "4 [{'created': 'Mon, 9 Oct 2017 06:34:01 GMT', '... 2020-08-25 \n",
469
+ "... ... ... \n",
470
+ "851605 [{'created': 'Fri, 4 Jan 2013 11:06:40 GMT', '... 2015-10-02 \n",
471
+ "851606 [{'created': 'Sun, 2 Jul 2017 19:52:32 GMT', '... 2018-11-21 \n",
472
+ "851607 [{'created': 'Wed, 26 Oct 2016 20:00:16 GMT', ... 2018-01-29 \n",
473
+ "851608 [{'created': 'Tue, 27 Nov 2012 17:29:04 GMT', ... 2013-05-29 \n",
474
+ "851609 [{'created': 'Sun, 20 May 2007 17:43:39 GMT', ... 2007-05-23 \n",
475
+ "\n",
476
+ " authors_parsed \n",
477
+ "0 [[Song, Qingheng, ], [Zeng, Yong, ], [Xu, Jie,... \\\n",
478
+ "1 [[Stuart, Michael T., ], [Kneer, Markus, ]] \n",
479
+ "2 [[Jerome, Denis, ]] \n",
480
+ "3 [[Ellis, George F. R., ], [Platts, Emma, ], [S... \n",
481
+ "4 [[Bansak, Kirk, ]] \n",
482
+ "... ... \n",
483
+ "851605 [[Klein, Sebastian, ]] \n",
484
+ "851606 [[Anastasiou, Giorgos, ], [Olea, Rodrigo, ], [... \n",
485
+ "851607 [[Oblak, Blagoje, ]] \n",
486
+ "851608 [[Joyez, Philippe, ]] \n",
487
+ "851609 [[Perthame, Benoit, , DMA], [Souganidis, Panag... \n",
488
+ "\n",
489
+ " cleaned_abstracts len_abstract \n",
490
+ "0 unmanned aerial vehicle uav communication a... 865 \n",
491
+ "1 concept blameworthiness wrongness fundament... 739 \n",
492
+ "2 survey provide brief account start organic ... 649 \n",
493
+ "3 use phase plane analysis technique madsen e... 554 \n",
494
+ "4 researcher interested analyze conditional t... 799 \n",
495
+ "... ... ... \n",
496
+ "851605 recall p. balmer definition tensor triangul... 787 \n",
497
+ "851606 criticality represent specific point parame... 631 \n",
498
+ "851607 thesis devoted group theoretic aspect dimen... 542 \n",
499
+ "851608 derive microscopically dynamic associate d.... 558 \n",
500
+ "851609 provide mathematical analysis appearance co... 518 \n",
501
+ "\n",
502
+ "[851610 rows x 16 columns]"
503
+ ]
504
+ },
505
+ "execution_count": 74,
506
+ "metadata": {},
507
+ "output_type": "execute_result"
508
+ }
509
+ ],
510
+ "source": [
511
+ "df"
512
+ ]
513
+ },
514
+ {
515
+ "cell_type": "code",
516
+ "execution_count": 75,
517
+ "metadata": {},
518
+ "outputs": [],
519
+ "source": [
520
+ "corpus = df['cleaned_abstracts'].to_list()\n"
521
+ ]
522
+ },
523
+ {
524
+ "cell_type": "code",
525
+ "execution_count": 76,
526
+ "metadata": {},
527
+ "outputs": [],
528
+ "source": [
529
+ "def gensim_tokenizer(docs: List[str]):\n",
530
+ " tokenized_docs = list()\n",
531
+ " for doc in docs:\n",
532
+ " processed_words = preprocess_string(doc, cleaning_filters)\n",
533
+ " tokenized_docs.append(processed_words)\n",
534
+ " \n",
535
+ " return tokenized_docs\n",
536
+ "\n",
537
+ "tokenized_corpus = gensim_tokenizer(corpus)"
538
+ ]
539
+ },
540
+ {
541
+ "cell_type": "code",
542
+ "execution_count": 77,
543
+ "metadata": {},
544
+ "outputs": [
545
+ {
546
+ "data": {
547
+ "text/plain": [
548
+ "851610"
549
+ ]
550
+ },
551
+ "execution_count": 77,
552
+ "metadata": {},
553
+ "output_type": "execute_result"
554
+ }
555
+ ],
556
+ "source": [
557
+ "len(tokenized_corpus)"
558
+ ]
559
+ },
560
+ {
561
+ "cell_type": "code",
562
+ "execution_count": 55,
563
+ "metadata": {},
564
+ "outputs": [],
565
+ "source": [
566
+ "def cleaning_pipe(document):\n",
567
+ " # Invoking gensim.parsing.preprocess_string method with set of filters\n",
568
+ " processed_words = preprocess_string(document, cleaning_filters)\n",
569
+ " return processed_words"
570
+ ]
571
+ },
572
+ {
573
+ "cell_type": "code",
574
+ "execution_count": 78,
575
+ "metadata": {},
576
+ "outputs": [
577
+ {
578
+ "name": "stdout",
579
+ "output_type": "stream",
580
+ "text": [
581
+ "/Users/luis.morales/Desktop/arxiv-paper-recommender/models\n",
582
+ "/Users/luis.morales/Desktop/arxiv-paper-recommender\n"
583
+ ]
584
+ },
585
+ {
586
+ "data": {
587
+ "text/plain": [
588
+ "False"
589
+ ]
590
+ },
591
+ "execution_count": 78,
592
+ "metadata": {},
593
+ "output_type": "execute_result"
594
+ }
595
+ ],
596
+ "source": [
597
+ "def validate_if_dictionary_exists(dictionaty_name: str) -> bool:\n",
598
+ " dicts_folder = \"models/nlp_dictionaries\"\n",
599
+ " current_dir = os.getcwd()\n",
600
+ " parent_dir = os.path.dirname(current_dir)\n",
601
+ " dict_path = f\"{parent_dir}/{dicts_folder}/{dictionaty_name}\"\n",
602
+ " print(current_dir)\n",
603
+ " print(parent_dir)\n",
604
+ " return os.path.isfile(dict_path)\n",
605
+ " \n",
606
+ "\n",
607
+ "validate_if_dictionary_exists('30ktokens.dict') "
608
+ ]
609
+ },
610
+ {
611
+ "cell_type": "code",
612
+ "execution_count": 79,
613
+ "metadata": {},
614
+ "outputs": [],
615
+ "source": [
616
+ "def get_gensim_dictionary(tokenized_docs: List[str], dict_name: str = \"corpus\", save_dict: bool = False):\n",
617
+ " \"\"\"\n",
618
+ " Create dictionary of words in preprocessed corpus and saves the dict object\n",
619
+ " \"\"\"\n",
620
+ " dictionary = corpora.Dictionary(tokenized_docs)\n",
621
+ " if save_dict: \n",
622
+ " dict_lenght = len(tokenized_corpus)\n",
623
+ " parent_folder = \"/Users/luis.morales/Desktop/arxiv-paper-recommender/models/dictionaries\"\n",
624
+ " #if validate_if_dictionary_exists('30ktokens.dict'):\n",
625
+ " dictionary.save(f'{parent_folder}/{dict_name}.dict')\n",
626
+ " \n",
627
+ " return dictionary\n",
628
+ "\n",
629
+ "dictionary = get_gensim_dictionary(tokenized_docs=tokenized_corpus, dict_name=\"TextualTango\", save_dict=True)"
630
+ ]
631
+ },
632
+ {
633
+ "cell_type": "code",
634
+ "execution_count": 31,
635
+ "metadata": {},
636
+ "outputs": [],
637
+ "source": [
638
+ "# def get_gensim_dictionary(tokenized_docs: List[str], dict_name: str = \"corpus\", save_dict: bool = False):\n",
639
+ "# \"\"\"\n",
640
+ "# Create dictionary of words in preprocessed corpus and saves the dict object\n",
641
+ "# \"\"\"\n",
642
+ "# dictionary = corpora.Dictionary(tokenized_docs)\n",
643
+ "# if save_dict: \n",
644
+ "# dict_lenght = len(tokenized_corpus)\n",
645
+ "# parent_folder = \"/Users/luis.morales/Desktop/arxiv-paper-recommender/models/nlp_dictionaries\"\n",
646
+ "# if validate_if_dictionary_exists('30ktokens.dict'):\n",
647
+ "# dictionary.save(f'{parent_folder}/{dict_name}.dict')\n",
648
+ " \n",
649
+ "# return dictionary\n",
650
+ "\n",
651
+ "# dictionary = get_gensim_dictionary(tokenized_docs=tokenized_corpus, dict_name=\"300Ktokens\", save_dict=True)"
652
+ ]
653
+ },
654
+ {
655
+ "cell_type": "code",
656
+ "execution_count": 80,
657
+ "metadata": {},
658
+ "outputs": [],
659
+ "source": [
660
+ "BoW_corpus = [dictionary.doc2bow(doc, allow_update=True) for doc in tokenized_corpus]"
661
+ ]
662
+ },
663
+ {
664
+ "cell_type": "code",
665
+ "execution_count": 81,
666
+ "metadata": {},
667
+ "outputs": [],
668
+ "source": [
669
+ "tfidf_model = TfidfModel(BoW_corpus)"
670
+ ]
671
+ },
672
+ {
673
+ "cell_type": "code",
674
+ "execution_count": 82,
675
+ "metadata": {},
676
+ "outputs": [],
677
+ "source": [
678
+ "tfidf_model.save(\"/Users/luis.morales/Desktop/arxiv-paper-recommender/models/tfidf/TextualTango.model\")"
679
+ ]
680
+ },
681
+ {
682
+ "cell_type": "markdown",
683
+ "metadata": {},
684
+ "source": [
685
+ "## Test model"
686
+ ]
687
+ },
688
+ {
689
+ "cell_type": "code",
690
+ "execution_count": 83,
691
+ "metadata": {},
692
+ "outputs": [],
693
+ "source": [
694
+ "# index the tfidf vector of corpus as sparse matrix\n",
695
+ "from gensim import similarities\n",
696
+ "index = similarities.SparseMatrixSimilarity(tfidf_model[BoW_corpus], num_features=len(dictionary))"
697
+ ]
698
+ },
699
+ {
700
+ "cell_type": "code",
701
+ "execution_count": 84,
702
+ "metadata": {},
703
+ "outputs": [],
704
+ "source": [
705
+ "index.save(\"/Users/luis.morales/Desktop/arxiv-paper-recommender/models/similarities_matrix/TextualTangoSimilarities/TextualTango\")"
706
+ ]
707
+ },
708
+ {
709
+ "cell_type": "code",
710
+ "execution_count": 67,
711
+ "metadata": {},
712
+ "outputs": [],
713
+ "source": [
714
+ "def get_closest_n(query, n):\n",
715
+ " '''get the top matching docs as per cosine similarity\n",
716
+ " between tfidf vector of query and all docs'''\n",
717
+ " query_document = cleaning_pipe(query)\n",
718
+ " query_bow = dictionary.doc2bow(query_document)\n",
719
+ " sims = index[tfidf_model[query_bow]]\n",
720
+ " top_idx = sims.argsort()[-1*n:][::-1]\n",
721
+ " return top_idx"
722
+ ]
723
+ },
724
+ {
725
+ "cell_type": "code",
726
+ "execution_count": 85,
727
+ "metadata": {},
728
+ "outputs": [],
729
+ "source": [
730
+ "def get_recomendations_metadata(query: str, n: int, df: pd.DataFrame):\n",
731
+ " recommendations_idxs = get_closest_n(query, n)\n",
732
+ " return df.iloc[recommendations_idxs].reset_index(drop=True)"
733
+ ]
734
+ },
735
+ {
736
+ "cell_type": "code",
737
+ "execution_count": null,
738
+ "metadata": {},
739
+ "outputs": [],
740
+ "source": []
741
+ },
742
+ {
743
+ "cell_type": "code",
744
+ "execution_count": 91,
745
+ "metadata": {},
746
+ "outputs": [
747
+ {
748
+ "name": "stdout",
749
+ "output_type": "stream",
750
+ "text": [
751
+ "User Request ---- : \n",
752
+ " Which papers discuss the use of statistical models and Bayesian inference for uncertainty quantification and risk assessment in engineering systems?\n",
753
+ "User Request ---- : \n",
754
+ " \n",
755
+ "Title: A framework for benchmarking uncertainty in deep regression\n",
756
+ "Abstract: We propose a framework for the assessment of uncertainty quantification in\n",
757
+ "deep regression. The framework is based on regression problems where the\n",
758
+ "regression function is a linear combination of nonlinear functions. Basically,\n",
759
+ "any level of complexity can be realized through the choice of the nonlinear\n",
760
+ "functions and the dimensionality of their domain. Results of an uncertainty\n",
761
+ "quantification for deep regression are compared against those obtained by a\n",
762
+ "statistical reference method. The reference method utilizes knowledge of the\n",
763
+ "underlying nonlinear functions and is based on a Bayesian linear regression\n",
764
+ "using a reference prior. Reliability of uncertainty quantification is assessed\n",
765
+ "in terms of coverage probabilities, and accuracy through the size of calculated\n",
766
+ "uncertainties. We illustrate the proposed framework by applying it to current\n",
767
+ "approaches for uncertainty quantification in deep regression. The flexibility,\n",
768
+ "together with the availability of a reference solution, makes the framework\n",
769
+ "suitable for defining benchmark sets for uncertainty quantification.\n",
770
+ "\n",
771
+ "\n",
772
+ "--------------------------\n",
773
+ "User Request ---- : \n",
774
+ " Which papers discuss the use of statistical models and Bayesian inference for uncertainty quantification and risk assessment in engineering systems?\n",
775
+ "User Request ---- : \n",
776
+ " \n",
777
+ "Title: Generative Parameter Sampler For Scalable Uncertainty Quantification\n",
778
+ "Abstract: Uncertainty quantification has been a core of the statistical machine\n",
779
+ "learning, but its computational bottleneck has been a serious challenge for\n",
780
+ "both Bayesians and frequentists. We propose a model-based framework in\n",
781
+ "quantifying uncertainty, called predictive-matching Generative Parameter\n",
782
+ "Sampler (GPS). This procedure considers an Uncertainty Quantification (UQ)\n",
783
+ "distribution on the targeted parameter, which matches the corresponding\n",
784
+ "predictive distribution to the observed data. This framework adopts a\n",
785
+ "hierarchical modeling perspective such that each observation is modeled by an\n",
786
+ "individual parameter. This individual parameterization permits the resulting\n",
787
+ "inference to be computationally scalable and robust to outliers. Our approach\n",
788
+ "is illustrated for linear models, Poisson processes, and deep neural networks\n",
789
+ "for classification. The results show that the GPS is successful in providing\n",
790
+ "uncertainty quantification as well as additional flexibility beyond what is\n",
791
+ "allowed by classical statistical procedures under the postulated statistical\n",
792
+ "models.\n",
793
+ "\n",
794
+ "\n",
795
+ "--------------------------\n",
796
+ "User Request ---- : \n",
797
+ " Which papers discuss the use of statistical models and Bayesian inference for uncertainty quantification and risk assessment in engineering systems?\n",
798
+ "User Request ---- : \n",
799
+ " \n",
800
+ "Title: Recent Advances in Uncertainty Quantification Methods for Engineering\n",
801
+ " Problems\n",
802
+ "Abstract: In the last few decades, uncertainty quantification (UQ) methods have been\n",
803
+ "used widely to ensure the robustness of engineering designs. This chapter aims\n",
804
+ "to detail recent advances in popular uncertainty quantification methods used in\n",
805
+ "engineering applications. This chapter describes the two most popular\n",
806
+ "meta-modeling methods for uncertainty quantification suitable for engineering\n",
807
+ "applications (Polynomial Chaos Method and Gaussian Process). Further, the UQ\n",
808
+ "methods are applied to an engineering test problem under multiple\n",
809
+ "uncertainties. The test problem considered here is a supersonic nozzle under\n",
810
+ "operational uncertainties. For the deterministic solution, an open-source\n",
811
+ "computational fluid dynamics (CFD) solver SU2 is used. The UQ methods are\n",
812
+ "developed in Matlab and are further combined with SU2 for the uncertainty and\n",
813
+ "sensitivity estimates. The results are presented in terms of the mean and\n",
814
+ "standard deviation of the output quantities.\n",
815
+ "\n",
816
+ "\n",
817
+ "--------------------------\n"
818
+ ]
819
+ }
820
+ ],
821
+ "source": [
822
+ "_input = \"Which papers discuss the use of statistical models and Bayesian inference for uncertainty quantification and risk assessment in engineering systems?\"\n",
823
+ "results_df = get_recomendations_metadata(query=_input, df=df, n=3)\n",
824
+ "\n",
825
+ "\n",
826
+ "for abstract in list(zip(results_df['abstract'].to_list(), results_df['title'].to_list())):\n",
827
+ " print(f\"User Request ---- : \\n {_input}\")\n",
828
+ " print(f\"User Request ---- : \\n \")\n",
829
+ " print(f\"Title: {abstract[1]}\")\n",
830
+ " print(f\"Abstract: {abstract[0]}\\n\")\n",
831
+ " print(f\"--------------------------\")"
832
+ ]
833
+ }
834
+ ],
835
+ "metadata": {
836
+ "kernelspec": {
837
+ "display_name": "Python 3.11.4 ('arxiv-env': venv)",
838
+ "language": "python",
839
+ "name": "python3"
840
+ },
841
+ "language_info": {
842
+ "codemirror_mode": {
843
+ "name": "ipython",
844
+ "version": 3
845
+ },
846
+ "file_extension": ".py",
847
+ "mimetype": "text/x-python",
848
+ "name": "python",
849
+ "nbconvert_exporter": "python",
850
+ "pygments_lexer": "ipython3",
851
+ "version": "3.11.4"
852
+ },
853
+ "orig_nbformat": 4,
854
+ "vscode": {
855
+ "interpreter": {
856
+ "hash": "aae17c2ae2f38cc6f211be9b71a2aa280701d8462782cbc1f67caa83a1603363"
857
+ }
858
+ }
859
+ },
860
+ "nbformat": 4,
861
+ "nbformat_minor": 2
862
+ }