{ "cells": [ { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 13 µs, sys: 0 ns, total: 13 µs\n", "Wall time: 20.3 µs\n" ] } ], "source": [ "%%time\n", "import json\n", "from pathlib import Path\n", "\n", "import pandas as pd\n", "import pyarrow as pa\n", "import pyarrow.parquet as pq\n", "from tqdm import tqdm" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2268252it [04:21, 8672.99it/s] \n" ] } ], "source": [ "# Raw arXiv metadata snapshot; each line of the file is parsed as its own JSON document.\n", "# The path is relative to the kernel's working directory (the notebook's folder by default)\n", "# rather than an absolute path that exists only on one machine.\n", "ARXIV_SNAPSHOT_PATH = Path(\"../data/raw/arxiv-metadata-oai-snapshot.json\")\n", "\n", "# Accumulate one parsed record per line; the DataFrame is built once at the end.\n", "arxiv_data = []\n", "\n", "# State the encoding explicitly: open() otherwise falls back to a platform-dependent default.\n", "with open(ARXIV_SNAPSHOT_PATH, \"r\", encoding=\"utf-8\") as file:\n", "    for line in tqdm(file):\n", "        # Skip blank lines (e.g. a trailing newline) instead of failing inside json.loads.\n", "        if line.strip():\n", "            arxiv_data.append(json.loads(line))\n", "\n", "# Build the DataFrame from the list of parsed records in a single call.\n", "df = pd.DataFrame.from_records(arxiv_data)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | id | \n", "submitter | \n", "authors | \n", "title | \n", "comments | \n", "journal-ref | \n", "doi | \n", "report-no | \n", "categories | \n", "license | \n", "abstract | \n", "versions | \n", "update_date | \n", "authors_parsed | \n", "
0 | \n", "0704.0001 | \n", "Pavel Nadolsky | \n", "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-... | \n", "Calculation of prompt diphoton production cros... | \n", "37 pages, 15 figures; published version | \n", "Phys.Rev.D76:013009,2007 | \n", "10.1103/PhysRevD.76.013009 | \n", "ANL-HEP-PR-07-12 | \n", "hep-ph | \n", "None | \n", "A fully differential calculation in perturba... | \n", "[{'version': 'v1', 'created': 'Mon, 2 Apr 2007... | \n", "2008-11-26 | \n", "[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,... | \n", "
1 | \n", "0704.0002 | \n", "Louis Theran | \n", "Ileana Streinu and Louis Theran | \n", "Sparsity-certifying Graph Decompositions | \n", "To appear in Graphs and Combinatorics | \n", "None | \n", "None | \n", "None | \n", "math.CO cs.CG | \n", "http://arxiv.org/licenses/nonexclusive-distrib... | \n", "We describe a new algorithm, the $(k,\\ell)$-... | \n", "[{'version': 'v1', 'created': 'Sat, 31 Mar 200... | \n", "2008-12-13 | \n", "[[Streinu, Ileana, ], [Theran, Louis, ]] | \n", "
2 | \n", "0704.0003 | \n", "Hongjun Pan | \n", "Hongjun Pan | \n", "The evolution of the Earth-Moon system based o... | \n", "23 pages, 3 figures | \n", "None | \n", "None | \n", "None | \n", "physics.gen-ph | \n", "None | \n", "The evolution of Earth-Moon system is descri... | \n", "[{'version': 'v1', 'created': 'Sun, 1 Apr 2007... | \n", "2008-01-13 | \n", "[[Pan, Hongjun, ]] | \n", "
3 | \n", "0704.0004 | \n", "David Callan | \n", "David Callan | \n", "A determinant of Stirling cycle numbers counts... | \n", "11 pages | \n", "None | \n", "None | \n", "None | \n", "math.CO | \n", "None | \n", "We show that a determinant of Stirling cycle... | \n", "[{'version': 'v1', 'created': 'Sat, 31 Mar 200... | \n", "2007-05-23 | \n", "[[Callan, David, ]] | \n", "
4 | \n", "0704.0005 | \n", "Alberto Torchinsky | \n", "Wael Abu-Shammala and Alberto Torchinsky | \n", "From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a... | \n", "None | \n", "Illinois J. Math. 52 (2008) no.2, 681-689 | \n", "None | \n", "None | \n", "math.CA math.FA | \n", "None | \n", "In this paper we show how to compute the $\\L... | \n", "[{'version': 'v1', 'created': 'Mon, 2 Apr 2007... | \n", "2013-10-15 | \n", "[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]] | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
2268247 | \n", "supr-con/9608008 | \n", "Ruslan Prozorov | \n", "R. Prozorov, M. Konczykowski, B. Schmidt, Y. Y... | \n", "On the origin of the irreversibility line in t... | \n", "19 pages, LaTex, 6 PostScript figures; Author'... | \n", "None | \n", "10.1103/PhysRevB.54.15530 | \n", "None | \n", "supr-con cond-mat.supr-con | \n", "None | \n", "We report on measurements of the angular dep... | \n", "[{'version': 'v1', 'created': 'Mon, 26 Aug 199... | \n", "2009-10-30 | \n", "[[Prozorov, R., ], [Konczykowski, M., ], [Schm... | \n", "
2268248 | \n", "supr-con/9609001 | \n", "Durga P. Choudhury | \n", "Durga P. Choudhury, Balam A. Willemsen, John S... | \n", "Nonlinear Response of HTSC Thin Film Microwave... | \n", "4 pages, LaTeX type, Uses IEEE style files, 60... | \n", "None | \n", "10.1109/77.620744 | \n", "None | \n", "supr-con cond-mat.supr-con | \n", "None | \n", "The non-linear microwave surface impedance o... | \n", "[{'version': 'v1', 'created': 'Sat, 31 Aug 199... | \n", "2016-11-18 | \n", "[[Choudhury, Durga P., , Physics Department, N... | \n", "
2268249 | \n", "supr-con/9609002 | \n", "Durga P. Choudhury | \n", "Balam A. Willemsen, J. S. Derov and S.Sridhar ... | \n", "Critical State Flux Penetration and Linear Mic... | \n", "20 pages, LaTeX type, Uses REVTeX style files,... | \n", "None | \n", "10.1103/PhysRevB.56.11989 | \n", "None | \n", "supr-con cond-mat.supr-con | \n", "None | \n", "The vortex contribution to the dc field (H) ... | \n", "[{'version': 'v1', 'created': 'Tue, 3 Sep 1996... | \n", "2009-10-30 | \n", "[[Willemsen, Balam A., , Physics Department,\\n... | \n", "
2268250 | \n", "supr-con/9609003 | \n", "Hasegawa Yasumasa | \n", "Yasumasa Hasegawa (Himeji Institute of Technol... | \n", "Density of States and NMR Relaxation Rate in A... | \n", "7 pages, 4 PostScript Figures, LaTeX, to appea... | \n", "None | \n", "10.1143/JPSJ.65.3131 | \n", "None | \n", "supr-con cond-mat.supr-con | \n", "None | \n", "We show that the density of states in an ani... | \n", "[{'version': 'v1', 'created': 'Wed, 18 Sep 199... | \n", "2009-10-30 | \n", "[[Hasegawa, Yasumasa, , Himeji Institute of Te... | \n", "
2268251 | \n", "supr-con/9609004 | \n", "Masanori Ichioka | \n", "Naoki Enomoto, Masanori Ichioka and Kazushige ... | \n", "Ginzburg Landau theory for d-wave pairing and ... | \n", "12 pages including 8 eps figs, LaTeX with jpsj... | \n", "J. Phys. Soc. Jpn. 66, 204 (1997). | \n", "10.1143/JPSJ.66.204 | \n", "None | \n", "supr-con cond-mat.supr-con | \n", "None | \n", "The Ginzburg Landau theory for d_{x^2-y^2}-w... | \n", "[{'version': 'v1', 'created': 'Wed, 25 Sep 199... | \n", "2009-10-30 | \n", "[[Enomoto, Naoki, , Okayama Univ.], [Ichioka, ... | \n", "
2268252 rows × 14 columns
\n", "\n", " | id | \n", "submitter | \n", "authors | \n", "title | \n", "comments | \n", "journal-ref | \n", "doi | \n", "report-no | \n", "categories | \n", "license | \n", "abstract | \n", "versions | \n", "update_date | \n", "authors_parsed | \n", "
0 | \n", "0704.0001 | \n", "Pavel Nadolsky | \n", "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-... | \n", "Calculation of prompt diphoton production cros... | \n", "37 pages, 15 figures; published version | \n", "Phys.Rev.D76:013009,2007 | \n", "10.1103/PhysRevD.76.013009 | \n", "ANL-HEP-PR-07-12 | \n", "hep-ph | \n", "None | \n", "A fully differential calculation in perturba... | \n", "[{'version': 'v1', 'created': 'Mon, 2 Apr 2007... | \n", "2008-11-26 | \n", "[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,... | \n", "
1 | \n", "0704.0002 | \n", "Louis Theran | \n", "Ileana Streinu and Louis Theran | \n", "Sparsity-certifying Graph Decompositions | \n", "To appear in Graphs and Combinatorics | \n", "None | \n", "None | \n", "None | \n", "math.CO cs.CG | \n", "http://arxiv.org/licenses/nonexclusive-distrib... | \n", "We describe a new algorithm, the $(k,\\ell)$-... | \n", "[{'version': 'v1', 'created': 'Sat, 31 Mar 200... | \n", "2008-12-13 | \n", "[[Streinu, Ileana, ], [Theran, Louis, ]] | \n", "
2 | \n", "0704.0003 | \n", "Hongjun Pan | \n", "Hongjun Pan | \n", "The evolution of the Earth-Moon system based o... | \n", "23 pages, 3 figures | \n", "None | \n", "None | \n", "None | \n", "physics.gen-ph | \n", "None | \n", "The evolution of Earth-Moon system is descri... | \n", "[{'version': 'v1', 'created': 'Sun, 1 Apr 2007... | \n", "2008-01-13 | \n", "[[Pan, Hongjun, ]] | \n", "
3 | \n", "0704.0004 | \n", "David Callan | \n", "David Callan | \n", "A determinant of Stirling cycle numbers counts... | \n", "11 pages | \n", "None | \n", "None | \n", "None | \n", "math.CO | \n", "None | \n", "We show that a determinant of Stirling cycle... | \n", "[{'version': 'v1', 'created': 'Sat, 31 Mar 200... | \n", "2007-05-23 | \n", "[[Callan, David, ]] | \n", "
4 | \n", "0704.0005 | \n", "Alberto Torchinsky | \n", "Wael Abu-Shammala and Alberto Torchinsky | \n", "From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a... | \n", "None | \n", "Illinois J. Math. 52 (2008) no.2, 681-689 | \n", "None | \n", "None | \n", "math.CA math.FA | \n", "None | \n", "In this paper we show how to compute the $\\L... | \n", "[{'version': 'v1', 'created': 'Mon, 2 Apr 2007... | \n", "2013-10-15 | \n", "[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]] | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
2268247 | \n", "supr-con/9608008 | \n", "Ruslan Prozorov | \n", "R. Prozorov, M. Konczykowski, B. Schmidt, Y. Y... | \n", "On the origin of the irreversibility line in t... | \n", "19 pages, LaTex, 6 PostScript figures; Author'... | \n", "None | \n", "10.1103/PhysRevB.54.15530 | \n", "None | \n", "supr-con cond-mat.supr-con | \n", "None | \n", "We report on measurements of the angular dep... | \n", "[{'version': 'v1', 'created': 'Mon, 26 Aug 199... | \n", "2009-10-30 | \n", "[[Prozorov, R., ], [Konczykowski, M., ], [Schm... | \n", "
2268248 | \n", "supr-con/9609001 | \n", "Durga P. Choudhury | \n", "Durga P. Choudhury, Balam A. Willemsen, John S... | \n", "Nonlinear Response of HTSC Thin Film Microwave... | \n", "4 pages, LaTeX type, Uses IEEE style files, 60... | \n", "None | \n", "10.1109/77.620744 | \n", "None | \n", "supr-con cond-mat.supr-con | \n", "None | \n", "The non-linear microwave surface impedance o... | \n", "[{'version': 'v1', 'created': 'Sat, 31 Aug 199... | \n", "2016-11-18 | \n", "[[Choudhury, Durga P., , Physics Department, N... | \n", "
2268249 | \n", "supr-con/9609002 | \n", "Durga P. Choudhury | \n", "Balam A. Willemsen, J. S. Derov and S.Sridhar ... | \n", "Critical State Flux Penetration and Linear Mic... | \n", "20 pages, LaTeX type, Uses REVTeX style files,... | \n", "None | \n", "10.1103/PhysRevB.56.11989 | \n", "None | \n", "supr-con cond-mat.supr-con | \n", "None | \n", "The vortex contribution to the dc field (H) ... | \n", "[{'version': 'v1', 'created': 'Tue, 3 Sep 1996... | \n", "2009-10-30 | \n", "[[Willemsen, Balam A., , Physics Department,\\n... | \n", "
2268250 | \n", "supr-con/9609003 | \n", "Hasegawa Yasumasa | \n", "Yasumasa Hasegawa (Himeji Institute of Technol... | \n", "Density of States and NMR Relaxation Rate in A... | \n", "7 pages, 4 PostScript Figures, LaTeX, to appea... | \n", "None | \n", "10.1143/JPSJ.65.3131 | \n", "None | \n", "supr-con cond-mat.supr-con | \n", "None | \n", "We show that the density of states in an ani... | \n", "[{'version': 'v1', 'created': 'Wed, 18 Sep 199... | \n", "2009-10-30 | \n", "[[Hasegawa, Yasumasa, , Himeji Institute of Te... | \n", "
2268251 | \n", "supr-con/9609004 | \n", "Masanori Ichioka | \n", "Naoki Enomoto, Masanori Ichioka and Kazushige ... | \n", "Ginzburg Landau theory for d-wave pairing and ... | \n", "12 pages including 8 eps figs, LaTeX with jpsj... | \n", "J. Phys. Soc. Jpn. 66, 204 (1997). | \n", "10.1143/JPSJ.66.204 | \n", "None | \n", "supr-con cond-mat.supr-con | \n", "None | \n", "The Ginzburg Landau theory for d_{x^2-y^2}-w... | \n", "[{'version': 'v1', 'created': 'Wed, 25 Sep 199... | \n", "2009-10-30 | \n", "[[Enomoto, Naoki, , Okayama Univ.], [Ichioka, ... | \n", "
2268252 rows × 14 columns
\n", "