{ "cells": [ { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "from gensim import corpora\n", "from gensim.similarities import SparseMatrixSimilarity\n", "from gensim.models import TfidfModel\n", "import pandas as pd\n", "import gensim\n", "import pprint\n", "from gensim import corpora\n", "from gensim.utils import simple_preprocess\n", "from gensim.models import TfidfModel\n", "from gensim.parsing import strip_tags, strip_numeric, \\\n", " strip_multiple_whitespaces, stem_text, strip_punctuation, \\\n", " remove_stopwords, preprocess_string\n", "import re\n", "import os\n", "from typing import List" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'strip_tags' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[6], line 5\u001b[0m\n\u001b[1;32m 1\u001b[0m transform_to_lower \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mlambda\u001b[39;00m s: s\u001b[38;5;241m.\u001b[39mlower()\n\u001b[1;32m 2\u001b[0m remove_single_char \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mlambda\u001b[39;00m s: re\u001b[38;5;241m.\u001b[39msub(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124ms+\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mw\u001b[39m\u001b[38;5;132;01m{1}\u001b[39;00m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124ms+\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m'\u001b[39m, s)\n\u001b[1;32m 4\u001b[0m cleaning_filters \u001b[38;5;241m=\u001b[39m [\n\u001b[0;32m----> 5\u001b[0m \u001b[43mstrip_tags\u001b[49m,\n\u001b[1;32m 6\u001b[0m strip_numeric,\n\u001b[1;32m 7\u001b[0m strip_punctuation, \n\u001b[1;32m 8\u001b[0m strip_multiple_whitespaces, \n\u001b[1;32m 9\u001b[0m transform_to_lower,\n\u001b[1;32m 10\u001b[0m remove_stopwords,\n\u001b[1;32m 11\u001b[0m remove_single_char\n\u001b[1;32m 12\u001b[0m ]\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mgensim_tokenizer\u001b[39m(docs: List[\u001b[38;5;28mstr\u001b[39m]):\n\u001b[1;32m 15\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;124;03m Tokenizes a list of strings using a series of cleaning filters.\u001b[39;00m\n\u001b[1;32m 17\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;124;03m List[List[str]]: A list of tokenized documents, where each document is represented as a list of tokens.\u001b[39;00m\n\u001b[1;32m 23\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n", "\u001b[0;31mNameError\u001b[0m: name 'strip_tags' is not defined" ] } ], "source": [ "\n", "transform_to_lower = lambda s: s.lower()\n", "remove_single_char = lambda s: re.sub(r'\\s+\\w{1}\\s+', '', s)\n", "\n", "cleaning_filters = [\n", " strip_tags,\n", " strip_numeric,\n", " strip_punctuation, \n", " strip_multiple_whitespaces, \n", " transform_to_lower,\n", " remove_stopwords,\n", " remove_single_char\n", "]\n", "\n", "def gensim_tokenizer(docs: List[str]):\n", " \"\"\"\n", " Tokenizes a list of strings using a series of cleaning filters.\n", "\n", " Args:\n", " docs (List[str]): A list of strings to be tokenized.\n", "\n", " Returns:\n", " List[List[str]]: A list of tokenized documents, where each document is represented as a list of tokens.\n", " \"\"\"\n", " 
"    tokenized_docs = list()\n", "    for doc in docs:\n", "        processed_words = preprocess_string(doc, cleaning_filters)\n", "        tokenized_docs.append(processed_words)\n", "\n", "    return tokenized_docs\n", "\n", "\n", "def cleaning_pipe(document: str):\n", "    \"\"\"\n", "    Applies a series of cleaning steps to a single document.\n", "\n", "    Args:\n", "        document (str): The document to be cleaned.\n", "\n", "    Returns:\n", "        list: A list of processed words after applying the cleaning filters.\n", "    \"\"\"\n", "    # Invoke gensim.parsing.preprocess_string with the set of filters above\n", "    processed_words = preprocess_string(document, cleaning_filters)\n", "    return processed_words\n", "\n", "\n", "def get_gensim_dictionary(tokenized_docs: List[List[str]], dict_name: str = \"corpus\", save_dict: bool = False):\n", "    \"\"\"\n", "    Creates a dictionary of the words in the preprocessed corpus and optionally saves the dict object.\n", "    \"\"\"\n", "    dictionary = corpora.Dictionary(tokenized_docs)\n", "    if save_dict:\n", "        parent_folder = \"/Users/luis.morales/Desktop/arxiv-paper-recommender/models/nlp_dictionaries\"\n", "        dictionary.save(f'{parent_folder}/{dict_name}.dict')\n", "    return dictionary\n", "\n", "\n", "def get_closest_n(index_matrix: SparseMatrixSimilarity, query: str, n: int):\n", "    '''\n", "    Retrieves the top matching documents as per cosine similarity\n", "    between the TF-IDF vector of the query and all documents.\n", "\n", "    Args:\n", "        index_matrix (SparseMatrixSimilarity): Precomputed similarity index over the corpus.\n", "        query (str): The query string to find matching documents.\n", "        n (int): The number of closest documents to retrieve.\n", "\n", "    Returns:\n", "        numpy.ndarray: An array of indices representing the top matching documents.\n", "    '''\n", "    # Clean the query document using the cleaning_pipe function\n", "    query_document = cleaning_pipe(query)\n", "\n", "    # Convert the query document to its bag-of-words representation\n", "    query_bow = dictionary.doc2bow(query_document)\n", "\n", "    # Weight the query with the TF-IDF model, then score it against the index\n", "    sims = index_matrix[td_idf_model[query_bow]]\n", "\n", "    # Get the indices of the top n closest documents based on similarity scores\n", "    top_idx = sims.argsort()[-1 * n:][::-1]\n", "\n", "    return top_idx\n", "\n", "\n", "def get_recomendations_metadata(query: str, df: pd.DataFrame, n: int):\n", "    '''\n", "    Retrieves metadata recommendations based on a query using cosine similarity.\n", "\n", "    Args:\n", "        query (str): The query string for which recommendations are sought.\n", "        df (pd.DataFrame): The DataFrame containing metadata information.\n", "        n (int): The number of recommendations to retrieve.\n", "\n", "    Returns:\n", "        pd.DataFrame: A DataFrame containing the recommended metadata, reset with a new index.\n", "    '''\n", "    # Get the indices of the closest matching documents based on the query\n", "    recommendations_idxs = get_closest_n(similarities, query, n)\n", "\n", "    # Retrieve the recommended metadata rows from the DataFrame based on the indices\n", "    recommendations_metadata = df.iloc[recommendations_idxs]\n", "\n", "    # Reset the index of the recommended metadata DataFrame\n", "    recommendations_metadata = recommendations_metadata.reset_index(drop=True)\n", "\n", "    return recommendations_metadata" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dictionary = corpora.Dictionary.load(\"/Users/luis.morales/Desktop/arxiv-paper-recommender/models/dictionaries/LanguageLiberator.dict\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "td_idf_model = TfidfModel.load(\"/Users/luis.morales/Desktop/arxiv-paper-recommender/models/tfidf/SemanticSherlock.model\")" ] },
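{ "cell_type": "markdown", "metadata": {}, "source": [ "A quick, self-contained sanity check of the helpers above on a toy corpus. This is an illustrative sketch rather than part of the original pipeline: the `toy_*` names and sample sentences are made up, and it builds a small in-memory TF-IDF index instead of loading the saved models, mirroring the lookup that `get_closest_n` performs." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Illustrative sketch on assumed toy data (not from the original pipeline)\n", "toy_docs = [\n", "    \"Deep learning methods for natural language processing.\",\n", "    \"Graph neural networks and their applications.\",\n", "    \"Convolutional networks for image recognition.\"\n", "]\n", "\n", "toy_tokens = gensim_tokenizer(toy_docs)\n", "toy_dictionary = get_gensim_dictionary(toy_tokens, save_dict=False)\n", "toy_bow = [toy_dictionary.doc2bow(doc) for doc in toy_tokens]\n", "\n", "# Fit a TF-IDF model on the toy corpus and index the weighted vectors\n", "toy_tfidf = TfidfModel(toy_bow)\n", "toy_index = SparseMatrixSimilarity(toy_tfidf[toy_bow], num_features=len(toy_dictionary))\n", "\n", "# Score a query against the index the same way get_closest_n does\n", "toy_query_bow = toy_dictionary.doc2bow(cleaning_pipe(\"neural networks for language\"))\n", "print(toy_index[toy_tfidf[toy_query_bow]])  # one cosine similarity per toy document" ] },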
"cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "similarities = SparseMatrixSimilarity.load(\"/Users/luis.morales/Desktop/arxiv-paper-recommender/models/similarities_matrix/LanguageLiberatorSimilarities/LanguageLiberator\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ " query = args.query\n", " \n", " df = pd.read_parquet(\"/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/reduced_arxiv_papers.parquet.gzip\")\n", " \n", " dict_corpus = Dictionary.load(\"/Users/luis.morales/Desktop/arxiv-paper-recommender/models/dictionaries/LanguageLiberator.dict\")\n", " \n", " td_idf_model = TfidfModel.load(\"/Users/luis.morales/Desktop/arxiv-paper-recommender/models/tfidf/SemanticSherlock.model\")\n", " \n", " similarities = SparseMatrixSimilarity.load(\"/Users/luis.morales/Desktop/arxiv-paper-recommender/models/similarities_matrix/LanguageLiberatorSimilarities/LanguageLiberator\")\n", " \n", " results_df = get_recomendations_metadata(query=query, df=df, n=3)\n", " print(results_df.head())" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3.11.4 ('arxiv-env': venv)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.4" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "aae17c2ae2f38cc6f211be9b71a2aa280701d8462782cbc1f67caa83a1603363" } } }, "nbformat": 4, "nbformat_minor": 2 }