import argparse
import re
from random import choice
from typing import List

import pandas as pd
from gensim import corpora, similarities
from gensim.models import TfidfModel
from gensim.parsing.preprocessing import (
    preprocess_string,
    remove_stopwords,
    strip_multiple_whitespaces,
    strip_numeric,
    strip_punctuation,
    strip_tags,
)

from utils.constants import TEST_INPUTS


def transform_to_lower(s: str) -> str:
    return s.lower()


def remove_single_char(s: str) -> str:
    # Replace isolated single-character tokens with a space so the
    # surrounding words are not glued together.
    return re.sub(r"\s+\w\s+", " ", s)


class PaperRecommender:
    def __init__(
        self,
        num_samples=3000,
        corpus_dictionary_path="30Ktokens",
        arxiv_dataset_path="/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/reduced_arxiv_papers.parquet.gzip",
        save_dict=False,
        query=None,
    ):
        self.num_samples = num_samples
        self.corpus_dictionary_path = corpus_dictionary_path
        self.arxiv_dataset_path = arxiv_dataset_path
        self.save_dict = save_dict
        self.query = query
        self.cleaning_filters = [
            strip_tags,
            strip_numeric,
            strip_punctuation,
            strip_multiple_whitespaces,
            transform_to_lower,
            remove_stopwords,
            remove_single_char,
        ]
        self.dictionary = None
        self.index = None
        self.tfidf_model = None
        self.df = None

    def gensim_tokenizer(self, docs: List[str]) -> List[List[str]]:
        """Apply the cleaning filters to every document in the corpus."""
        return [preprocess_string(doc, self.cleaning_filters) for doc in docs]

    def cleaning_pipe(self, document: str) -> List[str]:
        """Apply the cleaning filters to a single document."""
        return preprocess_string(document, self.cleaning_filters)

    def get_gensim_dictionary(
        self, tokenized_docs: List[List[str]], dict_name: str = "corpus"
    ) -> corpora.Dictionary:
        """Build the token dictionary, optionally persisting it to disk."""
        dictionary = corpora.Dictionary(tokenized_docs)
        if self.save_dict:
            parent_folder = "/Users/luis.morales/Desktop/arxiv-paper-recommender/models/nlp_dictionaries"
            dictionary.save(f"{parent_folder}/{dict_name}.dict")
        return dictionary

    def get_closest_n(self, query: str, n: int):
        """Return the indices of the n documents most similar to the query."""
        query_document = self.cleaning_pipe(query)
        query_bow = self.dictionary.doc2bow(query_document)
        sims = self.index[self.tfidf_model[query_bow]]
        top_idx = sims.argsort()[-n:][::-1]
        return top_idx

    def get_recommendations_metadata(self, query: str, n: int) -> pd.DataFrame:
        """Look up the metadata rows for the n closest documents."""
        recommendations_idxs = self.get_closest_n(query, n)
        return self.df.iloc[recommendations_idxs].reset_index(drop=True)

    def run_recommender(self, n: int = 10) -> pd.DataFrame:
        # Load the full dataset, or a random sample of num_samples rows.
        # (The original loaded the full dataset and then unconditionally
        # overwrote it with a sample, which fails when num_samples is None.)
        if self.num_samples is None:
            self.df = pd.read_parquet(self.arxiv_dataset_path)
        else:
            self.df = (
                pd.read_parquet(self.arxiv_dataset_path)
                .sample(self.num_samples)
                .reset_index(drop=True)
            )
        corpus = self.df["cleaned_abstracts"].to_list()
        tokenized_corpus = self.gensim_tokenizer(corpus)
        self.dictionary = self.get_gensim_dictionary(
            tokenized_docs=tokenized_corpus, dict_name=self.corpus_dictionary_path
        )
        # The dictionary was built from this same corpus, so no update is needed.
        bow_corpus = [self.dictionary.doc2bow(doc) for doc in tokenized_corpus]
        self.tfidf_model = TfidfModel(bow_corpus)
        self.index = similarities.SparseMatrixSimilarity(
            self.tfidf_model[bow_corpus], num_features=len(self.dictionary)
        )
        if self.query is None:
            self.query = choice(TEST_INPUTS)
        # NOTE: the original returned an undefined self.results; return the
        # top-n recommendation metadata directly (the n=10 default is assumed).
        return self.get_recommendations_metadata(self.query, n)
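
# --- Usage sketch (assumption): the original file imports argparse but shows
# no entry point. The block below is a minimal, illustrative CLI driver for
# PaperRecommender; the flag names and their defaults are not taken from the
# source and may differ from the project's actual interface.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="TF-IDF arXiv paper recommender")
    parser.add_argument(
        "--query",
        type=str,
        default=None,
        help="Free-text query; a random TEST_INPUTS entry is used if omitted",
    )
    parser.add_argument(
        "--num-samples",
        type=int,
        default=3000,
        help="Number of papers to sample from the parquet dataset",
    )
    parser.add_argument(
        "--top-n",
        type=int,
        default=10,
        help="Number of recommendations to return",
    )
    args = parser.parse_args()

    recommender = PaperRecommender(num_samples=args.num_samples, query=args.query)
    results = recommender.run_recommender(n=args.top_n)
    print(results)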