lewispons's picture
Initial Setup
2d4243e
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import ColumnSelector
from sklearn.feature_extraction.text import TfidfVectorizer
from utils.utilities import *
import sys
from pprint import pprint
# Absolute path to the YAML file holding the model settings. It points into
# one developer's home directory, so it must be edited to run anywhere else.
CONFIG_FILE_PATH = "/Users/luis.morales/Desktop/arxiv-paper-recommender/models/configs.yaml"
# Load the settings once, at import time. `read_yaml_config` is not defined in
# this file; presumably it comes from the `utils.utilities` star import.
config = read_yaml_config(CONFIG_FILE_PATH)
# Echo the loaded settings to stdout so each run shows what it was given.
pprint(config)
@execution_time
def train_tfidf():
    """Fit a TF-IDF model on a sample of abstracts and save the matrix.

    Reads the processed papers parquet file, draws up to 500,000 rows at
    random, fits a ``TfidfVectorizer`` built from
    ``config["models"]["tfidf"]["tfidf_deffault"]`` on the
    ``cleaned_abstracts`` column, and writes the resulting document-term
    matrix to ``reduced_arxiv_tfidf.parquet.gzip`` with one column per
    vocabulary term.

    Returns:
        None. The result is written to disk.

    Raises:
        KeyError: If the expected keys are missing from ``config`` or the
            input frame has no ``cleaned_abstracts`` column.
    """
    max_rows = 500000
    papers = pd.read_parquet("/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/arxiv_papers.parquet.gzip")
    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (without replacement), so cap the request at the frame's length.
    df = papers.sample(min(max_rows, len(papers))).reset_index(drop=True)

    tfidf_params = config["models"]["tfidf"]["tfidf_deffault"]
    vectorizer = TfidfVectorizer(**tfidf_params)
    pprint(tfidf_params)

    # Training continues straight after the parameters are printed; a leftover
    # debugging `sys.exit()` used to stop the run here, before anything was
    # fitted or saved.
    vectors = vectorizer.fit_transform(df['cleaned_abstracts'])

    # NOTE(review): toarray() materialises the full rows x vocabulary matrix in
    # memory. Confirm the configured vectorizer bounds the vocabulary (e.g.
    # `max_features`) before running on the full sample.
    tfidf_df = pd.DataFrame(vectors.toarray(), columns=vectorizer.get_feature_names_out())
    tfidf_df.to_parquet("/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/reduced_arxiv_tfidf.parquet.gzip")
# Run the job immediately. There is no `__main__` guard, so this executes on
# import as well as when the file is run as a script.
train_tfidf()