# Spaces:
# Sleeping
# Sleeping
import os
import sys
from pprint import pprint

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE(review): ColumnSelector does not appear to be exported by
# sklearn.feature_selection (mlxtend ships a class with this name) -- confirm
# that this import resolves in the target environment.
from sklearn.feature_selection import ColumnSelector
from sklearn.pipeline import Pipeline

from utils.utilities import *
# Location of the YAML model configuration.  When the ARXIV_RECOMMENDER_CONFIG
# environment variable is set to a non-empty value it overrides the
# developer-machine default below, so the script can run from other checkouts.
CONFIG_FILE_PATH = (
    os.environ.get("ARXIV_RECOMMENDER_CONFIG")
    or "/Users/luis.morales/Desktop/arxiv-paper-recommender/models/configs.yaml"
)

# Parsed once at import time; train_tfidf() reads its TF-IDF settings from it.
config = read_yaml_config(CONFIG_FILE_PATH)
pprint(config)
_DEFAULT_PAPERS_PATH = "/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/arxiv_papers.parquet.gzip"
_DEFAULT_TFIDF_PATH = "/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/reduced_arxiv_tfidf.parquet.gzip"


def train_tfidf(
    input_path: str = _DEFAULT_PAPERS_PATH,
    output_path: str = _DEFAULT_TFIDF_PATH,
    sample_size: int = 500000,
) -> None:
    """Fit a TF-IDF vectorizer on sampled abstracts and save the result.

    The parquet file at ``input_path`` is loaded, at most ``sample_size`` rows
    are drawn at random, a ``TfidfVectorizer`` configured from
    ``config["models"]["tfidf"]["tfidf_deffault"]`` is fitted on the
    ``cleaned_abstracts`` column, and the resulting matrix is written to
    ``output_path`` as a parquet file with one column per vocabulary term.

    Args:
        input_path: Parquet file holding a ``cleaned_abstracts`` column.
        output_path: Destination parquet file for the TF-IDF matrix.
        sample_size: Upper bound on the number of rows used for fitting.
            Datasets with fewer rows are used in full instead of raising.

    Raises:
        KeyError: If the TF-IDF section is missing from the loaded config or
            the input frame has no ``cleaned_abstracts`` column.
    """
    # The key spelling ("deffault") has to match the key used in the YAML file.
    tfidf_params = config["models"]["tfidf"]["tfidf_deffault"]

    papers = pd.read_parquet(input_path)
    # DataFrame.sample raises when asked for more rows than exist, so cap it.
    n_rows = min(sample_size, len(papers))
    papers = papers.sample(n_rows).reset_index(drop=True)

    vectorizer = TfidfVectorizer(**tfidf_params)
    pprint(tfidf_params)

    vectors = vectorizer.fit_transform(papers["cleaned_abstracts"])

    # NOTE: toarray() densifies the sparse matrix, so memory use grows with
    # rows x vocabulary size; keep sample_size / the vocabulary bounded.
    tfidf_df = pd.DataFrame(
        vectors.toarray(),
        columns=vectorizer.get_feature_names_out(),
    )

    # NOTE(review): the ".gzip" suffix is only part of the file name; no
    # compression= argument is passed, so pandas' default codec is used.
    tfidf_df.to_parquet(output_path)
# Kick off the TF-IDF training job whenever this module is executed or imported.
train_tfidf()