import argparse
import logging
import sys
import time

import pandas as pd
from gensim.models import TfidfModel
from gensim.similarities import SparseMatrixSimilarity

from utils.utilities import read_yaml_config, validate_and_create_subfolders
from utils.mlutilities import *  # provides gensim_tokenizer and get_gensim_dictionary

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler("debug.log"),
        logging.StreamHandler(sys.stdout)
    ]
)


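# Expected shape of configs.yaml, inferred from the keys read below (values here are
# illustrative, not the project's actual settings):
#
#   GensimConfig:
#     Medium:
#       ModelName: "medium-recommender"
#       DataSetFracSplit: 0.5        # null => train on the full corpus
#       RandomSeedSplit: 42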
model_configurations = read_yaml_config("/Users/luis.morales/Desktop/arxiv-paper-recommender/src/models/configs.yaml")


if __name__ == "__main__":
    # Example:
    #   python3 ./src/models/train_recommender.py --modelsize Medium
    # Define and parse command-line arguments
    parser = argparse.ArgumentParser(description='ArXiv Paper Recommender CLI')
    parser.add_argument('--modelsize', choices=["Large", "SubLarge", "Medium", "Small"], default=None, type=str, help='Model size')

    args = parser.parse_args()
    model_size = args.modelsize
    start = time.time()
    
    
    if model_size is None:
        raise ValueError("The `--modelsize` flag was not passed to the CLI.")
    
    
    model_config = model_configurations["GensimConfig"][model_size]
    model_name = model_config["ModelName"]
    dataset_frac_split = model_config["DataSetFracSplit"]
    random_seed = model_config["RandomSeedSplit"]
    logging.info(f"Started training of {model_name} Model.")
    

    validate_and_create_subfolders(
        model_name=model_name
    )
    logging.info(f"Model Folder `{model_name}` was created successfully.")
    
    
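    # DataSetFracSplit controls how much of the corpus is used: None trains on the
    # full dataset; any other value is passed to DataFrame.sample(frac=...) together
    # with RandomSeedSplit so the subsample is reproducible.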
    if dataset_frac_split is None:
        df = pd.read_parquet("/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/reduced_arxiv_papers.parquet.gzip")
        logging.info("The full text corpus was read.")
    else:
        df = pd.read_parquet("/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/reduced_arxiv_papers.parquet.gzip") \
            .sample(frac=dataset_frac_split, random_state=random_seed) \
            .reset_index(drop=True)
        logging.info(f"A random sample of fraction {dataset_frac_split} was drawn from the text corpus.")
    logging.info(f"Dimensions of the dataset: {df.shape}")
    
    df.to_parquet(f"/Users/luis.morales/Desktop/arxiv-paper-recommender/models/data/{model_name}.parquet.gzip", compression='gzip')
    logging.info(f"The Dataset used for this training was successfully saved in: `/Users/luis.morales/Desktop/arxiv-paper-recommender/models/data/{model_name}.parquet.gzip`.")
    
    

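    # Tokenize the cleaned abstracts. gensim_tokenizer is a project helper from
    # utils.mlutilities, assumed to return one list of tokens per document, which is
    # what dictionary.doc2bow expects below.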
    corpus = df['cleaned_abstracts'].to_list()
    tokenized_corpus = gensim_tokenizer(corpus)
    logging.info(f"Dictionary Learned on the {model_name} corpus dataset.")
    
    
    dictionary = get_gensim_dictionary(tokenized_docs=tokenized_corpus, dict_name=model_name, save_dict=True)
    logging.info("Dictionary Saved Locally.")
    
    
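    # Convert each tokenized document into a sparse bag-of-words vector and fit a
    # TF-IDF model on the whole BoW corpus. allow_update=True would add unseen tokens
    # to the dictionary, but the dictionary was built from this same corpus, so no new
    # ids are expected here.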
    BoW_corpus = [dictionary.doc2bow(doc, allow_update=True) for doc in tokenized_corpus]
    tfidf_model = TfidfModel(BoW_corpus)
    logging.info(f"TD-IDF {model_name} Model was successfully trained.")
    
    
    tfidf_model.save(f"/Users/luis.morales/Desktop/arxiv-paper-recommender/models/tfidf/{model_name}.model")
    logging.info(f"Model: {model_name} was successfully saved.")


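    # Build a sparse similarity index over the TF-IDF vectors of the corpus. At query
    # time, indexing it with a TF-IDF query vector returns cosine similarities against
    # every paper.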
    index = SparseMatrixSimilarity(tfidf_model[BoW_corpus], num_features=len(dictionary))
    logging.info(f"The Similarities Sparse Matrix was successfully created.")
    index.save(f"/Users/luis.morales/Desktop/arxiv-paper-recommender/models/similarities_matrix/{model_name}")
    logging.info(f"The Similarities Matrix was successfully saved for the model: {model_name}.")
    
    end = time.time()
    total_time = end - start
    logging.info(f"Full Training of {model_size} model took {total_time} secs.")
    logging.info(f"The {model_name} Model was successfully trained! yei :)")