File size: 2,261 Bytes
2d4243e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import yaml
import time
from typing import Dict, Union
import pandas as pd
import spacy
import os


def read_yaml_config(file_path: str) -> Dict:
    """
    Reads a YAML configuration file and returns the loaded configuration.

    Args:
        file_path (str): The path to the YAML configuration file.

    Returns:
        Dict: The parsed document, exactly as produced by ``yaml.safe_load``.
            NOTE(review): ``yaml.safe_load`` yields ``None`` for an empty
            document and a list/scalar when the top-level node is not a
            mapping, so callers that require a dict should check the result.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        yaml.YAMLError: If the file content is not valid YAML.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # locale encoding (which is not UTF-8 on every system).
    with open(file_path, 'r', encoding='utf-8') as file:
        return yaml.safe_load(file)



def validate_and_create_subfolders(
    model_name: str,
    parent_subfolder: str = "/Users/luis.morales/Desktop/arxiv-paper-recommender/models"
):
    """
    Ensures the folder layout for a model exists under ``parent_subfolder``.

    The layout is ``<parent_subfolder>/<model_name>/`` containing the
    subfolders ``data``, ``dictionaries``, ``similarities_matrix`` and
    ``tdidf``. Anything that is missing is created; anything that already
    exists is left untouched, so the call is safe to repeat.

    Args:
        model_name (str): Name of the model; used as the folder name.
        parent_subfolder (str): Directory under which the model folder lives.

    Raises:
        OSError: If a directory cannot be created (e.g. permission denied, or
            a regular file already occupies one of the paths).
    """
    # NOTE(review): "tdidf" looks like a typo for "tfidf", but it is kept
    # byte-for-byte because other code may read from this folder name.
    model_subfolders = ["data", "dictionaries", "similarities_matrix", "tdidf"]
    model_root = f"{parent_subfolder}/{model_name}"

    # exist_ok=True makes every step idempotent: a model folder that already
    # exists but lacks some subfolders gets repaired instead of being skipped,
    # and there is no check-then-create race.
    os.makedirs(model_root, exist_ok=True)
    for msubfolder in model_subfolders:
        os.makedirs(os.path.join(model_root, msubfolder), exist_ok=True)
                
                




def execution_time(func):
    """
    Decorator that measures the execution time of a function and prints the elapsed time.

    The wrapped function's return value is passed through unchanged. The
    timing line is printed only when the call returns normally; if the
    function raises, the exception propagates and nothing is printed.

    Args:
        func: The callable to time.

    Returns:
        A wrapper with the same call signature that carries over the
        metadata (``__name__``, ``__doc__``, ...) of ``func``.
    """
    # Imported locally so the decorator does not rely on a module-level import.
    from functools import wraps

    @wraps(func)  # keep the decorated function's name/docstring for introspection
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic and high-resolution, so the measurement
        # cannot be distorted by system clock adjustments like time.time().
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        execution_seconds = time.perf_counter() - start_time
        print(f"Function '{func.__name__}' executed in {execution_seconds:.4f} seconds.")
        return result
    return wrapper


# Cache for the default spaCy pipeline; populated on first use.
_DEFAULT_NLP = None


def _get_default_nlp():
    """Loads the 'en_core_web_sm' spaCy pipeline once and reuses it afterwards."""
    global _DEFAULT_NLP
    if _DEFAULT_NLP is None:
        _DEFAULT_NLP = spacy.load('en_core_web_sm')
    return _DEFAULT_NLP


def cleanData(doc: Union[pd.Series, str], nlp=None):
    """
    TODO: Optimize NLP Object to only obtain stopwords, lemmas, and tokenize docs.

    Cleans a document: lowercases it, tokenizes it with ``nlp``, drops
    stop-word and punctuation tokens, and joins the lemmas of the remaining
    tokens with single spaces.

    Args:
        doc (str): The document to clean.
            NOTE(review): the annotation also lists ``pd.Series``, but the
            body calls ``doc.lower()`` directly, which a Series does not
            provide. Confirm how callers pass a Series; presumably they use
            ``series.apply(cleanData)``.
        nlp (optional): Callable that turns a string into an iterable of
            tokens exposing ``is_stop``, ``is_punct`` and ``lemma_`` (a spaCy
            ``Language`` object). Defaults to the ``en_core_web_sm`` pipeline,
            which is loaded lazily on the first call that needs it rather
            than at import time.

    Returns:
        str: The cleaned and processed document as a single string.
    """
    if nlp is None:
        nlp = _get_default_nlp()

    # Lowercase before tokenizing so the pipeline sees normalized text.
    parsed = nlp(doc.lower())
    lemmas = [
        token.lemma_
        for token in parsed
        if not token.is_stop and not token.is_punct
    ]
    return " ".join(lemmas)