import os
import time
from functools import wraps
from typing import Dict, Union

import pandas as pd
import spacy
import yaml


def read_yaml_config(file_path: str) -> Dict:
    """
    Reads a YAML configuration file and returns the loaded configuration as a dictionary.

    Args:
        file_path (str): The path to the YAML configuration file.

    Returns:
        Dict: The parsed configuration.
    """
    with open(file_path, "r") as file:
        config = yaml.safe_load(file)
    return config


def validate_and_create_subfolders(
    model_name: str,
    parent_subfolder: str = "/Users/luis.morales/Desktop/arxiv-paper-recommender/models",
) -> None:
    """
    Creates the expected subfolder layout for a model under `parent_subfolder`.

    Args:
        model_name (str): Name of the model whose folders should exist.
        parent_subfolder (str, optional): Root directory that holds all model folders.
    """
    model_subfolders = ["data", "dictionaries", "similarities_matrix", "tdidf"]
    # exist_ok=True makes the call idempotent: missing subfolders are created
    # even when the parent model folder already exists.
    for msubfolder in model_subfolders:
        os.makedirs(f"{parent_subfolder}/{model_name}/{msubfolder}", exist_ok=True)


def execution_time(func):
    """
    Decorator that measures the execution time of a function and prints the elapsed time.
    """
    @wraps(func)  # preserve the wrapped function's name and docstring
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        execution_seconds = time.time() - start_time
        print(f"Function '{func.__name__}' executed in {execution_seconds:.4f} seconds.")
        return result

    return wrapper


def cleanData(doc: Union[pd.Series, str], nlp=spacy.load("en_core_web_sm")) -> Union[pd.Series, str]:
    """
    TODO: Optimize NLP object to only obtain stopwords, lemmas, and tokenized docs.

    Cleans a document by lowercasing it, removing stopwords and punctuation,
    and lemmatizing the remaining tokens.

    Args:
        doc (Union[pd.Series, str]): A single document, or a Series of documents
            that is cleaned element-wise.
        nlp: A loaded spaCy pipeline. The default is evaluated once, when the
            module is imported.

    Returns:
        Union[pd.Series, str]: The cleaned document(s), each as a single string.
    """
    # The docstring promises Series support, so clean a Series element-wise.
    if isinstance(doc, pd.Series):
        return doc.apply(lambda text: cleanData(text, nlp))

    tokens = nlp(doc.lower())
    # Keep lemmas of tokens that are neither stopwords nor punctuation.
    final_tokens = [token.lemma_ for token in tokens if not token.is_stop and not token.is_punct]
    return " ".join(final_tokens)
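
# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal, hypothetical example of the helpers above working together. It
# assumes the en_core_web_sm model is installed (already required at import
# time) and uses a made-up sample sentence; nothing below is needed by the
# module itself.
if __name__ == "__main__":
    @execution_time
    def preprocess(text: str) -> str:
        # Reuses the spaCy pipeline loaded as cleanData's default argument.
        return cleanData(text)

    sample = "Recommender systems for arXiv papers often rely on TF-IDF similarity."
    print(preprocess(sample))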