# GrammarGuru/src/models/utils/utilities.py
import os
import time
from functools import wraps
from typing import Dict, Union

import pandas as pd
import spacy
import yaml


def read_yaml_config(file_path: str) -> Dict:
    """
    Reads a YAML configuration file and returns the loaded configuration as a dictionary.

    Args:
        file_path (str): The path to the YAML configuration file.

    Returns:
        Dict: The parsed configuration.
    """
    with open(file_path, 'r') as file:
        config = yaml.safe_load(file)
    return config
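
# Example usage (illustrative; the path and key below are hypothetical):
# config = read_yaml_config("config/model_params.yaml")
# print(config["model_name"])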


def validate_and_create_subfolders(
    model_name: str,
    parent_subfolder: str = "/Users/luis.morales/Desktop/arxiv-paper-recommender/models"
):
    """
    Creates the per-model folder layout under `parent_subfolder` if the model folder does not already exist.
    """
    model_subfolders = ["data", "dictionaries", "similarities_matrix", "tdidf"]
    if not os.path.exists(f"{parent_subfolder}/{model_name}"):
        os.makedirs(f"{parent_subfolder}/{model_name}")
        for msubfolder in model_subfolders:
            os.makedirs(f"{parent_subfolder}/{model_name}/{msubfolder}")


def execution_time(func):
    """
    Decorator that measures the execution time of a function and prints the elapsed time.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        execution_seconds = end_time - start_time
        print(f"Function '{func.__name__}' executed in {execution_seconds:.4f} seconds.")
        return result
    return wrapper
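
# Example usage (illustrative): any function can be wrapped with the decorator.
# @execution_time
# def slow_square(x):
#     time.sleep(0.1)
#     return x * x
#
# slow_square(3)  # prints something like: Function 'slow_square' executed in 0.1003 seconds.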


def cleanData(doc: Union[pd.Series, str], nlp=spacy.load('en_core_web_sm')):
    """
    TODO: Optimize the NLP object to only run tokenization, stop-word removal, and lemmatization.

    Cleans the input document by lowercasing it, removing stop words and
    punctuation, and lemmatizing the remaining tokens.

    Args:
        doc (str): The document to be cleaned, as a single string.
        nlp: The spaCy language pipeline used for tokenization. Defaults to 'en_core_web_sm'.

    Returns:
        str: The cleaned and processed document as a single string.
    """
    doc = doc.lower()
    doc = nlp(doc)
    # Drop stop words and punctuation, then lemmatize the remaining tokens.
    tokens = [token for token in doc if not token.is_stop]
    tokens = [token for token in tokens if not token.is_punct]
    final_tokens = [token.lemma_ for token in tokens]
    return " ".join(final_tokens)