import os
import re
import unicodedata

def strip_accents(text: str) -> str:
    """Removes accents from text."""
    return ''.join(c for c in unicodedata.normalize('NFD', text)
                  if unicodedata.category(c) != 'Mn')
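
# Illustrative sanity check (not part of the original module): NFD splits an
# accented character into its base letter plus any combining marks (category
# "Mn"), which the filter above then drops.
#   strip_accents("naïve café")  ->  "naive cafe"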


def load_raw_text(corpus_directory: str, file_names=None) -> str:
    """Loads all the text files in a directory into one large string.

    If file_names is given, only files in that list are loaded.
    """
    corpus = ""

    for file_name in os.listdir(corpus_directory):
        # Optionally restrict to an explicit list of files
        if file_names is not None and file_name not in file_names:
            continue

        file_path = os.path.join(corpus_directory, file_name)
        if os.path.isdir(file_path):
            continue

        # Make sure we only read text files
        if not file_name.endswith(".txt"):
            continue

        # Read the file as a string
        with open(file_path, 'r', encoding='utf-8') as file:
            file_contents = file.read()
            corpus += (file_contents + "\n")
    return corpus
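
# Illustrative usage (the directory name is an assumption):
#   corpus = load_raw_text("data/corpus")   # concatenates every .txt file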

def load_single_raw_text_file(file_name: str) -> str:
    """Loads a single text file into one large string."""
    with open(file_name, 'r', encoding='utf-8') as file:
        corpus = file.read() + "\n"

    return corpus


# Note: inside [...] a "|" is a literal pipe character, not alternation, so the
# original pattern also matched pipes; this pattern matches runs of word
# characters and apostrophes.
word_regex = r"[\w']+"
def tokenize(text: str) -> list:
    return re.findall(word_regex, text)
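
# Illustrative example of the tokenizer above:
#   tokenize("Don't stop, now!")  ->  ["Don't", "stop", "now"]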


def preprocess(text: str) -> list:
    """Tokenizes and filters text that is already separated into words by
    spaces. Designed for English punctuation."""
    text = strip_accents(text)
    text = text.lower()

    tokens = text.split(" ")

    tokens_filtered = []
    for token in tokens:
        # Keep only clean tokens: words (with optional apostrophes) or end
        # punctuation. fullmatch ensures tokens containing any other special
        # characters are skipped entirely.
        if re.fullmatch(r"[\w']+|[.,?!]", token):
            tokens_filtered.append(token)
    return tokens_filtered
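
# Illustrative example (assumes punctuation is space-separated, as the
# docstring requires):
#   preprocess("Café is open . Really ?")
#     ->  ["cafe", "is", "open", ".", "really", "?"]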


def pad(text: list, num_padding: int):
    """Pads the given text, as a list of tokens, with <s> tokens between sentences."""
    padded_text = []

    # Add initial padding before the first sentence
    for _ in range(num_padding):
        padded_text.append("<s>")

    for word in text:
        padded_text.append(word)

        # Every time we see an end punctuation mark, add <s> tokens after it,
        # so the next sentence starts with padding.
        # REPLACE IF YOUR LANGUAGE USES DIFFERENT END PUNCTUATION
        if word in [".", "?", "!"]:
            for _ in range(num_padding):
                padded_text.append("<s>")

    return padded_text
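

# A minimal end-to-end sketch (illustrative; the sample text and num_padding
# value are assumptions, not part of the original module):
if __name__ == "__main__":
    sample = "The cat sat . The dog ran !"
    tokens = preprocess(sample)
    # For an n-gram model of order n, pad with n - 1 <s> tokens; here n = 3.
    padded = pad(tokens, num_padding=2)
    print(padded)
    # -> ['<s>', '<s>', 'the', 'cat', 'sat', '.', '<s>', '<s>',
    #     'the', 'dog', 'ran', '!', '<s>', '<s>']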