# Aru_Thaqana / util.py
# Alexis Palmer
# Scramble for Aymara, first version
import os
import re
import unicodedata
def strip_accents(text: str) -> str:
"""Removes accents from text."""
return ''.join(c for c in unicodedata.normalize('NFD', text)
if unicodedata.category(c) != 'Mn')
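# Example (illustrative input, not project data): strip_accents("canción")
# returns "cancion". NFD decomposition splits each accented letter into a
# base character plus combining marks (category Mn), which are then dropped.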
def load_raw_text(corpus_directory: str, file_names=None) -> str:
    """Loads all the text files in a directory into one large string.

    If file_names is given, only files in that collection are read.
    """
    corpus = ""
    for file_name in os.listdir(corpus_directory):
        file_path = os.path.join(corpus_directory, file_name)
        # Skip subdirectories
        if os.path.isdir(file_path):
            continue
        # Make sure we only read text files
        if not file_name.endswith(".txt"):
            continue
        # If an explicit file list was given, honor it
        if file_names is not None and file_name not in file_names:
            continue
        # Read the file as a string
        with open(file_path, 'r', encoding='utf-8') as file:
            corpus += file.read() + "\n"
    return corpus
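# Example (hypothetical path): load_raw_text("data/") reads every .txt file
# directly inside data/ and joins their contents with newlines.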
def load_single_raw_text_file(file_name: str) -> str:
    """Loads a single text file into one large string."""
    with open(file_name, 'r', encoding='utf-8') as file:
        return file.read() + "\n"
# Note: inside a character class, "|" is a literal pipe, so the earlier
# [\w|\']+ would also match "|" characters; [\w']+ matches runs of word
# characters with internal apostrophes.
word_regex = r"[\w']+"

def tokenize(text: str) -> list:
    return re.findall(word_regex, text)
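# Example (illustrative Aymara-style input): tokenize("jach'a uru!") returns
# ["jach'a", "uru"]; apostrophes stay inside tokens, punctuation is dropped.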
def preprocess(text: str) -> list:
    """Tokenizes and processes text that is already separated into words by
    spaces. Designed for English punctuation."""
    text = strip_accents(text)
    text = text.lower()
    tokens = text.split(" ")
    tokens_filtered = []
    for token in tokens:
        # Keep only plain word tokens and end punctuation; skip any token
        # containing other special characters. fullmatch anchors the pattern
        # to the whole token, unlike re.match, which only anchors the start.
        if re.fullmatch(r"[\w']+|[.,?!]", token):
            tokens_filtered.append(token)
    return tokens_filtered
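# Example: preprocess("Hello , world !") -> ["hello", ",", "world", "!"],
# while a token with attached punctuation such as "world!" would be skipped.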
def pad(text: list, num_padding: int):
    """Pads the given text, as a list of tokens, with <s> markers before the
    first sentence and after each sentence-ending punctuation mark."""
padded_text = []
# Add initial padding to the first sentence
for _ in range(num_padding):
padded_text.append("<s>")
for word in text:
padded_text.append(word)
# Every time we see an end punctuation mark, add <s> tokens before it
# REPLACE IF YOUR LANGUAGE USES DIFFERENT END PUNCTUATION
if word in [".", "?", "!"]:
for _ in range(num_padding):
padded_text.append("<s>")
return padded_text
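
# Minimal end-to-end sketch of the helpers above; the sample sentence is an
# assumption for illustration, and num_padding=2 matches a trigram model's
# need for two <s> tokens of left context.
if __name__ == "__main__":
    sample = "Hello , world ! How are you ?"
    tokens = preprocess(sample)
    print(pad(tokens, num_padding=2))
    # ['<s>', '<s>', 'hello', ',', 'world', '!', '<s>', '<s>',
    #  'how', 'are', 'you', '?', '<s>', '<s>']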