Spaces:
Sleeping
Sleeping
import os | |
import re | |
import unicodedata | |
def strip_accents(text: str) -> str:
    """Return *text* with its combining accent marks dropped.

    The text is decomposed with Unicode NFD normalization so that an accented
    letter becomes a base letter followed by combining marks; every character
    in the "Mn" (nonspacing mark) category is then discarded.
    """
    decomposed = unicodedata.normalize('NFD', text)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars)
def load_raw_text(corpus_directory: str, file_names=None) -> str:
    """Load every ``.txt`` file directly inside a directory into one string.

    Args:
        corpus_directory: Path of the directory to scan. Sub-directories are
            skipped; the scan is not recursive.
        file_names: Not used by this function; kept in the signature so
            existing call sites remain valid.

    Returns:
        The contents of each text file, each followed by a newline,
        concatenated in sorted file-name order. An empty string is returned
        when the directory holds no ``.txt`` files.
    """
    pieces = []
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # resulting corpus reproducible from run to run.
    for file_name in sorted(os.listdir(corpus_directory)):
        # Only accept a real ".txt" suffix, so names such as "a.txt.bak"
        # are not mistaken for text files.
        if not file_name.endswith(".txt"):
            continue
        file_path = os.path.join(corpus_directory, file_name)
        if os.path.isdir(file_path):
            continue
        # Decode explicitly as UTF-8 rather than relying on the platform's
        # default encoding.
        with open(file_path, 'r', encoding="utf-8") as file:
            pieces.append(file.read() + "\n")
    return "".join(pieces)
def load_single_raw_text_file(file_name):
    """Load a single text file into one string.

    Args:
        file_name: Path of the file to read.

    Returns:
        The file's contents followed by one newline.
    """
    # Decode explicitly as UTF-8 rather than relying on the platform's
    # default encoding.
    with open(file_name, 'r', encoding="utf-8") as file:
        return file.read() + "\n"
# A token is a run of word characters and apostrophes, so contractions such
# as "don't" stay in one piece. Inside a character class "|" is a literal
# pipe rather than alternation, so it is deliberately not listed here.
word_regex = r"[\w']+"


def tokenize(text):
    """Return every word-like token found in *text*, in order of appearance.

    Anything that is not a word character or an apostrophe acts as a
    separator and is discarded.
    """
    return re.findall(word_regex, text)
def preprocess(text):
    """Normalize space-separated text and return the tokens that pass the filter.

    Accents are stripped and the text is lower-cased, then it is split on
    single space characters. A piece is kept when the filter pattern matches
    at its beginning; empty pieces (from consecutive spaces) never match and
    are dropped. Designed for English punctuation.
    """
    normalized = strip_accents(text).lower()
    # NOTE(review): re.match anchors only at the start of the piece, so this
    # checks the leading character(s) rather than the whole token - confirm
    # whether a full-token match was intended.
    keep_pattern = r"[\w|\']+|[\.|\,|\?|\!]"
    return [piece for piece in normalized.split(" ") if re.match(keep_pattern, piece)]
def pad(text: list, num_padding: int, end_punctuation=(".", "?", "!")):
    """Insert ``<s>`` boundary markers around the sentences of a token list.

    Args:
        text: Tokens, as a list of strings.
        num_padding: How many ``<s>`` markers to insert at each boundary.
        end_punctuation: Tokens that terminate a sentence. Defaults to English
            end punctuation; pass a different collection for other languages.

    Returns:
        A new list that starts with ``num_padding`` markers and has a further
        ``num_padding`` markers immediately after every end-of-sentence
        token, including the last one.
    """
    # Materialize as a tuple so a plain string argument is treated as a
    # collection of single-character tokens rather than a substring test.
    sentence_ends = tuple(end_punctuation)
    boundary = ["<s>"] * num_padding

    # Initial padding ahead of the first sentence.
    padded_text = list(boundary)
    for word in text:
        padded_text.append(word)
        # The markers go AFTER the end punctuation so they lead into the
        # next sentence.
        if word in sentence_ends:
            padded_text.extend(boundary)
    return padded_text