# Aru_Thaqana / util.py
# Alexis Palmer
# Scramble for Aymara, first version
import os
import re
import unicodedata
def strip_accents(text: str) -> str:
"""Removes accents from text."""
return ''.join(c for c in unicodedata.normalize('NFD', text)
if unicodedata.category(c) != 'Mn')
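# Example (illustrative input, not project data): strip_accents("canción")
# returns "cancion". NFD decomposition splits each accented letter into a
# base character plus combining marks (category Mn), which are then dropped.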
def load_raw_text(corpus_directory: str, file_names=None) -> str:
    """Loads all the text files in a directory into one large string.

    If file_names is given, only files in that collection are read.
    """
    corpus = ""
    for file_name in os.listdir(corpus_directory):
        file_path = os.path.join(corpus_directory, file_name)
        # Skip subdirectories
        if os.path.isdir(file_path):
            continue
        # Make sure we only read text files
        if not file_name.endswith(".txt"):
            continue
        # If an explicit file list was given, honor it
        if file_names is not None and file_name not in file_names:
            continue
        # Read the file as a string
        with open(file_path, 'r', encoding='utf-8') as file:
            corpus += file.read() + "\n"
    return corpus
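# Example (hypothetical path): load_raw_text("data/") reads every .txt file
# directly inside data/ and joins their contents with newlines.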
def load_single_raw_text_file(file_name: str) -> str:
    """Loads a single text file into one large string."""
    with open(file_name, 'r', encoding='utf-8') as file:
        return file.read() + "\n"
# Note: inside a character class, "|" is a literal pipe, so the earlier
# [\w|\']+ would also match "|" characters; [\w']+ matches runs of word
# characters with internal apostrophes.
word_regex = r"[\w']+"

def tokenize(text: str) -> list:
    return re.findall(word_regex, text)
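# Example (illustrative Aymara-style input): tokenize("jach'a uru!") returns
# ["jach'a", "uru"]; apostrophes stay inside tokens, punctuation is dropped.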
def preprocess(text: str) -> list:
    """Tokenizes and processes text that is already separated into words by
    spaces. Designed for English punctuation."""
    text = strip_accents(text)
    text = text.lower()
    tokens = text.split(" ")
    tokens_filtered = []
    for token in tokens:
        # Keep only plain word tokens and end punctuation; skip any token
        # containing other special characters. fullmatch anchors the pattern
        # to the whole token, unlike re.match, which only anchors the start.
        if re.fullmatch(r"[\w']+|[.,?!]", token):
            tokens_filtered.append(token)
    return tokens_filtered
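# Example: preprocess("Hello , world !") -> ["hello", ",", "world", "!"],
# while a token with attached punctuation such as "world!" would be skipped.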
def pad(text: list, num_padding: int):
    """Pads the given text, as a list of tokens, with <s> markers before the
    first sentence and after each sentence-ending punctuation mark."""
padded_text = []
# Add initial padding to the first sentence
for _ in range(num_padding):
padded_text.append("<s>")
for word in text:
padded_text.append(word)
# Every time we see an end punctuation mark, add <s> tokens before it
# REPLACE IF YOUR LANGUAGE USES DIFFERENT END PUNCTUATION
if word in [".", "?", "!"]:
for _ in range(num_padding):
padded_text.append("<s>")
return padded_text
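
# Minimal end-to-end sketch of the helpers above; the sample sentence is an
# assumption for illustration, and num_padding=2 matches a trigram model's
# need for two <s> tokens of left context.
if __name__ == "__main__":
    sample = "Hello , world ! How are you ?"
    tokens = preprocess(sample)
    print(pad(tokens, num_padding=2))
    # ['<s>', '<s>', 'hello', ',', 'world', '!', '<s>', '<s>',
    #  'how', 'are', 'you', '?', '<s>', '<s>']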