Spaces:

ariahmed
/

kurd-spell-app

Runtime error

App Files Files Community

kurd-spell-app / prepare_data /processes.py

ariahmed

Upload folder using huggingface_hub

e489264 verified 3 months ago

raw

history blame contribute delete

9.97 kB

	import random
	import re
	from typing import List, Union
	from interfaces import IProcess
	from helpers import get_freq_dict, load_text_file, remove_long_spaces
	from transformers import AutoTokenizer

	class LoadFile(IProcess):

	def execute(self, file_path: str):
	return load_text_file(
	file_path
	)


	class LinesSplitter(IProcess):
	def __init__(self, sep: str) -> None:
	super().__init__()
	self.sep = sep

	def split(self, line):
	return line.split(self.sep)

	def execute(self, data: Union[List[str], str]) -> List[str]:
	if isinstance(data, str):
	return data.split(self.sep)
	results = []
	for lines in map(self.split, data):
	results.extend(lines)
	return results


	class LengthFilter(IProcess):
	def __init__(
	self, min_length: int, max_length: int
	) -> None:
	super().__init__()
	self.min_length = min_length
	self.max_length = max_length

	def execute(self, lines: List[str]):
	return list(filter(
	lambda x: self.min_length <= len(x) <= self.max_length, lines
	))


	class WordsNumberFilter(IProcess):
	def __init__(self, min_words: int, max_words: int) -> None:
	super().__init__()
	self.min_words = min_words
	self.max_words = max_words

	def _is_valid(self, line: str) -> bool:
	return self.min_words < line.count(' ') < self.max_words

	def execute(self, lines: List[str]):
	return list(filter(self._is_valid, lines))

	class TokenizerLengthFilter(IProcess):
	def __init__(self, max_length: int = 1024) -> None:
	super().__init__()
	self.max_length = max_length
	self.tokenizer = AutoTokenizer.from_pretrained("./tokenizer")

	def _is_valid(self, line: str) -> bool:
	data = self.tokenizer.batch_encode_plus([line], max_length=self.max_length, truncation=True,return_overflowing_tokens=True )
	if len(data["input_ids"]) > 1:
	return True
	else:
	return False

	def execute(self, lines: List[str]):
	return list(filter(self._is_valid, lines))


	class WordsFilter(IProcess):
	def __init__(self, words: List[str]) -> None:
	super().__init__()
	self.words = set(words)

	def _not_contain(self, line: str) -> bool:
	return not any((
	word in line for word in self.words
	))

	def execute(self, lines: List[str]):
	return list(filter(self._not_contain, lines))


	class SoloCharFilter(IProcess):

	def _not_contain(self, line: str) -> bool:
	return re.search('^. \| . \| .$', line) is None

	def execute(self, lines: List[str]):
	return list(filter(self._not_contain, lines))


	class NumbersFilter(IProcess):

	def _not_contain(self, line: str) -> bool:
	return re.search('[0-9]+', line) is None

	def execute(self, lines: List[str]):
	return list(filter(self._not_contain, lines))


	class OOVFilter(IProcess):
	def __init__(self, max_oov: int) -> None:
	super().__init__()
	self.max_oov = max_oov
	self.__freq = {}

	def _is_valid(self, line: str):
	counter = 0
	for word in line.split(' '):
	counter += (self.__freq[word] == 1)
	return counter < self.max_oov

	def execute(self, lines: List[str]):
	self.__freq = get_freq_dict(lines)
	return list(filter(self._is_valid, lines))

	# text = ["کوردستان وڵاتی کوردانە هەی هەی هەی هەی", "کورد بوون گەوادیە", "ژیان سەختە"]
	# result = OOVFilter(5).execute(text)
	# print(result)


	class CharsRemover(IProcess):
	def __init__(self, chars: str) -> None:
	super().__init__()
	self.pat = f'[{chars}]'

	def remove(self, line: str) -> str:
	return re.sub(self.pat, '', line)

	def execute(self, lines: List[str]) -> List[str]:
	return list(map(self.remove, lines))


	class RepeatedCharsCollapsor(IProcess):
	def __init__(self, max_repeteion: int) -> None:
	super().__init__()
	self.pat = r"(.)\1{}".format(f"{{{2},}}")

	def collaps(self, line: str) -> str:
	return re.sub(self.pat, r"\1" * 1, line)

	def execute(self, lines: List[str]) -> List[str]:
	return list(map(self.collaps, lines))


	class ValidCharsKeeper(IProcess):
	def __init__(self, valid_chars: str, rep_with=' ') -> None:
	super().__init__()
	self.valid_chars = valid_chars
	self.rep_with = rep_with
	self.pat = f'[^{self.valid_chars}]'

	def __keep(self, line: str) -> str:
	return re.sub(self.pat, ' ', line)

	def execute(self, lines: List[str]) -> List[str]:
	return list(map(self.__keep, lines))


	class SpacesRemover(IProcess):

	def __remove(self, line: str) -> str:
	return remove_long_spaces(line).strip()

	def execute(self, lines: List[str]):
	return list(map(self.__remove, lines))


	class RandomCharsInjector(IProcess):
	def __init__(self, chars: str) -> None:
	super().__init__()
	self.chars = chars

	def get_char(self) -> str:
	return random.choice(self.chars)

	def execute(self, line: str):
	length = len(line)
	idx = random.randint(0, length - 1)
	return line[:idx] + self.get_char() + line[idx:]

	class PunctuationRemover(IProcess):
	def __init__(self) -> None:
	super().__init__()
	self.clean_punctuation = re.compile(r"(?<!\d)[.,;:'?!،.؟؛:»«](?!\d)")

	def __remove_punctuation(self, text: str):
	"""Remove all punctuation from string, except if it's between digits"""
	return self.clean_punctuation.sub("", text)

	def execute(self, line: str):
	return self.__remove_punctuation(line)


	class RandomCharsSwapper(IProcess):

	def execute(self, line: str) -> str:
	length = len(line)
	idx = random.randint(0, length - 2)
	return line[:idx] + line[idx + 1] + line[idx] + line[idx + 2:]


	class RandomCharRemover(IProcess):

	def execute(self, line: str) -> str:
	length = len(line)
	idx = random.randint(0, length - 1)
	return line[:idx] + line[idx + 1:]


	class RandomWordsCollapsor(IProcess):

	def execute(self, line: str) -> str:
	indices = [
	i for i, char in enumerate(line)
	if char == ' '
	]
	if len(indices) == 0:
	return line
	idx = random.choice(indices)
	return line[: idx] + line[idx + 1:]


	class RandomNeighborReplacer(IProcess):

	def __init__(self, keyboard_rows: List[str], blank: str) -> None:
	super().__init__()
	self.lines = keyboard_rows
	self.blank = blank
	self.n_rows = len(keyboard_rows)
	self._mapper = {}
	self.set_mapper()

	def __get_left(
	self, row_idx: int, col_idx: int
	) -> List[str]:
	if col_idx == 0:
	return []
	return [self.lines[row_idx][col_idx - 1]]

	def __get_right(
	self, row_idx: int, col_idx: int
	) -> List[str]:
	if col_idx == (len(self.lines[row_idx]) - 1):
	return []
	return self.lines[row_idx][col_idx + 1]

	def __get_upper(
	self, row_idx: int, col_idx: int
	) -> List[str]:
	if row_idx == 0:
	return []
	line = self.lines[row_idx - 1]
	start = max(0, col_idx - 1)
	end = min(len(line), col_idx + 2)
	return list(line[start: end])

	def __get_lower(
	self, row_idx: int, col_idx: int
	) -> List[str]:
	if row_idx == (self.n_rows - 1):
	return []
	line = self.lines[row_idx + 1]
	start = max(0, col_idx - 1)
	end = min(len(line), col_idx + 2)
	return list(line[start: end])

	def set_mapper(self) -> None:
	funcs = [
	self.__get_left,
	self.__get_right,
	self.__get_upper,
	self.__get_lower
	]
	for row_idx in range(self.n_rows):
	for col_idx in range(len(self.lines[row_idx])):
	items = []
	for func in funcs:
	items.extend(func(row_idx, col_idx))
	items = list(
	filter(lambda x: x != self.blank, items)
	)
	char = self.lines[row_idx][col_idx]
	self._mapper[char] = items.copy()

	def get_char(self, char: str) -> str:
	if char not in self._mapper:
	return char
	return random.choice(self._mapper[char])

	def execute(self, line: str) -> str:
	length = len(line)
	idx = random.randint(0, length - 1)
	return line[:idx] + self.get_char(line[idx]) + line[idx + 1:]


	class CharsNormalizer(IProcess):

	def __init__(self, mapper: dict) -> None:
	super().__init__()
	self.mapper = mapper

	def _normalize(self, line: str) -> str:
	for key, value in self.mapper.items():
	line = line.replace(key, value)
	return line

	def execute(self, lines: List[str]):
	return list(filter(self._normalize, lines))

	class SentencePermutation(IProcess):

	def __init__(self, sentences: List[str], augmentation_probability: float = 1) -> None:
	super().__init__()
	self.sentences = sentences
	self.augmentation_probability = augmentation_probability

	def _combine(self, text: str) -> str:
	if random.random() < self.augmentation_probability:
	sentences_to_sample = random.randint(0,10)
	augmentation_sentences = random.sample(self.sentences, sentences_to_sample)
	return text + " " + " ".join(augmentation_sentences)
	else:
	return text

	def execute(self, line: str) -> str:
	# return [self._combine(line) for line in lines]
	return self._combine(line)