from threading import Thread import constants from pathlib import Path import random from typing import Union, Any, List from interfaces import IProcess, IProcessor from processes import ( RandomCharRemover, RandomCharsInjector, RandomCharsSwapper, RandomNeighborReplacer, RandomWordsCollapsor, PunctuationRemover, SentencePermutation, ) class FilesProcessor(IProcessor): def __init__( self, processes: List[IProcess], n_dist: int = 32 ) -> None: self.processes = processes self.n_dist = n_dist self.__dist = False self.__cache = [] def file_run(self, file: Union[str, Path]) -> Any: result = file for process in self.processes: result = process.execute(result) return result def run( self, files: List[Union[str, Path]] ) -> Any: result = list(map(self.file_run, files)) if self.__dist is True: self.__cache.append(result) return return result def _divde(self, data: List[Any]): items_per_div = len(data) // self.n_dist divs = [] for i in range(items_per_div): start = i * items_per_div end = (i + 1) * items_per_div if i == (items_per_div - 1): end = len(divs) divs.append(data[start: end]) return divs def dist_run( self, files: List[Union[str, Path]] ) -> Any: self.__dist = True self.__cache = [] divs = self._divde(files) threads = [] for div in divs: t = Thread(target=self.run, args=(div,)) t.start() threads.append(t) for t in threads: t.join() self.__dist = False results = [] for item in self.__cache: results.extend(item) self.__cache = [] return results class TextDistorter(IProcessor): def __init__( self, ratio: float, processes: List[IProcess] ) -> None: super().__init__() self.ratio = ratio self.processes = processes def run(self, line: str) -> str: length = len(line) n = int(self.ratio * length) for _ in range(n): line = random.choice(self.processes).execute(line) return line def dist_run(self): # TODO pass class TextProcessor(IProcessor): def __init__(self, processes: List[IProcess]) -> None: super().__init__() self.processes = processes def run(self, sentence: str): for process in self.processes: sentence = process.execute(sentence) return sentence def dist_run(self, sentence: str) -> str: return self.run(sentence) def get_text_distorter(ratio, sentences: List[str]): return TextDistorter( ratio=ratio, processes=[ SentencePermutation(sentences), RandomCharsInjector(constants.KURDISH_CHARS), RandomCharsSwapper(), RandomCharRemover(), RandomWordsCollapsor(), RandomNeighborReplacer( constants.KEYBOARD_KEYS, constants.KEYBOARD_BLANK ) ] )