import os

# Run from the directory containing this file so relative paths resolve consistently.
current_dir = os.path.dirname(os.path.realpath(__file__))
os.chdir(current_dir)


class PerCharTokenizer:
    """
    Character-level tokenizer for DNA sequences.

    Attributes:
        chars (list): all bases along with special tokens, each represented as a single character
        vocab_size (int): size of the vocabulary

    Working:
        - the vocabulary contains the bases and ['P', 'M', 'U'] as padding, mask and unknown tokens
        - encode(): iterates over the input one character at a time, looks up each character in the
          vocabulary and appends its index; characters not yet in the vocabulary are added to it
          and assigned a new index
        - decode(): takes a list of integers and returns the corresponding characters from the
          vocabulary, joined into a string; indices outside the vocabulary are skipped
    """

    def __init__(self):
        self.chars = ['\n', 'A', 'T', 'G', 'C', 'P', 'M', 'U', ' ']
        self.vocab_size = len(self.chars)
        self.string_to_index = {ch: i for i, ch in enumerate(self.chars)}
        self.index_to_string = {i: ch for i, ch in enumerate(self.chars)}

    def encode(self, string):
        encoded = []
        for char in string:
            if char in self.string_to_index:
                encoded.append(self.string_to_index[char])
            else:
                # Unseen character: extend the vocabulary and keep vocab_size in sync.
                special_index = len(self.string_to_index)
                self.string_to_index[char] = special_index
                self.index_to_string[special_index] = char
                self.chars.append(char)
                self.vocab_size = len(self.chars)
                encoded.append(special_index)
        return encoded

    def decode(self, integers):
        decoded = []
        for i in integers:
            if i in self.index_to_string:
                decoded.append(self.index_to_string[i])
            # Indices outside the vocabulary are silently skipped.
        return ''.join(decoded)
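

# Illustrative usage sketch (an assumption, not part of the original module): shows a simple
# encode/decode round trip with the tokenizer defined above.
if __name__ == "__main__":
    tokenizer = PerCharTokenizer()
    ids = tokenizer.encode("ATGC\nGT")   # known bases map to their fixed indices
    text = tokenizer.decode(ids)         # round-trips back to the original string
    print(ids)                           # [1, 2, 3, 4, 0, 3, 2]
    print(text)                          # "ATGC\nGT"
    print(tokenizer.vocab_size)          # grows only if encode() sees characters outside the vocab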