enigma-1.5b / tokenizer /perChar.py
shivendrra's picture
added tokenizer files
f5eb6b9 verified
raw
history blame
1.47 kB
import os
# Resolve the directory that contains this file (os.path.realpath follows
# symlinks) and make it the working directory of the whole process.
# This runs at import time, so importing this module changes the cwd for
# every other module in the process as well.
# NOTE(review): presumably done so that relative paths to sibling tokenizer
# files resolve regardless of where the program was launched - confirm.
current_dir = os.path.dirname(os.path.realpath(__file__))
os.chdir(current_dir)
class PerCharTokenizer:
    """Character-level tokenizer that maps each character to an integer id.

    Args:
        chars (list, optional): the initial alphabet, one single-character
            string per entry; the id of a character is its position in this
            list. Defaults to newline, the bases 'A', 'T', 'G', 'C', the
            special tokens 'P', 'M', 'U' (padding, mask and unknown) and a
            space.

    Attributes:
        chars (list): the initial alphabet as passed to the constructor.
        vocab_size (int): number of distinct characters currently known to
            the tokenizer. Kept in sync with the lookup tables, so it grows
            when ``encode()`` meets a character it has not seen before.
        string_to_index (dict): character -> id lookup table.
        index_to_string (dict): id -> character lookup table.

    Working:
        - encode(): walks the input one character at a time and returns the
          id of each character. A character that is not in the vocabulary
          yet is appended to it and receives the next free id (it is NOT
          mapped to the 'U' token).
        - decode(): takes an iterable of ids and returns the concatenation of
          the matching characters; ids that are not in the vocabulary are
          skipped.

    Raises:
        ValueError: if ``chars`` contains the same character more than once.
    """

    def __init__(self, chars=None):
        super().__init__()
        if chars is None:
            chars = ['\n', 'A', 'T', 'G', 'C', 'P', 'M', 'U', ' ']
        # Copy so later changes to the caller's list cannot affect us.
        self.chars = list(chars)
        if len(set(self.chars)) != len(self.chars):
            # Duplicates would make the two lookup tables disagree and let
            # encode() hand out an id that is already taken.
            raise ValueError("chars must not contain duplicate entries")
        self.vocab_size = len(self.chars)
        self.string_to_index = {ch: i for i, ch in enumerate(self.chars)}
        self.index_to_string = {i: ch for i, ch in enumerate(self.chars)}

    def encode(self, string: str) -> list:
        """Return the list of integer ids for the characters of ``string``.

        Characters that are not part of the vocabulary are added to it on the
        fly; ``vocab_size`` is updated so every returned id stays below it.
        """
        encoded = []
        for char in string:
            index = self.string_to_index.get(char)
            if index is None:
                # First time this character is seen: give it the next free id
                # and register it in both directions.
                index = len(self.string_to_index)
                self.string_to_index[char] = index
                self.index_to_string[index] = char
                self.vocab_size = len(self.string_to_index)
            encoded.append(index)
        return encoded

    def decode(self, integer) -> str:
        """Return the string spelled by the ids in the iterable ``integer``.

        Ids that are not in the vocabulary are silently dropped.
        """
        lookup = self.index_to_string
        return ''.join(lookup[i] for i in integer if i in lookup)