|
import os

# Resolve the directory containing this file (realpath follows symlinks).
current_dir = os.path.dirname(os.path.realpath(__file__))

# Anchor the process working directory to this file's location so that
# relative paths used later resolve the same way no matter where the
# interpreter was launched from.
# NOTE(review): os.chdir at import time affects the whole process —
# confirm no other module depends on the original CWD.
os.chdir(current_dir)
|
|
|
class PerCharTokenizer:
    """Character-level tokenizer for DNA sequences.

    The vocabulary is fixed at construction time: newline, the four
    bases ``A``/``T``/``G``/``C``, the special tokens ``P`` (padding),
    ``M`` (mask), ``U`` (unknown), and the space character.

    Attributes:
        chars (list[str]): all vocabulary characters, index-aligned.
        vocab_size (int): number of entries in the vocabulary.
        string_to_index (dict[str, int]): character -> index lookup.
        index_to_string (dict[int, str]): index -> character lookup.

    Working:
        - encode(): maps each character of the input string to its
          integer index in the vocabulary; characters not in the
          vocabulary map to the 'U' (unknown) token.
        - decode(): maps a sequence of integer indices back to a
          string, silently skipping indices outside the vocabulary.
    """

    def __init__(self):
        # Fixed vocabulary: bases plus special tokens (see class docstring).
        self.chars = ['\n', 'A', 'T', 'G', 'C', 'P', 'M', 'U', ' ']
        self.vocab_size = len(self.chars)
        self.string_to_index = {ch: i for i, ch in enumerate(self.chars)}
        self.index_to_string = {i: ch for i, ch in enumerate(self.chars)}

    def encode(self, string):
        """Return the list of vocabulary indices for *string*.

        Unknown characters are mapped to the 'U' (unknown) token
        instead of being added to the vocabulary, so ``vocab_size``
        and the lookup tables always stay consistent with ``chars``.
        (The previous implementation grew the vocab on the fly,
        leaving ``vocab_size`` stale.)
        """
        unknown = self.string_to_index['U']
        return [self.string_to_index.get(ch, unknown) for ch in string]

    def decode(self, integer):
        """Return the string for the iterable of indices *integer*.

        Indices not present in the vocabulary are skipped silently,
        matching the original best-effort behavior.
        """
        lookup = self.index_to_string
        return ''.join(lookup[i] for i in integer if i in lookup)