File size: 1,470 Bytes
f5eb6b9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import os

# Anchor the process working directory at this file's location so that
# relative paths (e.g. data files next to this script) resolve the same
# way regardless of where the interpreter was launched from.
# NOTE(review): realpath also resolves symlinks — presumably intentional.
current_dir = os.path.dirname(os.path.realpath(__file__))
os.chdir(current_dir)

class PerCharTokenizer:
  """
  Character-level tokenizer for DNA sequences.

  The vocabulary contains the four bases ('A', 'T', 'G', 'C') plus the
  special tokens 'P' (padding), 'M' (mask) and 'U' (unknown), as well as
  newline and space.

  Attributes:
    chars (list[str]): every token in the vocabulary, one character each.
    vocab_size (int): number of tokens in the vocabulary (fixed).
    string_to_index (dict[str, int]): character -> integer id.
    index_to_string (dict[int, str]): integer id -> character.

  Methods:
    encode(): maps each character of a string to its integer id; any
      character not in the vocabulary is mapped to the 'U' (unknown) id,
      so the vocabulary never grows and vocab_size stays valid.
    decode(): maps a sequence of integer ids back to a string; ids
      outside the vocabulary are silently skipped.
  """
  # Index of the 'U' token in `chars` below; used for out-of-vocabulary input.
  UNKNOWN_TOKEN = 'U'

  def __init__(self):
    self.chars = ['\n', 'A', 'T', 'G', 'C', 'P', 'M', 'U', ' ']
    self.vocab_size = len(self.chars)
    self.string_to_index = {ch: i for i, ch in enumerate(self.chars)}
    self.index_to_string = {i: ch for i, ch in enumerate(self.chars)}

  def encode(self, string):
    """Return the list of integer ids for each character of `string`.

    Characters not in the vocabulary are encoded as the 'U' (unknown)
    token, per the class contract — the vocabulary is never mutated.
    """
    unknown_id = self.string_to_index[self.UNKNOWN_TOKEN]
    return [self.string_to_index.get(char, unknown_id) for char in string]

  def decode(self, integer):
    """Return the string for an iterable of integer ids.

    Ids with no vocabulary entry are skipped rather than raising, so a
    partially-corrupt id sequence still decodes to its valid portion.
    """
    return ''.join(
        self.index_to_string[i] for i in integer if i in self.index_to_string
    )