vietnamese_bi_encoder / custom_tokenizer.py
nhatminh's picture
Upload 16 files
0b6001d verified
raw
history blame contribute delete
325 Bytes
from transformers import PhobertTokenizer
from pyvi import ViTokenizer
class CustomPhobertTokenizer(PhobertTokenizer):
    """PhoBERT tokenizer that pre-segments raw text before tokenizing.

    Every string handed to ``_tokenize`` is first passed through
    ``rdr_segment`` and only then given to the parent class's
    ``_tokenize``, so callers can supply unsegmented text.
    """

    def rdr_segment(self, text):
        """Return *text* after running it through ``ViTokenizer.tokenize``.

        NOTE(review): the method name suggests RDRSegmenter, but the
        segmentation here is delegated to pyvi's ``ViTokenizer`` --
        confirm the name is kept only for compatibility.
        """
        segmented = ViTokenizer.tokenize(text)
        return segmented

    def _tokenize(self, text):
        """Segment *text* with ``rdr_segment``, then apply the parent ``_tokenize``.

        Segmentation goes through ``self.rdr_segment`` (rather than calling
        pyvi directly) so a subclass can swap in a different segmenter.
        """
        return super()._tokenize(self.rdr_segment(text))