from transformers import BertTokenizerFast
from transformers.models.bert_japanese.tokenization_bert_japanese import MecabTokenizer


class MecabPreTokenizer(MecabTokenizer):
    def mecab_split(self, i, normalized_string):
        # MeCab may normalize or drop characters, so locate each token in the
        # original text with str.find() and collect (start, end) character
        # spans; tokens that cannot be located are recorded as (0, 0) and
        # filtered out below.
        text = str(normalized_string)
        end = 0
        spans = []
        for token in self.tokenize(text):
            start = text.find(token, end)
            end = end if start < 0 else start + len(token)
            spans.append((0, 0) if start < 0 else (start, end))
        # Slice the NormalizedString (not the plain str) so the offset
        # information needed for alignment is preserved.
        return [normalized_string[s:e] for s, e in spans if e > 0]

    def pre_tokenize(self, pretok):
        pretok.split(self.mecab_split)
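
# A quick sketch of what MecabPreTokenizer produces, assuming fugashi and
# ipadic are installed (the sentence and offsets below are illustrative):
#
#   from tokenizers import pre_tokenizers
#   pt = pre_tokenizers.PreTokenizer.custom(MecabPreTokenizer(mecab_dic="ipadic"))
#   pt.pre_tokenize_str("吾輩は猫である")
#   # [('吾輩', (0, 2)), ('は', (2, 3)), ('猫', (3, 4)), ('で', (4, 5)), ('ある', (5, 7))]
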
class BertMecabTokenizerFast(BertTokenizerFast):
    def __init__(self, vocab_file, do_lower_case=False, tokenize_chinese_chars=False, **kwargs):
        from tokenizers import normalizers, pre_tokenizers
        super().__init__(vocab_file=vocab_file, do_lower_case=do_lower_case,
                         tokenize_chinese_chars=tokenize_chinese_chars, **kwargs)
        mecab_kwargs = kwargs.get("mecab_kwargs", {"mecab_dic": "ipadic"})
        self._tokenizer.normalizer = normalizers.Sequence([normalizers.Nmt(), normalizers.NFKC()])
        # Run MeCab word segmentation first, then the standard BERT
        # pre-tokenizer (whitespace/punctuation splitting) on each word.
        self.custom_pre_tokenizer = pre_tokenizers.Sequence([
            pre_tokenizers.PreTokenizer.custom(MecabPreTokenizer(**mecab_kwargs)),
            pre_tokenizers.BertPreTokenizer(),
        ])
        self._tokenizer.pre_tokenizer = self.custom_pre_tokenizer

    def save_pretrained(self, save_directory, **kwargs):
        import os
        import shutil
        from tokenizers.pre_tokenizers import Metaspace
        # Register this class so AutoTokenizer can load it from mecab.py
        # with trust_remote_code=True.
        self._auto_map = {"AutoTokenizer": [None, "mecab.BertMecabTokenizerFast"]}
        # A custom pre-tokenizer cannot be serialized into tokenizer.json,
        # so swap in a serializable placeholder while saving, then restore it.
        self._tokenizer.pre_tokenizer = Metaspace()
        super().save_pretrained(save_directory, **kwargs)
        self._tokenizer.pre_tokenizer = self.custom_pre_tokenizer
        # Ship this module next to the saved files so it can be re-imported.
        shutil.copy(os.path.abspath(__file__), os.path.join(save_directory, "mecab.py"))
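

if __name__ == "__main__":
    # A minimal end-to-end sketch, assuming fugashi and ipadic are installed:
    # build a toy WordPiece vocab (hypothetical, for illustration only),
    # tokenize a sentence, then round-trip through save_pretrained() and
    # AutoTokenizer with trust_remote_code=True.
    import os
    import tempfile
    from transformers import AutoTokenizer
    with tempfile.TemporaryDirectory() as work:
        vocab_file = os.path.join(work, "vocab.txt")
        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write("\n".join(["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]",
                               "吾輩", "は", "猫", "で", "ある"]))
        tokenizer = BertMecabTokenizerFast(vocab_file=vocab_file)
        print(tokenizer.tokenize("吾輩は猫である"))  # expected: ['吾輩', 'は', '猫', 'で', 'ある']
        save_directory = os.path.join(work, "saved")
        tokenizer.save_pretrained(save_directory)
        reloaded = AutoTokenizer.from_pretrained(save_directory, trust_remote_code=True)
        print(reloaded.tokenize("吾輩は猫である"))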