# NOTE: the docstring must stay raw (r"""...""") -- the Windows paths in the
# traceback contain "\U" and "\u", which a normal string literal rejects as
# truncated unicode escapes (SyntaxError at import time).
r"""Load the lmsys/fastchat-t5-3b-v1.0 tokenizer with use_fast=False.

## Error with the default use_fast=True

  File "E:\workspace\common\vocab-zoo\tokenizer-arena\utils\zh_util.py", line 190, in <module>
    print(iter_vocab(tokenizer, name=name))
  File "E:\workspace\common\vocab-zoo\tokenizer-arena\utils\zh_util.py", line 144, in iter_vocab
    dist_length, mean_length = get_coding_length(tokenizer, zh_tokens, filter=lambda k: not is_zh_char(k))
  File "E:\workspace\common\vocab-zoo\tokenizer-arena\utils\zh_util.py", line 34, in get_coding_length
    tokens = tokenizer.encode(word)
  File "C:\Users\xusong\Miniconda3\envs\py3.10-torch1.13-hf.latest\lib\site-packages\transformers\tokenization_utils_base.py", line 2600, in encode
    encoded_inputs = self.encode_plus(
  File "C:\Users\xusong\Miniconda3\envs\py3.10-torch1.13-hf.latest\lib\site-packages\transformers\tokenization_utils_base.py", line 3008, in encode_plus
    return self._encode_plus(
  File "C:\Users\xusong\Miniconda3\envs\py3.10-torch1.13-hf.latest\lib\site-packages\transformers\tokenization_utils_fast.py", line 576, in _encode_plus
    batched_output = self._batch_encode_plus(
  File "C:\Users\xusong\Miniconda3\envs\py3.10-torch1.13-hf.latest\lib\site-packages\transformers\tokenization_utils_fast.py", line 504, in _batch_encode_plus
    encodings = self._tokenizer.encode_batch(
pyo3_runtime.PanicException: AddedVocabulary bad split
"""

from transformers import AutoTokenizer

# use_fast=False is deliberate: with the default fast tokenizer, encode()
# panics with "AddedVocabulary bad split" (see the traceback in the module
# docstring).
tokenizer = AutoTokenizer.from_pretrained(
    "lmsys/fastchat-t5-3b-v1.0",
    trust_remote_code=True,
    use_fast=False,
)