fix fastchat_t5_3b
Browse files
- tokenizer/tiktoken_patch.py +10 -2
- vocab/fastchat_t5_3b/__init__.py +21 -1
tokenizer/tiktoken_patch.py
CHANGED
@@ -7,11 +7,19 @@ def decode(self, tokens, errors="replace", skip_special_tokens=False):
|
|
7 |
默认的decode,可能会报错,详见 decode_test.py
|
8 |
skip_special_tokens 是为了兼容 hf_tokenizer
|
9 |
|
10 |
-
errors
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
"""
|
12 |
try:
|
13 |
decode_str = self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
|
14 |
-
except:
|
|
|
15 |
decode_str = "null"
|
16 |
return decode_str
|
17 |
|
|
|
7 |
默认的decode,可能会报错,详见 decode_test.py
|
8 |
skip_special_tokens 是为了兼容 hf_tokenizer
|
9 |
|
10 |
+
errors:
|
11 |
+
decoded bytes are not guaranteed to be valid UTF-8.
|
12 |
+
"strict" Raise UnicodeError
|
13 |
+
"ignore" Ignore and continue
|
14 |
+
"replace" Replace with replacement character
|
15 |
+
"backslashreplace" Replace with backslashed escape sequence
|
16 |
+
"xmlcharrefreplace" Replace with XML character reference
|
17 |
+
"namereplace" Replace with \N{...} (named unicode character)
|
18 |
"""
|
19 |
try:
|
20 |
decode_str = self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
|
21 |
+
except Exception as e:
|
22 |
+
logger.error(f"{e} -> return 'null'")
|
23 |
decode_str = "null"
|
24 |
return decode_str
|
25 |
|
vocab/fastchat_t5_3b/__init__.py
CHANGED
@@ -1,3 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from transformers import AutoTokenizer
|
2 |
|
3 |
-
tokenizer = AutoTokenizer.from_pretrained("lmsys/fastchat-t5-3b-v1.0", trust_remote_code=True)
|
|
|
1 |
+
"""
|
2 |
+
|
3 |
+
## 默认 use_fast=True 报错
|
4 |
+
File "E:\workspace\common\vocab-zoo\tokenizer-arena\utils\zh_util.py", line 190, in <module>
|
5 |
+
print(iter_vocab(tokenizer, name=name))
|
6 |
+
File "E:\workspace\common\vocab-zoo\tokenizer-arena\utils\zh_util.py", line 144, in iter_vocab
|
7 |
+
dist_length, mean_length = get_coding_length(tokenizer, zh_tokens, filter=lambda k: not is_zh_char(k))
|
8 |
+
File "E:\workspace\common\vocab-zoo\tokenizer-arena\utils\zh_util.py", line 34, in get_coding_length
|
9 |
+
tokens = tokenizer.encode(word)
|
10 |
+
File "C:\Users\xusong\Miniconda3\envs\py3.10-torch1.13-hf.latest\lib\site-packages\transformers\tokenization_utils_base.py", line 2600, in encode
|
11 |
+
encoded_inputs = self.encode_plus(
|
12 |
+
File "C:\Users\xusong\Miniconda3\envs\py3.10-torch1.13-hf.latest\lib\site-packages\transformers\tokenization_utils_base.py", line 3008, in encode_plus
|
13 |
+
return self._encode_plus(
|
14 |
+
File "C:\Users\xusong\Miniconda3\envs\py3.10-torch1.13-hf.latest\lib\site-packages\transformers\tokenization_utils_fast.py", line 576, in _encode_plus
|
15 |
+
batched_output = self._batch_encode_plus(
|
16 |
+
File "C:\Users\xusong\Miniconda3\envs\py3.10-torch1.13-hf.latest\lib\site-packages\transformers\tokenization_utils_fast.py", line 504, in _batch_encode_plus
|
17 |
+
encodings = self._tokenizer.encode_batch(
|
18 |
+
pyo3_runtime.PanicException: AddedVocabulary bad split
|
19 |
+
"""
|
20 |
+
|
21 |
from transformers import AutoTokenizer
|
22 |
|
23 |
+
tokenizer = AutoTokenizer.from_pretrained("lmsys/fastchat-t5-3b-v1.0", trust_remote_code=True, use_fast=False)
|