add more tokenizers
Browse files
- vocab/README.md +2 -0
- vocab/__init__.py +13 -2
- vocab/chatglm_6b/__init__.py +8 -7
- vocab/code_davinci_002/__init__.py +3 -0
- vocab/deepseek_coder_33b_instruct/__init__.py +7 -0
- vocab/deepseek_llm_7b_base/__init__.py +5 -0
- vocab/gpt_35_turbo/__init__.py +0 -2
- vocab/text_davinci_003/__init__.py +70 -0
- vocab/tigerbot_13b_chat_v2/__init__.py +5 -0
- vocab/tigerbot_70b_chat_v4_4k/__init__.py +5 -0
- vocab/wizardcoder_15b_v1/__init__.py +4 -0
- vocab/wizardcoder_python_7b_v1/__init__.py +4 -0
- vocab/wizardlm_7b_v1/__init__.py +4 -0
- vocab/wizardmath_70b_v1/__init__.py +4 -0
vocab/README.md
CHANGED
@@ -1,4 +1,6 @@
|
|
1 |
|
|
|
|
|
2 |
对于OpenAI的模型而言,英文的Token效率是中文的8-12倍,
|
3 |
之前三百字中文以上时Turbo 3.5 16k就会出现逻辑颠倒问题,提示词换成英文后该问题没有出现过。
|
4 |
|
|
|
1 |
|
2 |
+
https://arxiv.org/abs/2308.16692 SpeechTokenizer
|
3 |
+
|
4 |
对于OpenAI的模型而言,英文的Token效率是中文的8-12倍,
|
5 |
之前三百字中文以上时Turbo 3.5 16k就会出现逻辑颠倒问题,提示词换成英文后该问题没有出现过。
|
6 |
|
vocab/__init__.py
CHANGED
@@ -55,8 +55,6 @@ uniq_tokenizers = [
|
|
55 |
all_tokenizers = [
|
56 |
"gpt2",
|
57 |
"gpt2_chinese",
|
58 |
-
"gpt_35_turbo",
|
59 |
-
"gpt_4",
|
60 |
|
61 |
# bert 系列
|
62 |
"bert_base_cased",
|
@@ -105,6 +103,10 @@ all_tokenizers = [
|
|
105 |
"qwen_1_8b_chat",
|
106 |
"qwen_7b_chat",
|
107 |
"qwen_72b_chat",
|
|
|
|
|
|
|
|
|
108 |
|
109 |
# 未分类
|
110 |
"skywork_13b_base",
|
@@ -116,6 +118,15 @@ all_tokenizers = [
|
|
116 |
"flan_t5_base",
|
117 |
"fastchat_t5_3b",
|
118 |
"pko_t5_large",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
|
120 |
|
121 |
]
|
|
|
55 |
all_tokenizers = [
|
56 |
"gpt2",
|
57 |
"gpt2_chinese",
|
|
|
|
|
58 |
|
59 |
# bert 系列
|
60 |
"bert_base_cased",
|
|
|
103 |
"qwen_1_8b_chat",
|
104 |
"qwen_7b_chat",
|
105 |
"qwen_72b_chat",
|
106 |
+
"text_davinci_003",
|
107 |
+
"code_davinci_002",
|
108 |
+
"gpt_35_turbo",
|
109 |
+
"gpt_4",
|
110 |
|
111 |
# 未分类
|
112 |
"skywork_13b_base",
|
|
|
118 |
"flan_t5_base",
|
119 |
"fastchat_t5_3b",
|
120 |
"pko_t5_large",
|
121 |
+
"wizardcoder_15b_v1",
|
122 |
+
"wizardcoder_python_7b_v1",
|
123 |
+
"wizardlm_7b_v1",
|
124 |
+
"wizardmath_70b_v1",
|
125 |
+
"tigerbot_70b_chat_v4_4k",
|
126 |
+
"tigerbot_13b_chat_v2",
|
127 |
+
"deepseek_coder_33b_instruct",
|
128 |
+
"deepseek_llm_7b_base",
|
129 |
+
|
130 |
|
131 |
|
132 |
]
|
vocab/chatglm_6b/__init__.py
CHANGED
@@ -6,15 +6,16 @@ import os
|
|
6 |
import config
|
7 |
from transformers import AutoTokenizer
|
8 |
|
9 |
-
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
|
10 |
|
11 |
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
|
|
|
|
18 |
|
19 |
# https://huggingface.co/THUDM/chatglm-6b/blob/main/tokenization_chatglm.py#L153
|
20 |
tokenizer.comments = f"num_image_tokens: {tokenizer.sp_tokenizer.num_image_tokens}; num_image_tokens: {tokenizer.sp_tokenizer.num_text_tokens} "
|
|
|
6 |
import config
|
7 |
from transformers import AutoTokenizer
|
8 |
|
|
|
9 |
|
10 |
|
11 |
+
|
12 |
+
# if config.USE_REMOTE:
|
13 |
+
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
|
14 |
+
# else:
|
15 |
+
# os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
|
16 |
+
# CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
|
17 |
+
# TOKENIZER_DIR = os.path.join(CURRENT_DIR, "chatglm_6b")
|
18 |
+
# tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)
|
19 |
|
20 |
# https://huggingface.co/THUDM/chatglm-6b/blob/main/tokenization_chatglm.py#L153
|
21 |
# Summarise the image/text token counts exposed by the ChatGLM sentencepiece wrapper.
# The second label previously repeated "num_image_tokens" although it prints num_text_tokens.
tokenizer.comments = f"num_image_tokens: {tokenizer.sp_tokenizer.num_image_tokens}; num_text_tokens: {tokenizer.sp_tokenizer.num_text_tokens} "
|
vocab/code_davinci_002/__init__.py
CHANGED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
|
3 |
+
from vocab.text_davinci_003 import tokenizer
|
vocab/deepseek_coder_33b_instruct/__init__.py
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
https://huggingface.co/spaces/deepseek-ai/deepseek-coder-7b-instruct
|
3 |
+
"""
|
4 |
+
|
5 |
+
from transformers import AutoTokenizer
|
6 |
+
|
7 |
+
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-coder-33b-instruct", trust_remote_code=True)
|
vocab/deepseek_llm_7b_base/__init__.py
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
|
3 |
+
from transformers import AutoTokenizer
|
4 |
+
|
5 |
+
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-llm-7b-base", trust_remote_code=True)
|
vocab/gpt_35_turbo/__init__.py
CHANGED
@@ -42,8 +42,6 @@ def get_vocab(self, token_type="str"):
|
|
42 |
key_error_list = []
|
43 |
unicode_decode_error_list = []
|
44 |
for i in range(self.vocab_size):
|
45 |
-
if i == 100256:
|
46 |
-
print(i)
|
47 |
try:
|
48 |
token_byte = self.convert_ids_to_tokens([i])[0]
|
49 |
if token_byte is None:
|
|
|
42 |
key_error_list = []
|
43 |
unicode_decode_error_list = []
|
44 |
for i in range(self.vocab_size):
|
|
|
|
|
45 |
try:
|
46 |
token_byte = self.convert_ids_to_tokens([i])[0]
|
47 |
if token_byte is None:
|
vocab/text_davinci_003/__init__.py
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
TODO
|
3 |
+
"""
|
4 |
+
|
5 |
+
import tiktoken
|
6 |
+
from tiktoken import Encoding
|
7 |
+
from utils.log_util import logger
|
8 |
+
|
9 |
+
tokenizer = tiktoken.encoding_for_model('text-davinci-003')
|
10 |
+
tokenizer.vocab_size = tokenizer.n_vocab
|
11 |
+
|
12 |
+
tokenizer.comments = ""
|
13 |
+
tokenizer.reversible = True
|
14 |
+
|
15 |
+
|
16 |
+
|
17 |
+
|
18 |
+
def decode(self, tokens, errors="replace", skip_special_tokens=False):
    """Decode token ids to text without raising on undecodable input.

    Replacement for the stock ``Encoding.decode``, which may raise for some
    ids (see decode_test.py).

    :param tokens: sequence of token ids to decode.
    :param errors: error policy passed to ``bytes.decode`` for invalid UTF-8.
    :param skip_special_tokens: accepted only for signature compatibility
        with Hugging Face tokenizers; it is not used here.
    :return: the decoded string, or the literal string ``"null"`` when the
        underlying BPE decoder fails.
    """
    try:
        return self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
    except (KeyboardInterrupt, SystemExit):
        # Never hide a user interrupt or interpreter shutdown.
        raise
    except BaseException:
        # Deliberately broad: the native decoder may fail with errors that do
        # not derive from Exception.
        # NOTE(review): presumably pyo3 panics on unknown ids in older
        # tiktoken releases -- confirm against the pinned version.
        return "null"
|
28 |
+
|
29 |
+
def convert_ids_to_tokens(self, tokens, skip_special_tokens=False):
    """Map token ids to their raw token values, Hugging Face style.

    ``tiktoken.Encoding`` has no method of this name, so this function is
    patched onto the class. Because the patch applies to every ``Encoding``
    instance, decoding goes through ``self`` rather than a module-level
    tokenizer; otherwise other encodings would be decoded with the wrong
    vocabulary.

    :param tokens: sequence of token ids.
    :param skip_special_tokens: accepted only for signature compatibility
        with Hugging Face tokenizers; it is not used here.
    :return: the result of ``self.decode_tokens_bytes(tokens)``, or a list
        of ``None`` with the same length as ``tokens`` when decoding fails.
    """
    try:
        return self.decode_tokens_bytes(tokens)
    except (KeyboardInterrupt, SystemExit):
        # Never hide a user interrupt or interpreter shutdown.
        raise
    except BaseException:
        # Why return None? See zh_util.py.
        # Some ids are unassigned (the original note lists 16 free ids:
        # 100256 and 100261-100275); callers skip None entries.
        return [None for _ in tokens]
|
39 |
+
|
40 |
+
def get_vocab(self, token_type="str"):
    """Return the vocabulary as a dict mapping token value to token id.

    Iterates over every id in ``range(self.vocab_size)`` and asks
    ``self.convert_ids_to_tokens`` for its token; ids that come back as
    ``None`` are skipped.

    :param token_type: one of ["str", "byte"]. Accepted for interface
        compatibility but currently unused: keys are whatever
        ``convert_ids_to_tokens`` returns (presumably ``bytes`` -- verify).
    :return: dict of ``{token: id}``.
    """
    vocab = {}
    # Nothing in this function populates these two lists; they are kept so
    # the log lines below stay unchanged.
    key_error_list = []
    unicode_decode_error_list = []

    for token_id in range(self.vocab_size):
        token_byte = self.convert_ids_to_tokens([token_id])[0]
        if token_byte is None:
            continue
        # No UTF-8 decoding happens here, so the former
        # ``except UnicodeDecodeError`` branch could never run and was removed.
        vocab[token_byte] = token_id

    # vocab.update(self.added_tokens_encoder)
    logger.info(f"text-davinci-003 {len(key_error_list)} KeyError: {key_error_list}")
    logger.info(f"text-davinci-003 {len(unicode_decode_error_list)} UnicodeDecodeError: {unicode_decode_error_list[:5]}")
    return vocab
|
64 |
+
|
65 |
+
|
66 |
+
|
67 |
+
# tiktoken patch
|
68 |
+
Encoding.decode = decode
|
69 |
+
Encoding.convert_ids_to_tokens = convert_ids_to_tokens
|
70 |
+
Encoding.get_vocab = get_vocab
|
vocab/tigerbot_13b_chat_v2/__init__.py
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
|
3 |
+
from transformers import AutoTokenizer
|
4 |
+
|
5 |
+
tokenizer = AutoTokenizer.from_pretrained("TigerResearch/tigerbot-13b-chat-v2", trust_remote_code=True)
|
vocab/tigerbot_70b_chat_v4_4k/__init__.py
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
|
3 |
+
from transformers import AutoTokenizer
|
4 |
+
|
5 |
+
tokenizer = AutoTokenizer.from_pretrained("TigerResearch/tigerbot-70b-chat-v4-4k", trust_remote_code=True)
|
vocab/wizardcoder_15b_v1/__init__.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
from transformers import AutoTokenizer
|
3 |
+
|
4 |
+
tokenizer = AutoTokenizer.from_pretrained("WizardLM/WizardCoder-15B-V1.0", trust_remote_code=True)
|
vocab/wizardcoder_python_7b_v1/__init__.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
from transformers import AutoTokenizer
|
3 |
+
|
4 |
+
tokenizer = AutoTokenizer.from_pretrained("WizardLM/WizardCoder-Python-7B-V1.0", trust_remote_code=True)
|
vocab/wizardlm_7b_v1/__init__.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
from transformers import AutoTokenizer
|
3 |
+
|
4 |
+
tokenizer = AutoTokenizer.from_pretrained("WizardLM/WizardLM-7B-V1.0", trust_remote_code=True)
|
vocab/wizardmath_70b_v1/__init__.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
from transformers import AutoTokenizer
|
3 |
+
|
4 |
+
tokenizer = AutoTokenizer.from_pretrained("WizardLM/WizardMath-70B-V1.0", trust_remote_code=True)
|