add olmo tokenizer
Browse files
- requirements.txt +2 -1
- vocab/__init__.py +24 -15
- vocab/olmo_7b/__init__.py +4 -0
requirements.txt
CHANGED
@@ -5,4 +5,5 @@ icetk
|
|
5 |
torch
|
6 |
zhon
|
7 |
nltk
|
8 |
-
boto3
|
|
|
|
5 |
torch
|
6 |
zhon
|
7 |
nltk
|
8 |
+
boto3
|
9 |
+
ai2-olmo
|
vocab/__init__.py
CHANGED
@@ -17,14 +17,18 @@ tokenizer.implementation = TokenizerImpl.SentencePiece.name # https://github.c
|
|
17 |
|
18 |
- bert
|
19 |
- 特征
|
|
|
20 |
- 示例:
|
21 |
-
- gpt2
|
22 |
-
- 特征:
|
23 |
-
- 词典:
|
24 |
-
|
25 |
- sentencepiece:
|
26 |
-
-
|
27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
- icetk: sentencepiece的分支,支持image_tokenizer
|
29 |
- glm, chatglm1, chatglm2
|
30 |
- tiktoken
|
@@ -32,6 +36,10 @@ tokenizer.implementation = TokenizerImpl.SentencePiece.name # https://github.c
|
|
32 |
- 特征:
|
33 |
- 文件: tokenizer.json(包含后两个文件的内容), merges.txt, vocab.json
|
34 |
- added_tokens 在vocab中不一定存在。
|
|
|
|
|
|
|
|
|
35 |
- .model 是 tokenizer.models.BPE 类型
|
36 |
- 词典有 Ġ "\u0120" 开头
|
37 |
- .model.from_file .model.save .model.token_to_id .model.tokenize
|
@@ -116,7 +124,7 @@ all_tokenizers = [
|
|
116 |
"code_davinci_002",
|
117 |
"gpt_35_turbo",
|
118 |
"gpt_4",
|
119 |
-
|
120 |
# 未分类
|
121 |
"skywork_13b_base",
|
122 |
"skywork_13b_math",
|
@@ -141,20 +149,21 @@ all_tokenizers = [
|
|
141 |
"switch_c_2048",
|
142 |
"byt5_small",
|
143 |
"mt5_large",
|
144 |
-
"wizardcoder_python_7b_v1",
|
145 |
-
"wizardlm_7b_v1",
|
146 |
-
"wizardmath_70b_v1",
|
147 |
-
"tigerbot_70b_chat_v4_4k",
|
148 |
-
"tigerbot_13b_chat_v2",
|
149 |
-
"deepseek_coder_33b_instruct",
|
150 |
-
"deepseek_llm_7b_base",
|
151 |
"gemma_7b",
|
152 |
-
|
153 |
|
154 |
]
|
155 |
|
156 |
all_tokenizers = sorted(all_tokenizers)
|
157 |
|
|
|
158 |
class TokenizerType(Enum):
|
159 |
"""
|
160 |
- https://huggingface.co/docs/transformers/tokenizer_summary
|
|
|
17 |
|
18 |
- bert
|
19 |
- 特征
|
20 |
+
- 词典:有##开头的token,表示subword
|
21 |
- 示例:
|
|
|
|
|
|
|
|
|
22 |
- sentencepiece:
|
23 |
+
- 特征:
|
24 |
+
- 训练:
|
25 |
+
- 文件: *.sp_model 或 *.model (可选文件 .vocab)
|
26 |
+
- 实现:
|
27 |
+
- 训练: `import sentencepiece as spm; spm.SentencePieceTrainer.train` 或 `spm_train`
|
28 |
+
- 加载: `import sentencepiece as spm; spm.SentencePieceProcessor().Load(vocab_file)`
|
29 |
+
- 方法: 是 SentencePieceProcessor 类型,sp_model.id_to_piece;有 tokenizer.json、tokenizer.model
|
30 |
+
- 词典: 词典字符有 ▁ (U+2581) ,表示空格或句首。
|
31 |
+
- 示例: llama, baichuan, orion
|
32 |
- icetk: sentencepiece的分支,支持image_tokenizer
|
33 |
- glm, chatglm1, chatglm2
|
34 |
- tiktoken
|
|
|
36 |
- 特征:
|
37 |
- 文件: tokenizer.json(包含后两个文件的内容), merges.txt, vocab.json
|
38 |
- added_tokens 在vocab中不一定存在。
|
39 |
+
- 实现:
|
40 |
+
- 训练:
|
41 |
+
- 加载:
|
42 |
+
- 方法:
|
43 |
- .model 是 tokenizer.models.BPE 类型
|
44 |
- 词典有 Ġ "\u0120" 开头
|
45 |
- .model.from_file .model.save .model.token_to_id .model.tokenize
|
|
|
124 |
"code_davinci_002",
|
125 |
"gpt_35_turbo",
|
126 |
"gpt_4",
|
127 |
+
|
128 |
# 未分类
|
129 |
"skywork_13b_base",
|
130 |
"skywork_13b_math",
|
|
|
149 |
"switch_c_2048",
|
150 |
"byt5_small",
|
151 |
"mt5_large",
|
152 |
+
"wizardcoder_python_7b_v1",
|
153 |
+
"wizardlm_7b_v1",
|
154 |
+
"wizardmath_70b_v1",
|
155 |
+
"tigerbot_70b_chat_v4_4k",
|
156 |
+
"tigerbot_13b_chat_v2",
|
157 |
+
"deepseek_coder_33b_instruct",
|
158 |
+
"deepseek_llm_7b_base",
|
159 |
"gemma_7b",
|
160 |
+
"olmo_7b",
|
161 |
|
162 |
]
|
163 |
|
164 |
all_tokenizers = sorted(all_tokenizers)
|
165 |
|
166 |
+
|
167 |
class TokenizerType(Enum):
|
168 |
"""
|
169 |
- https://huggingface.co/docs/transformers/tokenizer_summary
|
vocab/olmo_7b/__init__.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
from transformers import AutoTokenizer
|
3 |
+
|
4 |
+
tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-7B")
|