xu-song committed on
Commit
a6c67ec
·
1 Parent(s): 7011963

fix tiktoken

Browse files
Files changed (38) hide show
  1. examples.py +22 -5
  2. tokenizer/tiktoken_patch.py +69 -0
  3. util.py +1 -0
  4. vocab/__init__.py +1 -1
  5. vocab/gpt_35_turbo/__init__.py +2 -69
  6. vocab/gpt_neox_chinese_v1/20B_tokenizer.tmp.json +0 -0
  7. vocab/gpt_neox_chinese_v1/20B_tokenizer_chinese.json +0 -0
  8. vocab/gpt_neox_chinese_v1/20B_tokenizer_chinese.mock.json +0 -0
  9. vocab/gpt_neox_chinese_v1/README.md +0 -64
  10. vocab/gpt_neox_chinese_v1/__init__.py +0 -14
  11. vocab/gpt_neox_chinese_v1/build_tokenizer_chinese.py +0 -61
  12. vocab/gpt_neox_chinese_v1/build_tokenizer_chinese_2.py +0 -50
  13. vocab/gpt_neox_chinese_v1/mock.py +0 -32
  14. vocab/gpt_neox_chinese_v1/test_tokenizer.py +0 -43
  15. vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.1.append.json +0 -0
  16. vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.1.insert.json +0 -0
  17. vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.1.json +0 -0
  18. vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.2.json +0 -0
  19. vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.tmp.json +0 -0
  20. vocab/gpt_neox_chinese_v1/to_v2/README.md +0 -3
  21. vocab/gpt_neox_chinese_v1/to_v2/add_token_utils.py +0 -185
  22. vocab/gpt_neox_chinese_v1/to_v2/get_unused_id.py +0 -205
  23. vocab/gpt_neox_chinese_v1/to_v2/oov.add.txt +0 -0
  24. vocab/gpt_neox_chinese_v1/to_v2/oov.txt +0 -0
  25. vocab/gpt_neox_chinese_v1/to_v2/sort_test.py +0 -18
  26. vocab/gpt_neox_chinese_v1/to_v2/test2.py +0 -42
  27. vocab/gpt_neox_chinese_v1/to_v2/test_oov.py +0 -69
  28. vocab/gpt_neox_chinese_v1/to_v2/test_queue.py +0 -20
  29. vocab/gpt_neox_chinese_v1/to_v2/word_count.corpus.remove.jsonl +0 -0
  30. vocab/gpt_neox_chinese_v1/to_v2/word_count.corpus.sort_by_count.jsonl +0 -0
  31. vocab/gpt_neox_chinese_v1/to_v2/word_count.corpus.txt +0 -0
  32. vocab/gpt_neox_chinese_v1/tokenizer/__init__.py +0 -16
  33. vocab/gpt_neox_chinese_v1/tokenizer/gpt2_tokenization.py +0 -368
  34. vocab/gpt_neox_chinese_v1/tokenizer/tokenizer.py +0 -402
  35. vocab/gpt_neox_chinese_v1/tokenizer/train_tokenizer.py +0 -126
  36. vocab/gpt_neox_chinese_v1/trouble-shooting.md +0 -22
  37. vocab/moss/__init__.py +1 -1
  38. vocab/text_davinci_003/__init__.py +14 -59
examples.py CHANGED
@@ -1,12 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  examples = {
2
  "en": [
3
- ["spaces: 2spaces 8spaces\t1tab\t\t2tab\n1newline", "llama", "chatglm2_6b"], # chatglm 有blank_n,
 
4
  # !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
5
  ["punctuation: ,.:/?+=\",。!?;【】〔〕〖〗", "baichuan", "llama"],
6
  ["symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],
7
- ["number: (10086 + 98) = 100184", "baichuan", "llama"]
8
- ]
9
- ,
10
  "zh": [
11
  ["空格测试: 2个空格 8个空格", "llama", "chatglm2_6b"], # chatglm 有blank_n,
12
  ["标点测试:,。!?;", "baichuan_7b", "llama"],
@@ -14,7 +32,6 @@ examples = {
14
  ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
15
  ["中文简体:宽带,繁体:樂來", "baichuan_7b", "llama"],
16
  ]
17
-
18
  }
19
 
20
  more_examples = [
 
1
+ """
2
+
3
+ ## characters
4
+
5
+ - alphanumeric characters
6
+ - numeric characters
7
+ - special characters: A special character is a character that is not an alphabetic or numeric character.
8
+ - ASCII control characters
9
+ - punctuation marks
10
+ - accent marks
11
+ - 数学符号
12
+ - whitespace:
13
+ - https://en.wikipedia.org/wiki/Whitespace_character
14
+ - https://emptycharacter.com/
15
+
16
+
17
+ https://www.computerhope.com/jargon/s/specchar.htm
18
+ """
19
+
20
  examples = {
21
  "en": [
22
+ ["number: (10086 + 98) = 100184", "llama", "bloom"],
23
+ ["whitespace: 2spaces 8spaces\t1tab\t\t2tab\n1newline", "llama", "chatglm2_6b"], # chatglm 有blank_n,
24
  # !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
25
  ["punctuation: ,.:/?+=\",。!?;【】〔〕〖〗", "baichuan", "llama"],
26
  ["symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],
27
+ ],
 
 
28
  "zh": [
29
  ["空格测试: 2个空格 8个空格", "llama", "chatglm2_6b"], # chatglm 有blank_n,
30
  ["标点测试:,。!?;", "baichuan_7b", "llama"],
 
32
  ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
33
  ["中文简体:宽带,繁体:樂來", "baichuan_7b", "llama"],
34
  ]
 
35
  }
36
 
37
  more_examples = [
tokenizer/tiktoken_patch.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from tiktoken import Encoding
3
+ from utils.log_util import logger
4
+
5
def decode(self, tokens, errors="replace", skip_special_tokens=False):
    """Decode token ids into text without raising on a decode failure.

    The raw bytes come from ``self._core_bpe.decode_bytes`` and are then
    decoded as UTF-8. The stock tiktoken ``decode`` may raise (see
    decode_test.py); this replacement is best-effort instead.

    :param tokens: token ids to decode.
    :param errors: error handler passed to ``bytes.decode``; defaults to
        ``"replace"``.
    :param skip_special_tokens: accepted only for call compatibility with
        hf_tokenizer; it is not used here.
    :return: the decoded string, or the literal string ``"null"`` when
        decoding fails.
    """
    try:
        return self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
    except (KeyboardInterrupt, SystemExit):
        # An interrupt or interpreter shutdown must never be reported as a
        # "null" decode result.
        raise
    except BaseException:
        # Intentionally broader than ``Exception``.
        # NOTE(review): some tiktoken builds presumably report unknown ids as
        # a PyO3 PanicException, which derives from BaseException - confirm
        # before narrowing this handler.
        return "null"
15
+
16
+
17
def convert_ids_to_tokens(self, tokens, skip_special_tokens=False):
    """Map token ids to their token bytes (hf_tokenizer-style helper).

    tiktoken's ``Encoding`` does not provide a method with this name, so it
    is added here on top of ``decode_tokens_bytes``.

    :param tokens: token ids to convert.
    :param skip_special_tokens: accepted for call compatibility; not used.
    :return: the result of ``self.decode_tokens_bytes(tokens)``, or a list
        holding one ``None`` per input id when that call raises.
    """
    try:
        token_bytes = self.decode_tokens_bytes(tokens)
    except Exception as err:
        # Why return None? See zh_util.py.
        # There are 16 unused ids: 100256 and 100261-100275.
        logger.error(err)
        token_bytes = [None for _ in tokens]
    return token_bytes
28
+
29
+
30
def get_vocab(self, token_type="str"):
    """Return the vocabulary as a dict mapping token bytes to token id.

    Every id in ``range(self.vocab_size)`` is looked up through
    ``self.convert_ids_to_tokens``. Ids for which that lookup yields ``None``
    are left out of the result and reported in the log as KeyErrors. Tokens
    whose bytes are not valid UTF-8 are still included; they are only counted
    and sampled in the log.

    :param token_type: one of ["str", "byte"]. Accepted for interface
        compatibility but currently unused: keys are always the values
        returned by ``convert_ids_to_tokens``.
    :return: dict of ``{token_bytes: token_id}``.
    """
    vocab = {}
    key_error_list = []
    unicode_decode_error_list = []
    for i in range(self.vocab_size):
        token_byte = self.convert_ids_to_tokens([i])[0]
        if token_byte is None:
            # No token for this id; record it so the KeyError count that is
            # logged below reflects what was actually skipped.
            key_error_list.append(i)
            continue
        try:
            # Probe only: the decoded text is discarded, the key stays bytes.
            token_byte.decode("utf-8")
        except UnicodeDecodeError:
            unicode_decode_error_list.append((i, str(token_byte)))
        vocab[token_byte] = i

    # vocab.update(self.added_tokens_encoder)
    logger.info(f"{self.name} {len(key_error_list)} KeyError: {key_error_list}")
    logger.info(f"{self.name} {len(unicode_decode_error_list)} UnicodeDecodeError: {unicode_decode_error_list[:5]}")
    return vocab
54
+
55
+
56
def encode(self, *args, **kwargs):
    """Encode via the saved original ``_encode``, ignoring ``add_special_tokens``.

    The ``add_special_tokens`` keyword is accepted only for call
    compatibility with hf_tokenizer and is dropped before delegating; all
    other positional and keyword arguments are forwarded unchanged.

    :return: whatever ``self._encode`` returns.
    """
    forwarded = {
        key: value for key, value in kwargs.items() if key != "add_special_tokens"
    }
    return self._encode(*args, **forwarded)
62
+
63
+
64
# tiktoken patch
# Save tiktoken's own ``encode`` only once. If this module body runs a second
# time (importlib.reload, or the file imported under two module names),
# ``Encoding.encode`` is already the wrapper defined above; saving it as
# ``_encode`` would make that wrapper call itself until the recursion limit.
# NOTE(review): assumes tiktoken's Encoding has no ``_encode`` attribute of its
# own - confirm against the installed tiktoken version.
if not hasattr(Encoding, "_encode"):
    Encoding._encode = Encoding.encode
Encoding.encode = encode
Encoding.decode = decode
Encoding.convert_ids_to_tokens = convert_ids_to_tokens
Encoding.get_vocab = get_vocab
util.py CHANGED
@@ -52,6 +52,7 @@ def tokenize(text, tokenizer_type, color_num=5):
52
  # continue
53
 
54
  # ⭐
 
55
  table.append(
56
  {"TokenID": token_id,
57
  "Token": token_str, # utf-8解码后的字符串,为什么有些是 <0xE7>,表示什么?比如llama
 
52
  # continue
53
 
54
  # ⭐
55
+ # TODO: gpt3.5_turbo错误: 只有id和text是对的,token和 utf8都是错的。说明 convert_ids_to_tokens 出错了。
56
  table.append(
57
  {"TokenID": token_id,
58
  "Token": token_str, # utf-8解码后的字符串,为什么有些是 <0xE7>,表示什么?比如llama
vocab/__init__.py CHANGED
@@ -85,7 +85,7 @@ all_tokenizers = [
85
  # "gpt_neox_chinese_v1",
86
  #
87
  # ##### glm系列
88
- "glm_chinese",
89
  "chatglm_6b",
90
  "chatglm2_6b",
91
  "chatglm3_6b",
 
85
  # "gpt_neox_chinese_v1",
86
  #
87
  # ##### glm系列
88
+ # "glm_chinese",
89
  "chatglm_6b",
90
  "chatglm2_6b",
91
  "chatglm3_6b",
vocab/gpt_35_turbo/__init__.py CHANGED
@@ -1,10 +1,9 @@
1
  """
2
- ,请
3
  """
4
 
5
  import tiktoken
6
- from tiktoken import Encoding
7
- from utils.log_util import logger
8
 
9
  tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
10
  tokenizer.vocab_size = tokenizer.n_vocab
@@ -12,69 +11,3 @@ tokenizer.vocab_size = tokenizer.n_vocab
12
  tokenizer.comments = "tiktoken is a fast BPE tokeniser for use with OpenAI's models. There are 16 tokens KeyError"
13
  tokenizer.reversible = True # It's reversible and lossless, so you can convert tokens back into the original text
14
 
15
-
16
- def decode(self, tokens, errors="replace", skip_special_tokens=False):
17
- """
18
- 默认的decode,可能会报错,详见 decode_test.py
19
- skip_special_tokens 是为了兼容 hf_tokenizer
20
- """
21
- try:
22
- decode_str = self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
23
- except:
24
- decode_str = "null"
25
- return decode_str
26
-
27
-
28
- def convert_ids_to_tokens(self, tokens, skip_special_tokens=False):
29
- """
30
- 为什么没有这个方法?
31
- """
32
- try:
33
- return self.decode_tokens_bytes(tokens)
34
- except Exception as e:
35
- # 什么要返回None?见zh_util.py
36
- # 16个空闲id, 100256 100261-100275
37
- logger.error(e)
38
- return [None for _ in tokens]
39
-
40
-
41
- def get_vocab(self, token_type="str"):
42
- """Returns vocab as a dict
43
- :param token_type: ["str", "byte"]
44
- :return:
45
- """
46
- vocab = {}
47
- key_error_list = []
48
- unicode_decode_error_list = []
49
- for i in range(self.vocab_size):
50
- try:
51
- token_byte = self.convert_ids_to_tokens([i])[0]
52
- if token_byte is None:
53
- continue
54
- # token_str = token_byte.decode("utf-8")
55
- vocab[token_byte] = i
56
-
57
- except UnicodeDecodeError: # 773 UnicodeDecodeError
58
- unicode_decode_error_list.append((i, str(token_byte)))
59
- vocab[token_byte] = i
60
-
61
- # vocab.update(self.added_tokens_encoder)
62
- logger.info(f"gpt_35_turbo {len(key_error_list)} KeyError: {key_error_list}")
63
- logger.info(f"gpt_35_turbo {len(unicode_decode_error_list)} UnicodeDecodeError: {unicode_decode_error_list[:5]}")
64
- return vocab
65
-
66
-
67
- def encode(self, *args, **kwargs):
68
- """
69
- add_special_token 是为了兼容 hf_tokenizer
70
- """
71
- kwargs.pop("add_special_tokens", None)
72
- return self._encode(*args, **kwargs)
73
-
74
-
75
- # tiktoken patch
76
- Encoding._encode = Encoding.encode
77
- Encoding.encode = encode
78
- Encoding.decode = decode
79
- Encoding.convert_ids_to_tokens = convert_ids_to_tokens
80
- Encoding.get_vocab = get_vocab
 
1
  """
2
+
3
  """
4
 
5
  import tiktoken
6
+ import tokenizer.tiktoken_patch
 
7
 
8
  tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
9
  tokenizer.vocab_size = tokenizer.n_vocab
 
11
  tokenizer.comments = "tiktoken is a fast BPE tokeniser for use with OpenAI's models. There are 16 tokens KeyError"
12
  tokenizer.reversible = True # It's reversible and lossless, so you can convert tokens back into the original text
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
vocab/gpt_neox_chinese_v1/20B_tokenizer.tmp.json DELETED
The diff for this file is too large to render. See raw diff
 
vocab/gpt_neox_chinese_v1/20B_tokenizer_chinese.json DELETED
The diff for this file is too large to render. See raw diff
 
vocab/gpt_neox_chinese_v1/20B_tokenizer_chinese.mock.json DELETED
The diff for this file is too large to render. See raw diff
 
vocab/gpt_neox_chinese_v1/README.md DELETED
@@ -1,64 +0,0 @@
1
-
2
-
3
- ```
4
- added vocab (size: 54634) with 22 dummy tokens (new size: 54656)
5
- Vocab size: 54634
6
-
7
- 训练数据
8
- ```
9
-
10
-
11
- https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py
12
-
13
-
14
- ## 20B
15
-
16
- [configs/20B.yml](https://github.com/EleutherAI/gpt-neox/blob/main/configs/20B.yml#L7)
17
- ```
18
- "vocab-file": "./20B_checkpoints/20B_tokenizer.json",
19
- ```
20
-
21
- Vocab size: 50277
22
- self.padded_vocab_size = 50304
23
-
24
-
25
- padded vocab (size: 50277) with 27 dummy tokens (new size: 50304)
26
-
27
- ## 词典
28
-
29
- 见 convert_vocab_to_txt.py
30
-
31
- ```
32
- {"id": 13609, "token": "\u00e4\u00b8\u0143", "token_decode": "\u4e2d"} 中
33
-
34
- # 多个符号拼接在一起的
35
- {"id": 13663, "token": ".*]{}", "token_decode": ".*]{}"} .*]{}
36
-
37
- # ss
38
-
39
- ```
40
-
41
-
42
- ## 中文支持
43
-
44
- 基本没有OOV。
45
-
46
- gpt-neox是在800G英文数据集上训练的,为啥词典支持中文?因为是byte-level BPE
47
-
48
- ```
49
- 丁 [3218, 212]
50
- 七 [3218, 214]
51
- 万 [3218, 218]
52
- 诀 [11894, 211]
53
- 证 [11894, 212]
54
- ```
55
-
56
-
57
- 编码长度统计: Counter({2: 4190, 3: 1295, 1: 285})
58
- 平均编码长度: 2.1750433275563257
59
-
60
-
61
- ## ss
62
-
63
-
64
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
vocab/gpt_neox_chinese_v1/__init__.py DELETED
@@ -1,14 +0,0 @@
1
-
2
- import os
3
- from tokenizers import Tokenizer
4
-
5
-
6
- CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
7
- TOKENIZER_DIR = os.path.join(CURRENT_DIR, "20B_tokenizer_chinese.json")
8
-
9
- tokenizer = Tokenizer.from_file(TOKENIZER_DIR)
10
-
11
- tokenizer.vocab_size = tokenizer.get_vocab_size(with_added_tokens=True)
12
-
13
- # vocab_size = len(tokenizer.get_vocab())
14
- # vocab_size = tokenizer.vocab_size
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
vocab/gpt_neox_chinese_v1/build_tokenizer_chinese.py DELETED
@@ -1,61 +0,0 @@
1
- """
2
- merge 是干嘛的?
3
-
4
- ## 结果
5
-
6
- 共merge 4357 个 token
7
- """
8
-
9
- import json
10
- from tokenizers import Tokenizer
11
- from data_sample.oov_base import jd_vocab_tokens
12
- from zhon.hanzi import punctuation as zh_punc
13
-
14
- def load_base_tokenizer(vocab_path):
15
- data = json.load(open(vocab_path, "r", encoding="utf-8"))
16
- tokenizer = Tokenizer.from_file(vocab_path)
17
- print("vocab_size with added_tokens:", )
18
- return data, tokenizer
19
-
20
- data, base_tokenizer = load_base_tokenizer("../gpt_nexo_20b/20B_tokenizer.json")
21
- vocab = data["model"]["vocab"]
22
- merges = data["model"]["merges"]
23
- vocab_size = base_tokenizer.get_vocab_size(with_added_tokens=True)
24
-
25
-
26
- """
27
- 方式一:原有的added_tokens保持id不变。方式二:原有的added_tokens进行id移位。
28
- 以下采用方式一。
29
- """
30
- new_added_tokens = {}
31
- for word in jd_vocab_tokens + list(zh_punc):
32
- if len(word) > 1 or word in new_added_tokens:
33
- continue
34
- encoding = base_tokenizer.encode(word)
35
- # if len(encoding.ids) > 1:
36
- if len(encoding.ids) == 2: # 3个的,怎么处理?
37
- tokens = [base_tokenizer.id_to_token(token_id) for token_id in encoding.ids]
38
- # print("merging", vocab_size, word, json.dumps(tokens))
39
- vocab["".join(tokens)] = vocab_size
40
- new_added_tokens[word] = vocab_size
41
- vocab_size += 1
42
- merges.append(" ".join(tokens))
43
-
44
-
45
-
46
- print("共merge %d 个 token" % (len(new_added_tokens)))
47
-
48
- with open("20B_tokenizer_chinese.json", "w", encoding="utf-8") as f_out:
49
- json.dump(data, f_out, indent=2)
50
-
51
- ## check
52
- tokenizer = Tokenizer.from_file("20B_tokenizer_chinese.json")
53
- all_error_ids = []
54
- for word, idx in new_added_tokens.items():
55
- decode_str = tokenizer.decode([idx])
56
- if word != decode_str:
57
- all_error_ids.append(idx)
58
- print(idx, word, decode_str)
59
-
60
- print(all_error_ids)
61
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
vocab/gpt_neox_chinese_v1/build_tokenizer_chinese_2.py DELETED
@@ -1,50 +0,0 @@
1
- """
2
- merge 是干嘛的?
3
-
4
- ## 结果
5
-
6
- 共merge 4357 个 token
7
- """
8
-
9
- import json
10
- from tokenizers import Tokenizer
11
- from data_sample.oov_base import jd_vocab_tokens
12
- from zhon.hanzi import punctuation as zh_punc
13
-
14
- def load_base_tokenizer():
15
- old_vocab_path = "../gpt_nexo_20b/20B_tokenizer.json"
16
- data = json.load(open(old_vocab_path, "r", encoding="utf-8"))
17
- tokenizer = Tokenizer.from_file(old_vocab_path)
18
- print("vocab_size with added_tokens:", )
19
- return data, tokenizer
20
-
21
- data, base_tokenizer = load_base_tokenizer()
22
- vocab = data["model"]["vocab"]
23
- merges = data["model"]["merges"]
24
- vocab_size = base_tokenizer.get_vocab_size(with_added_tokens=True)
25
-
26
-
27
- """
28
- 方式一:原有的added_tokens保持id不变。方式二:原有的added_tokens进行id移位。
29
- 以下采用方式一。
30
- """
31
- new_added_tokens = set()
32
- for word in jd_vocab_tokens + list(zh_punc):
33
- if len(word) > 1 or word in new_added_tokens:
34
- continue
35
- encoding = base_tokenizer.encode(word)
36
- # if len(encoding.ids) > 1:
37
- if len(encoding.ids) == 2: # 3个的,怎么处理?
38
- tokens = [base_tokenizer.id_to_token(token_id) for token_id in encoding.ids]
39
- print("merging", vocab_size, word, json.dumps(tokens))
40
- vocab["".join(tokens)] = vocab_size
41
- vocab_size += 1
42
- merges.append(" ".join(tokens))
43
- new_added_tokens.add(word)
44
-
45
-
46
- print("共merge %d 个 token" % (len(new_added_tokens)))
47
-
48
- f_out = open("20B_tokenizer_chinese_2.json", "w", encoding="utf-8")
49
-
50
- json.dump(data, f_out, indent=2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
vocab/gpt_neox_chinese_v1/mock.py DELETED
@@ -1,32 +0,0 @@
1
- import copy
2
- import json
3
- from tokenizers import Tokenizer
4
-
5
- def export_mock_tokenizer():
6
- input_path = "20B_tokenizer_chinese.json"
7
-
8
- tokenizer = json.load(open(input_path, "r", encoding="utf-8"))
9
-
10
- vocab = tokenizer["model"]["vocab"]
11
- added_tokens = [token["id"] for token in tokenizer["added_tokens"]]
12
-
13
- for k, v in copy.deepcopy(vocab).items():
14
- if v not in added_tokens:
15
- vocab[str(v)] = v
16
- vocab.pop(k)
17
-
18
- out_path = input_path.replace(".json", ".mock.json")
19
- with open(out_path, "w", encoding="utf-8") as f_out:
20
- f_out.write(json.dumps(tokenizer, ensure_ascii=False, indent=2))
21
-
22
-
23
- def mock2():
24
- pass
25
-
26
-
27
- def load_mock_tokenizer():
28
- tokenizer = Tokenizer.from_file("20B_tokenizer_chinese.mock.json")
29
- print('')
30
-
31
- export_mock_tokenizer()
32
- load_mock_tokenizer()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
vocab/gpt_neox_chinese_v1/test_tokenizer.py DELETED
@@ -1,43 +0,0 @@
1
- import json
2
- from tokenizers import Tokenizer
3
-
4
- tokenizer = Tokenizer.from_file("20B_tokenizer_chinese.json")
5
- print("vocab_size with added_tokens:", tokenizer.get_vocab_size(with_added_tokens=True))
6
- print("vocab_size without added_tokens:", tokenizer.get_vocab_size(with_added_tokens=False))
7
-
8
- def test_token():
9
- """
10
- :return:
11
- """
12
- text = " \t\n中国解决方法黑白侗鸩玥,。!"
13
- # text = open("../../data_sample/EBKE20150806001_epub_30198917_30198917.txt", "r", encoding="utf-8").readline()
14
- encoding = tokenizer.encode(text)
15
- decoding = tokenizer.decode(encoding.ids)
16
- print(decoding)
17
- for word in text:
18
- encoding = tokenizer.encode(word)
19
- for token_id in encoding.ids:
20
- decode_str = tokenizer.decode([token_id]) # 特殊字符解码后会统一变成 �,对应 "\ufffd"
21
- token = tokenizer.id_to_token(token_id)
22
- print(word, token_id, decode_str, json.dumps(decode_str), token, json.dumps(token))
23
-
24
- def test_encode():
25
- text = "中国解决方法黑白侗鸩,。!?;一个人去哪里疗疗<|endoftext|>一 个刹车卉"
26
- encoding = tokenizer.encode(text)
27
- print(tokenizer.decode(encoding.ids))
28
- for token_id in encoding.ids:
29
- decode_str = tokenizer.decode([token_id]) # 特殊字符解码后会统一变成 �,对应 "\ufffd"
30
- token = tokenizer.id_to_token(token_id)
31
- print(token_id, decode_str, json.dumps(decode_str), token, json.dumps(token))
32
-
33
- def test_decode():
34
- encoding = [30903, 20287, 20005, 52300, 25949, 30329, 50039, 31949, 25538,
35
- 34698, 18764, 5225, 53915, 163, 223]
36
-
37
- decode_str = tokenizer.decode(encoding, skip_special_tokens=False)
38
- print(decode_str)
39
-
40
- # test_token()
41
- test_encode()
42
- # test_decode()
43
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.1.append.json DELETED
The diff for this file is too large to render. See raw diff
 
vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.1.insert.json DELETED
The diff for this file is too large to render. See raw diff
 
vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.1.json DELETED
The diff for this file is too large to render. See raw diff
 
vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.2.json DELETED
The diff for this file is too large to render. See raw diff
 
vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.tmp.json DELETED
The diff for this file is too large to render. See raw diff
 
vocab/gpt_neox_chinese_v1/to_v2/README.md DELETED
@@ -1,3 +0,0 @@
1
-
2
- 扩充词典到 v2
3
-
 
 
 
 
vocab/gpt_neox_chinese_v1/to_v2/add_token_utils.py DELETED
@@ -1,185 +0,0 @@
1
-
2
-
3
-
4
-
5
- import shutil
6
- import json
7
- from queue import Queue
8
- from tokenizers import Tokenizer
9
- from data_sample.oov_base import jd_vocab_tokens
10
- from zhon.hanzi import punctuation as zh_punc
11
-
12
- def load_base_tokenizer(tokenizer_path):
13
- print("loading", tokenizer_path)
14
- data = json.load(open(tokenizer_path, "r", encoding="utf-8"))
15
- tokenizer = Tokenizer.from_file(tokenizer_path)
16
- print("vocab_size with added_tokens:", tokenizer.get_vocab_size(with_added_tokens=True))
17
- return data, tokenizer
18
-
19
-
20
- def insert_token(word, index):
21
- pass
22
-
23
- # 不能删除的token。比如初始统计是低频的,可以删除,但是新增词典里包含的。
24
-
25
-
26
- def load_reserve_tokens(word_list, base_tokenizer):
27
- data, base_tokenizer = base_tokenizer
28
- reserved_token = set()
29
- for word in word_list:
30
- encoding = base_tokenizer.encode(word)
31
- tokens = [base_tokenizer.id_to_token(token_id) for token_id in encoding.ids]
32
- for i in range(0, len(encoding.ids)):
33
- reserved_token.add("".join(tokens[:i+1]))
34
- return reserved_token
35
-
36
-
37
- reserved_token = set()
38
-
39
-
40
- def append_token(word_list, base_tokenizer, output_tokenizer_path, unused_ids=None):
41
- """
42
- append token to the end of vocab
43
- """
44
- new_vocab = set()
45
- new_merges = set()
46
-
47
- data, base_tokenizer = base_tokenizer
48
- vocab = data["model"]["vocab"]
49
- merges = data["model"]["merges"]
50
- vocab_size = base_tokenizer.basic_count(with_added_tokens=True)
51
-
52
- for word in word_list:
53
- encoding = base_tokenizer.encode(word)
54
- if len(encoding.ids) == 1:
55
- continue
56
-
57
- if len(encoding.ids) >= 4:
58
- print("[ERROR]: encoding不能超过4", word, encoding)
59
-
60
- tokens = [base_tokenizer.id_to_token(token_id) for token_id in encoding.ids]
61
- # print("merging", word, json.dumps(tokens))
62
- for i in range(1, len(encoding.ids)):
63
- new_vocab.add("".join(tokens[:i+1]))
64
- new_merges.add("".join(tokens[:i]) + " " + tokens[i])
65
-
66
- # append to the end of vocab
67
- # print("new_vocab size", len(new_vocab))
68
- # print("new_merges size", len(new_merges))
69
- if unused_ids == None:
70
- for token in new_vocab:
71
- vocab[token] = vocab_size
72
- vocab_size += 1
73
- merges += new_merges
74
- else:
75
- for iddx, token in enumerate(new_vocab):
76
- # print(unused_ids.qsize())
77
- unused_token_id, unused_token_str, unused_merges = unused_ids.get()
78
- if unused_token_id == 39468:
79
- print("catch")
80
- if unused_token_str in reserved_token:
81
- print("skip unused token", unused_token_id, unused_token_str)
82
- unused_token_id, unused_token_str, unused_merges = unused_ids.get()
83
-
84
- print("[%d]merging %s to unused %s %s" % (unused_ids.qsize(), json.dumps(token), unused_token_id, json.dumps(unused_token_str)) )
85
- vocab[token] = unused_token_id
86
- if unused_token_id != vocab.pop(unused_token_str):
87
- print("ERROR")
88
- # assert unused_token_id == vocab.pop(unused_token_str)
89
- merges.remove(unused_merges)
90
- # print(new_merges)
91
- merges += new_merges
92
-
93
- # print("共merge %d 个 token" % (len(new_vocab)))
94
- # print(json.dumps(list(new_vocab)))
95
-
96
-
97
- with open(output_tokenizer_path, "w", encoding="utf-8") as f_out:
98
- json.dump(data, f_out, indent=2)
99
-
100
- return data, base_tokenizer
101
-
102
-
103
-
104
-
105
- # data, base_tokenizer = load_base_tokenizer(output_tokenizer_path)
106
- # encoding = base_tokenizer.encode(word)
107
- # print(encoding.ids)
108
-
109
-
110
- def load_unused_id():
111
- unused_ids = Queue(maxsize=0)
112
- for line in open("word_count.corpus.remove.jsonl", "r", encoding="utf-8"):
113
- line_data = json.loads(line)
114
- token_id = line_data["id"]
115
- token_str = line_data["token"]
116
- merges = line_data["merges"]
117
- unused_ids.put((token_id, token_str, merges))
118
- # for i in range(2000):
119
- # unused_ids.get()
120
- return unused_ids
121
-
122
-
123
- def check_tokenize(base_tokenizer, word):
124
- data, base_tokenizer = base_tokenizer
125
- encodings = base_tokenizer.encode(word)
126
- assert len(encodings.ids) == 1
127
- assert base_tokenizer.decode(encodings.ids) == word
128
-
129
-
130
- def add_tokens():
131
-
132
-
133
- unused_ids = load_unused_id()
134
- add_tokens = [line.strip() for line in open("oov.add.txt", "r", encoding="utf-8")]
135
- add_chars = [char for token in add_tokens for char in token]
136
- add_chars = list(set(add_chars))
137
- add_words = [token for token in add_tokens if len(token) > 1]
138
-
139
-
140
- tokenizer_path = "../20B_tokenizer_chinese.json"
141
- # tokenizer_path = "../../gpt_nexo_20b/20B_tokenizer.json"
142
- base_tokenizer = load_base_tokenizer(tokenizer_path)
143
- reserved_token.update(load_reserve_tokens(add_chars, base_tokenizer))
144
-
145
- ## add chars
146
- append_token(add_chars, base_tokenizer, "20B_tokenizer.1.json", unused_ids=unused_ids)
147
- print(unused_ids.qsize()) # 22320
148
- new_tokenizer = load_base_tokenizer("20B_tokenizer.1.json")
149
-
150
- append_token(add_words,
151
- new_tokenizer, "20B_tokenizer.2.json", unused_ids=unused_ids)
152
- new_tokenizer = load_base_tokenizer("20B_tokenizer.2.json")
153
-
154
- #
155
- # ## add words
156
- # while unused_ids.qsize() != 22320:
157
- # unused_ids.get()
158
- # assert unused_ids.qsize() == 22320
159
- #
160
- # shutil.copyfile("20B_tokenizer.1.json", "20B_tokenizer.2.json")
161
- # while len(add_words) > 0:
162
- # new_tokenizer = load_base_tokenizer("20B_tokenizer.2.json")
163
- # append_token([add_words.pop()],
164
- # new_tokenizer, "20B_tokenizer.2.json", unused_ids=unused_ids)
165
- # # new_tokenizer = load_base_tokenizer("20B_tokenizer.2.json")
166
-
167
-
168
- def check_all_tokens():
169
- add_tokens = [line.strip() for line in open("oov.add.txt", "r", encoding="utf-8")]
170
- add_chars = [char for token in add_tokens for char in token]
171
- add_chars = list(set(add_chars))
172
- add_words = [token for token in add_tokens if len(token) > 1]
173
- # add_chars = ['吳']
174
- base_tokenizer = load_base_tokenizer("20B_tokenizer.2.json")
175
- for k in add_chars:
176
- check_tokenize(base_tokenizer, k)
177
- for word in add_words:
178
- # print(word)
179
- check_tokenize(base_tokenizer, word)
180
-
181
- add_tokens()
182
- check_all_tokens()
183
-
184
-
185
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
vocab/gpt_neox_chinese_v1/to_v2/get_unused_id.py DELETED
@@ -1,205 +0,0 @@
1
- """
2
- 获取超低频token,用于裁剪
3
- """
4
-
5
- import copy
6
- import glob
7
- import json
8
- from collections import defaultdict
9
-
10
-
11
- def word_count():
12
- from collections import Counter
13
- from megatron.data.indexed_dataset import MMapIndexedDataset
14
- counter = Counter()
15
- for file_name in glob.glob("data/jd/*.bin"):
16
- print(file_name)
17
- file_name = file_name[:-4]
18
- dataset = MMapIndexedDataset(file_name, skip_warmup=True)
19
- for doc in dataset:
20
- counter.update(doc)
21
-
22
- f_out = open("word_count.txt", "w", encoding="utf-8")
23
- for token_id, count in counter.most_common():
24
- f_out.write("%d\t%d\n" % (token_id, count))
25
-
26
-
27
- def get_unused_id():
28
- pass
29
-
30
-
31
- def print_word_count():
32
- from tokenizers import Tokenizer
33
- tokenizer = Tokenizer.from_file("../20B_tokenizer_chinese.json")
34
- data = json.load(open("../20B_tokenizer_chinese.json", "r", encoding="utf-8"))
35
-
36
- vocab = data["model"]["vocab"]
37
- merges = data["model"]["merges"]
38
- merge_dict = {}
39
-
40
- sorted_parts = []
41
- for merge in merges:
42
- idx = merge.find(" ")
43
- token_str = merge[:idx] + merge[idx + 1:]
44
- merge_dict[token_str] = (merge[:idx], merge[idx + 1:])
45
- sorted_parts += [token_str, merge[:idx], merge[idx + 1:]]
46
- id2vocab = {idx: token for token, idx in vocab.items()}
47
-
48
- # 补充 sorted_parts,并排序
49
- all_tokens = [line.strip().split("\t") for line in open("word_count.corpus.txt", "r", encoding="utf-8")]
50
- raw_token_count = {int(token_id): int(count) for token_id, count in all_tokens}
51
- sorted_parts = set(sorted_parts)
52
- for token_id in raw_token_count:
53
- if token_id in [35448, 40519]:
54
- print("ddd")
55
- token_str = id2vocab[token_id]
56
- if token_str not in sorted_parts:
57
- sorted_parts.add(token_str)
58
- # print(token_id, token_str, json.dumps(token_str), raw_token_count[token_id], " not in parts")
59
- sorted_parts = sorted(set(sorted_parts), key=lambda k: len(k), reverse=True)
60
-
61
- # 重新计算merge的频率
62
- # token_count = copy.deepcopy(raw_token_count)
63
- token_count = defaultdict(int)
64
- for token_str in sorted_parts: # 从长到短 遍历 (否则要深度遍历,)
65
- token_id = vocab[token_str]
66
- if token_id in [35448, 40519]:
67
- print("ddd")
68
-
69
- count = raw_token_count.get(token_id, 0)
70
- token_count[token_id] += count # 原token 的词频
71
- if token_str in merge_dict:
72
- if vocab[merge_dict[token_str][0]] in [35448, 40519] or vocab[merge_dict[token_str][1]] in [35448, 40519]:
73
- print("ddd")
74
- token_count[vocab[merge_dict[token_str][0]]] += token_count[token_id]
75
- token_count[vocab[merge_dict[token_str][1]]] += token_count[token_id]
76
- else:
77
- print(token_id, json.dumps(token_str))
78
-
79
-
80
- # 重新排序 (按频率升序排列,相同频率按长度降序排列)
81
- sorted_token_count = sorted(token_count.items(), key=lambda kv: (kv[1], -len(id2vocab[kv[0]])))
82
- f_out = open("word_count.corpus.sort_by_count.jsonl", "w", encoding="utf-8")
83
- for token_id, count in sorted_token_count:
84
- # for token_str, count in token_count.items():
85
- token_str = id2vocab[token_id]
86
- # token_id = vocab[token_str]
87
- decode_str = tokenizer.decode([token_id]) # 解码会失真
88
- if token_str in merge_dict:
89
- merges = " ".join(merge_dict[token_str])
90
- else:
91
- merges = "NULL"
92
- f_out.write(json.dumps(
93
- {"id": token_id, "token": token_str, "merges": merges, "raw_count": raw_token_count.get(token_id, 0),
94
- "count": count, "decode_str": decode_str}) + "\n")
95
-
96
-
97
- def get_remove_words():
98
- from tokenizers import Tokenizer
99
- tokenizer = Tokenizer.from_file("../20B_tokenizer_chinese.json")
100
-
101
- data = json.load(open("../20B_tokenizer_chinese.json", "r", encoding="utf-8"))
102
- added_tokens = [token["id"] for token in data["added_tokens"]]
103
-
104
- vocab = data["model"]["vocab"]
105
- merges = data["model"]["merges"]
106
- id2vocab = {idx: token for token, idx in vocab.items()}
107
-
108
- merge_dict = {k.replace(" ", "", 1): k for k in merges}
109
-
110
- token_count = {}
111
- for line in open("word_count.corpus.sort_by_count.jsonl", "r", encoding="utf-8"):
112
- line_data = json.loads(line)
113
- token_id = int(line_data["id"])
114
- count = int(line_data["count"])
115
- token_count[token_id] = count
116
-
117
- f_out = open("word_count.corpus.remove.jsonl", "w", encoding="utf-8")
118
- remove_vocab_set = set()
119
-
120
- # # 1. 去掉错误token
121
- # error_tokens = [54611, 54612, 54613, 54614, 54615, 54616, 54617, 54618, 54619, 54620, 54621, 54622,
122
- # 54623, 54624, 54625, 54626, 54627, 54628, 54629, 54630, 54631, 54632, 54633]
123
- # for token_id in error_tokens:
124
- # token_str = id2vocab[token_id]
125
- # # token_str = tokenizer.id_to_token(token_id) # 失真
126
- # remove_vocab_set.add(token_id)
127
- # f_out.write(json.dumps(
128
- # {"id": token_id, "token": token_str, "merges": merge_dict.get(token_str), "count": 0,
129
- # "type": "error-char"}) + "\n")
130
-
131
-
132
- # 2. 去掉超长token
133
- # for token_id in range(tokenizer.get_vocab_size()):
134
- # if token_id in added_tokens:
135
- # continue
136
- # token_str = id2vocab[token_id]
137
- # # token_str = tokenizer.id_to_token(token_id) # 也会失真,比如 54611 个token
138
- # decode_str = tokenizer.decode([token_id]) # decode会失真,比如 Ġ 会变成空格
139
- # if len(decode_str) > 8 and len(set(decode_str)) < 3:
140
- # if token_id in remove_vocab_set:
141
- # continue
142
- # remove_vocab_set.add(token_id)
143
- # f_out.write(
144
- # json.dumps({"id": token_id, "token": token_str,
145
- # "merges": merge_dict.get(token_str), "count": token_count.get(token_id, 0),
146
- # "type": "按长度过滤"}, ensure_ascii=False) + "\n")
147
- #
148
- # # 删除依赖,(否则会造成 merges中存在oov的token)
149
- # #
150
- # for merge in merges:
151
- # if token_str in merge:
152
- # # if token_str + " " in merge or " " + token_str in merge:
153
- # parent_token_str = merge.replace(" ", "", 1)
154
- # parent_token_id = vocab[parent_token_str]
155
- # if parent_token_id in remove_vocab_set:
156
- # continue
157
- # remove_vocab_set.add(parent_token_id)
158
- # f_out.write(
159
- # json.dumps({"id": parent_token_id, "token": parent_token_str,
160
- # "merges": merge, "count": token_count.get(parent_token_id, 0),
161
- # "type": "按长度过滤-依赖删除"}, ensure_ascii=False) + "\n")
162
-
163
- # 3. 去掉低频token
164
- for token_id, count in list(token_count.items())[:25000]:
165
- # token_id = 6460
166
- if token_id in added_tokens:
167
- continue
168
- if token_id in remove_vocab_set:
169
- continue
170
-
171
- token_str = tokenizer.id_to_token(token_id)
172
- # token_str = tokenizer.decode([int(token_id)])
173
- if len(token_str.strip()) > 1:
174
- remove_vocab_set.add(token_id)
175
- f_out.write(json.dumps(
176
- {"id": token_id, "token": token_str, "merges": merge_dict.get(token_str), "count": count,
177
- "type": "remove by frequency"}) + "\n")
178
-
179
- ######## 已经按频率排序的,就不需要删除依赖了
180
- # # 删除依赖,(否则会造成 merges中存在oov的token)
181
- # for merge in merges:
182
- # # if token_str + " " in merge or " " + token_str in merge:
183
- # if token_str in merge:
184
- # parent_token_str = merge.replace(" ", "", 1)
185
- # parent_token_id = vocab[parent_token_str]
186
- # if parent_token_id in remove_vocab_set:
187
- # continue
188
- # remove_vocab_set.add(parent_token_id)
189
- # f_out.write(
190
- # json.dumps({"id": parent_token_id, "token": parent_token_str,
191
- # "merges": merge, "count": token_count.get(parent_token_id, 0),
192
- # "type": "按频率过滤-依赖删除"}, ensure_ascii=False) + "\n")
193
-
194
- # remove 24969 tokens
195
- print("remove %d tokens" % (len(remove_vocab_set)))
196
-
197
-
198
- def ss():
199
- pass
200
-
201
-
202
- # word_count()
203
- # print_word_count()
204
- get_remove_words()
205
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
vocab/gpt_neox_chinese_v1/to_v2/oov.add.txt DELETED
The diff for this file is too large to render. See raw diff
 
vocab/gpt_neox_chinese_v1/to_v2/oov.txt DELETED
The diff for this file is too large to render. See raw diff
 
vocab/gpt_neox_chinese_v1/to_v2/sort_test.py DELETED
@@ -1,18 +0,0 @@
1
-
2
-
3
-
4
- a = {
5
- "aa", 1,
6
- "aaa", 1,
7
- "aaaa", 1,
8
- "aaaaaa", 1,
9
- "aaaaaaa", 1,
10
-
11
- "baa", 3,
12
- "baaa", 2,
13
- "baaaa", 2,
14
- "baaaaaa", 2,
15
- "baaaaaaa", 2,
16
- }
17
-
18
- sorted(a.items(), key=lambda kv:(kv[1], ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
vocab/gpt_neox_chinese_v1/to_v2/test2.py DELETED
@@ -1,42 +0,0 @@
1
- import json
2
- from tokenizers import Tokenizer
3
- from data_sample.oov_base import jd_vocab_tokens
4
- from zhon.hanzi import punctuation as zh_punc
5
-
6
- def load_base_tokenizer(tokenizer_path):
7
- print("loading", tokenizer_path)
8
- data = json.load(open(tokenizer_path, "r", encoding="utf-8"))
9
- tokenizer = Tokenizer.from_file(tokenizer_path)
10
- print("vocab_size with added_tokens:", tokenizer.get_vocab_size(with_added_tokens=True))
11
- return data, tokenizer
12
-
13
-
14
- def append_token(word_list, base_tokenizer, unused_ids=None):
15
- """
16
- append token to the end of vocab
17
- """
18
- new_vocab = set()
19
- new_merges = set()
20
-
21
- data, base_tokenizer = base_tokenizer
22
- vocab = data["model"]["vocab"]
23
- merges = data["model"]["merges"]
24
- vocab_size = base_tokenizer.basic_count(with_added_tokens=True)
25
-
26
- for word in word_list:
27
- encoding = base_tokenizer.encode(word)
28
- if len(encoding.ids) == 1:
29
- continue
30
-
31
- if len(encoding.ids) >= 4:
32
- print("[ERROR]: encoding不能超过4", word, encoding)
33
-
34
- tokens = [base_tokenizer.id_to_token(token_id) for token_id in encoding.ids]
35
- if "\u00e6\u00a5\u0143" in tokens:
36
- print(word)
37
-
38
- add_tokens = [line.strip() for line in open("oov.add.txt", "r", encoding="utf-8")]
39
- add_words = [token for token in add_tokens if len(token) > 1]
40
- new_tokenizer = load_base_tokenizer("20B_tokenizer.1.json")
41
-
42
- append_token(add_words, new_tokenizer)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
vocab/gpt_neox_chinese_v1/to_v2/test_oov.py DELETED
@@ -1,69 +0,0 @@
1
- from tokenizers import Tokenizer
2
-
3
- tokenizer = Tokenizer.from_file("../20B_tokenizer_chinese.json")
4
-
5
- def get_oov():
6
-
7
- f_out = open("oov.txt", "w", encoding="utf-8")
8
- all_words = open("../../vocab.freq.zh.txt", "r", encoding="utf-8")
9
- for line in all_words:
10
- word, count = line.strip().split("\t")
11
- if "�" in word or word in ["之长", "个好", "亿亿", "余个", "聊了", "与该", "多花"]:
12
- continue
13
-
14
- encoding = tokenizer.encode(word)
15
- if len(encoding.ids) > 1:
16
- f_out.write(line)
17
-
18
-
19
- def build_vocab():
20
- pass
21
-
22
-
23
-
24
- def convert_oov_to_merges():
25
- """将词拆分成merge分组,必须是两个一组,
26
- 比如
27
- 承担 -> 承 担
28
- 天津市 -> 天津 市
29
- 社会保障 -> 社会 保障
30
- 的一部分 -> 的 一部分 -> 一 部分
31
- """
32
- all_tokens_and_counts = [line.strip().split("\t") for line in open("oov.txt", "r", encoding="utf-8")]
33
- all_tokens = [token for token,count in all_tokens_and_counts if int(count) > 2] # 至少3个词典中出现过
34
- len1 = [token for token in all_tokens if len(token) == 1]
35
- len2 = [token for token in all_tokens if len(token) == 2]
36
- len3 = [token for token in all_tokens if len(token) == 3]
37
- len4 = [token for token in all_tokens if len(token) == 4]
38
- print(len(len1), len(len2), len(len3), len(len4))
39
-
40
- # vocab = set(["天津", "社会", "保障", "部分", "一部分", "需要", "数据", "使用", "我们", "一个",] + len2)
41
- # vocab = set(["天津", "社会", "保障", "部分", "需要", "数据", "使用", "我们", "一个"] + len2)
42
-
43
-
44
- with open("oov.add.txt", "w", encoding="utf-8") as f_out:
45
- for token in len1:
46
- f_out.write(token + "\n")
47
- for token in len2[:20000]:
48
- f_out.write(token + "\n")
49
- # f_out.write(token[0] + " " + token[1] + "\n")
50
-
51
- # for token in len3:
52
- # idx = -1
53
- # for part in len2:
54
- # if part in token:
55
- # idx = token.find(part)
56
- # break
57
- # if idx == -1:
58
- # print("not found", token)
59
- # elif idx == 0:
60
- # f_out.write(token[0] + " " + token[1:] + "\n")
61
- # else:
62
- # f_out.write(token[:2] + " " + token[2] + "\n")
63
-
64
-
65
-
66
-
67
-
68
- get_oov()
69
- convert_oov_to_merges()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
vocab/gpt_neox_chinese_v1/to_v2/test_queue.py DELETED
@@ -1,20 +0,0 @@
1
-
2
- from queue import Queue
3
-
4
- q = Queue(maxsize=0)
5
-
6
- #写入队列数据
7
- q.put(0)
8
- q.put(1)
9
- q.put(2)
10
-
11
- #输出当前队列所有数据
12
- print(q.queue)
13
- #删除队列数据,并返回该数据
14
- q.get()
15
- #输也所有队列数据
16
- print(q.queue)
17
-
18
- for i in range(10):
19
- print(q.get(), q.qsize())
20
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
vocab/gpt_neox_chinese_v1/to_v2/word_count.corpus.remove.jsonl DELETED
The diff for this file is too large to render. See raw diff
 
vocab/gpt_neox_chinese_v1/to_v2/word_count.corpus.sort_by_count.jsonl DELETED
The diff for this file is too large to render. See raw diff
 
vocab/gpt_neox_chinese_v1/to_v2/word_count.corpus.txt DELETED
The diff for this file is too large to render. See raw diff
 
vocab/gpt_neox_chinese_v1/tokenizer/__init__.py DELETED
@@ -1,16 +0,0 @@
1
- # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
-
16
- from .tokenizer import build_tokenizer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
vocab/gpt_neox_chinese_v1/tokenizer/gpt2_tokenization.py DELETED
@@ -1,368 +0,0 @@
1
- # Copyright (c) 2021, EleutherAI
2
- # This file is based on code by the authors denoted below and has been modified from its original version.
3
- #
4
- # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
5
- #
6
- # Licensed under the Apache License, Version 2.0 (the "License");
7
- # you may not use this file except in compliance with the License.
8
- # You may obtain a copy of the License at
9
- #
10
- # http://www.apache.org/licenses/LICENSE-2.0
11
- #
12
- # Unless required by applicable law or agreed to in writing, software
13
- # distributed under the License is distributed on an "AS IS" BASIS,
14
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
- # See the License for the specific language governing permissions and
16
- # limitations under the License.
17
-
18
- """Tokenization classes for OpenAI GPT."""
19
-
20
- from __future__ import absolute_import, division, print_function, unicode_literals
21
-
22
- import sys
23
- import json
24
- import logging
25
- import os
26
- import regex as re
27
- from io import open
28
-
29
- from functools import lru_cache
30
-
31
-
32
- logger = logging.getLogger(__name__)
33
-
34
- PRETRAINED_VOCAB_ARCHIVE_MAP = {
35
- "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
36
- }
37
- PRETRAINED_MERGES_ARCHIVE_MAP = {
38
- "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
39
- }
40
- PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
41
- "gpt2": 1024,
42
- }
43
-
44
- VOCAB_NAME = "vocab.json"
45
- MERGES_NAME = "merges.txt"
46
- SPECIAL_TOKENS_NAME = "special_tokens.txt"
47
-
48
-
49
- @lru_cache()
50
- def bytes_to_unicode():
51
- """
52
- Returns list of utf-8 byte and a corresponding list of unicode strings.
53
- The reversible bpe codes work on unicode strings.
54
- This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
55
- When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
56
- This is a significant percentage of your normal, say, 32K bpe vocab.
57
- To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
58
- And avoids mapping to whitespace/control characters the bpe code barfs on.
59
- """
60
- _chr = unichr if sys.version_info[0] == 2 else chr
61
- bs = (
62
- list(range(ord("!"), ord("~") + 1))
63
- + list(range(ord("¡"), ord("¬") + 1))
64
- + list(range(ord("®"), ord("ÿ") + 1))
65
- )
66
- cs = bs[:]
67
- n = 0
68
- for b in range(2**8):
69
- if b not in bs:
70
- bs.append(b)
71
- cs.append(2**8 + n)
72
- n += 1
73
- cs = [_chr(n) for n in cs]
74
- return dict(zip(bs, cs))
75
-
76
-
77
- def get_pairs(word):
78
- """Return set of symbol pairs in a word.
79
-
80
- Word is represented as tuple of symbols (symbols being variable-length strings).
81
- """
82
- pairs = set()
83
- prev_char = word[0]
84
- for char in word[1:]:
85
- pairs.add((prev_char, char))
86
- prev_char = char
87
- return pairs
88
-
89
-
90
- class GPT2Tokenizer(object):
91
- """
92
- GPT-2 BPE tokenizer. Peculiarities:
93
- - Byte-level BPE
94
- """
95
-
96
- @classmethod
97
- def from_pretrained(
98
- cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs
99
- ):
100
- """
101
- Instantiate a PreTrainedBertModel from a pre-trained model file.
102
- Download and cache the pre-trained model file if needed.
103
- """
104
- if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
105
- vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
106
- merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
107
- special_tokens_file = None
108
- else:
109
- vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
110
- merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
111
- special_tokens_file = os.path.join(
112
- pretrained_model_name_or_path, SPECIAL_TOKENS_NAME
113
- )
114
- if not os.path.exists(special_tokens_file):
115
- special_tokens_file = None
116
- else:
117
- logger.info(
118
- "loading special tokens file {}".format(special_tokens_file)
119
- )
120
- # redirect to the cache, if necessary
121
- try:
122
- from .file_utils import cached_path
123
-
124
- resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
125
- resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)
126
- except EnvironmentError:
127
- logger.error(
128
- "Model name '{}' was not found in model name list ({}). "
129
- "We assumed '{}' was a path or url but couldn't find files {} and {} "
130
- "at this path or url.".format(
131
- pretrained_model_name_or_path,
132
- ", ".join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
133
- pretrained_model_name_or_path,
134
- vocab_file,
135
- merges_file,
136
- )
137
- )
138
- return None
139
- if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
140
- logger.info("loading vocabulary file {}".format(vocab_file))
141
- logger.info("loading merges file {}".format(merges_file))
142
- else:
143
- logger.info(
144
- "loading vocabulary file {} from cache at {}".format(
145
- vocab_file, resolved_vocab_file
146
- )
147
- )
148
- logger.info(
149
- "loading merges file {} from cache at {}".format(
150
- merges_file, resolved_merges_file
151
- )
152
- )
153
- if (
154
- pretrained_model_name_or_path
155
- in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP
156
- ):
157
- # if we're using a pretrained model, ensure the tokenizer won't index sequences longer
158
- # than the number of positional embeddings
159
- max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[
160
- pretrained_model_name_or_path
161
- ]
162
- kwargs["max_len"] = min(kwargs.get("max_len", int(1e12)), max_len)
163
- # Instantiate tokenizer.
164
- if special_tokens_file and "special_tokens" not in kwargs:
165
- special_tokens = (
166
- open(special_tokens_file, encoding="utf-8").read().split("\n")[:-1]
167
- )
168
- else:
169
- special_tokens = kwargs.pop("special_tokens", [])
170
- tokenizer = cls(
171
- resolved_vocab_file,
172
- resolved_merges_file,
173
- special_tokens=special_tokens,
174
- *inputs,
175
- **kwargs
176
- )
177
- return tokenizer
178
-
179
- def __init__(
180
- self,
181
- vocab_file,
182
- merges_file,
183
- errors="replace",
184
- special_tokens=None,
185
- max_len=None,
186
- ):
187
- self.max_len = max_len if max_len is not None else int(1e12)
188
- self.encoder = json.load(open(vocab_file))
189
- self.decoder = {v: k for k, v in self.encoder.items()}
190
- self.errors = errors # how to handle errors in decoding
191
- self.byte_encoder = bytes_to_unicode()
192
- self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
193
- bpe_data = open(merges_file, encoding="utf-8").read().split("\n")[1:-1]
194
- bpe_merges = [tuple(merge.split()) for merge in bpe_data]
195
- self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
196
-
197
- # Should haved added re.IGNORECASE so BPE merges can happen for
198
- # capitalized versions of contractions
199
- self.pat = re.compile(
200
- r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
201
- )
202
-
203
- self.special_tokens = {}
204
- self.special_tokens_decoder = {}
205
- self.set_special_tokens(special_tokens)
206
-
207
- def __len__(self):
208
- return len(self.encoder) + len(self.special_tokens)
209
-
210
- def set_special_tokens(self, special_tokens):
211
- """Add a list of additional tokens to the encoder.
212
- The additional tokens are indexed starting from the last index of the
213
- current vocabulary in the order of the `special_tokens` list.
214
- """
215
- if not special_tokens:
216
- self.special_tokens = {}
217
- self.special_tokens_decoder = {}
218
- return
219
- self.special_tokens = dict(
220
- (tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens)
221
- )
222
- self.special_tokens_decoder = {v: k for k, v in self.special_tokens.items()}
223
- logger.info("Special tokens {}".format(self.special_tokens))
224
-
225
- @lru_cache(maxsize=131072)
226
- def bpe(self, token):
227
- word = tuple(token)
228
- pairs = get_pairs(word)
229
-
230
- if not pairs:
231
- return token
232
-
233
- while True:
234
- bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
235
- if bigram not in self.bpe_ranks:
236
- break
237
- first, second = bigram
238
- new_word = []
239
- i = 0
240
- while i < len(word):
241
- try:
242
- j = word.index(first, i)
243
- new_word.extend(word[i:j])
244
- i = j
245
- except BaseException:
246
- new_word.extend(word[i:])
247
- break
248
-
249
- if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
250
- new_word.append(first + second)
251
- i += 2
252
- else:
253
- new_word.append(word[i])
254
- i += 1
255
- new_word = tuple(new_word)
256
- word = new_word
257
- if len(word) == 1:
258
- break
259
- else:
260
- pairs = get_pairs(word)
261
- word = " ".join(word)
262
- return word
263
-
264
- def tokenize(self, text):
265
- """Tokenize a string."""
266
- bpe_tokens = []
267
- for token in re.findall(self.pat, text):
268
- if sys.version_info[0] == 2:
269
- token = "".join(self.byte_encoder[ord(b)] for b in token)
270
- else:
271
- token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
272
- bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
273
- return bpe_tokens
274
-
275
- def convert_tokens_to_ids(self, tokens):
276
- """Converts a sequence of tokens into ids using the vocab."""
277
- ids = []
278
- if isinstance(tokens, str) or (
279
- sys.version_info[0] == 2 and isinstance(tokens, unicode)
280
- ):
281
- if tokens in self.special_tokens:
282
- return self.special_tokens[tokens]
283
- else:
284
- return self.encoder.get(tokens, 0)
285
- for token in tokens:
286
- if token in self.special_tokens:
287
- ids.append(self.special_tokens[token])
288
- else:
289
- ids.append(self.encoder.get(token, 0))
290
- if len(ids) > self.max_len:
291
- logger.warning(
292
- "Token indices sequence length is longer than the specified maximum "
293
- " sequence length for this OpenAI GPT model ({} > {}). Running this"
294
- " sequence through the model will result in indexing errors".format(
295
- len(ids), self.max_len
296
- )
297
- )
298
- return ids
299
-
300
- def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
301
- """Converts a sequence of ids in BPE tokens using the vocab."""
302
- tokens = []
303
- for i in ids:
304
- if i in self.special_tokens_decoder:
305
- if not skip_special_tokens:
306
- tokens.append(self.special_tokens_decoder[i])
307
- else:
308
- tokens.append(self.decoder[i])
309
- return tokens
310
-
311
- def encode(self, text):
312
- return self.convert_tokens_to_ids(self.tokenize(text))
313
-
314
- def decode(self, tokens):
315
- text = "".join([self.decoder[token] for token in tokens])
316
- text = bytearray([self.byte_decoder[c] for c in text]).decode(
317
- "utf-8", errors=self.errors
318
- )
319
- return text
320
-
321
- def save_vocabulary(self, vocab_path):
322
- """Save the tokenizer vocabulary and merge files to a directory."""
323
- if not os.path.isdir(vocab_path):
324
- logger.error(
325
- "Vocabulary path ({}) should be a directory".format(vocab_path)
326
- )
327
- return
328
- vocab_file = os.path.join(vocab_path, VOCAB_NAME)
329
- merge_file = os.path.join(vocab_path, MERGES_NAME)
330
- special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME)
331
-
332
- with open(vocab_file, "w", encoding="utf-8") as f:
333
- f.write(json.dumps(self.encoder, ensure_ascii=False))
334
-
335
- index = 0
336
- with open(merge_file, "w", encoding="utf-8") as writer:
337
- writer.write("#version: 0.2\n")
338
- for bpe_tokens, token_index in sorted(
339
- self.bpe_ranks.items(), key=lambda kv: kv[1]
340
- ):
341
- if index != token_index:
342
- logger.warning(
343
- "Saving vocabulary to {}: BPE merge indices are not consecutive."
344
- " Please check that the tokenizer is not corrupted!".format(
345
- merge_file
346
- )
347
- )
348
- index = token_index
349
- writer.write(" ".join(bpe_tokens) + "\n")
350
- index += 1
351
-
352
- index = len(self.encoder)
353
- with open(special_tokens_file, "w", encoding="utf-8") as writer:
354
- for token, token_index in sorted(
355
- self.special_tokens.items(), key=lambda kv: kv[1]
356
- ):
357
- if index != token_index:
358
- logger.warning(
359
- "Saving special tokens vocabulary to {}: BPE indices are not consecutive."
360
- " Please check that the tokenizer is not corrupted!".format(
361
- special_tokens_file
362
- )
363
- )
364
- index = token_index
365
- writer.write(token + "\n")
366
- index += 1
367
-
368
- return vocab_file, merge_file, special_tokens_file
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
vocab/gpt_neox_chinese_v1/tokenizer/tokenizer.py DELETED
@@ -1,402 +0,0 @@
1
- # Copyright (c) 2021, EleutherAI
2
- # This file is based on code by the authors denoted below and has been modified from its original version.
3
- #
4
- # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
5
- #
6
- # Licensed under the Apache License, Version 2.0 (the "License");
7
- # you may not use this file except in compliance with the License.
8
- # You may obtain a copy of the License at
9
- #
10
- # http://www.apache.org/licenses/LICENSE-2.0
11
- #
12
- # Unless required by applicable law or agreed to in writing, software
13
- # distributed under the License is distributed on an "AS IS" BASIS,
14
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
- # See the License for the specific language governing permissions and
16
- # limitations under the License.
17
-
18
- """Megatron tokenizers."""
19
-
20
- from abc import ABC
21
- from abc import abstractmethod
22
-
23
- from tokenizers import Tokenizer
24
- from transformers import GPT2Tokenizer, GPT2TokenizerFast
25
- import numpy as np
26
- import sentencepiece as spm
27
- from typing import List, Union
28
- from .gpt2_tokenization import GPT2Tokenizer
29
-
30
-
31
- def build_tokenizer(args):
32
- """Initialize tokenizer."""
33
- if args.rank == 0:
34
- print("> building {} tokenizer ...".format(args.tokenizer_type), flush=True)
35
-
36
- # Select and instantiate the tokenizer.
37
- if args.tokenizer_type.lower() == "GPT2BPETokenizer".lower():
38
- assert args.vocab_file is not None
39
- assert args.merge_file is not None
40
- tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file)
41
- elif args.tokenizer_type.lower() == "SPMTokenizer".lower():
42
- assert args.vocab_file is not None
43
- tokenizer = SentencePieceTokenizer(args.vocab_file)
44
- elif args.tokenizer_type.lower() == "HFTokenizer".lower():
45
- assert args.vocab_file is not None
46
- tokenizer = HFTokenizer(args.vocab_file)
47
- elif args.tokenizer_type.lower() == "HFGPT2Tokenizer".lower():
48
- if args.vocab_file is None:
49
- print(
50
- "WARNING: No vocab file found, loading Huggingface's pretrained GPT2Tokenizer"
51
- )
52
- tokenizer = HFGPT2Tokenizer(args.vocab_file)
53
- elif args.tokenizer_type.lower() == "CharLevelTokenizer".lower():
54
- tokenizer = CharLevelTokenizer(vocab_size=512)
55
- elif args.tokenizer_type.lower() == "TiktokenTokenizer".lower():
56
- assert args.vocab_file is not None
57
- tokenizer = TiktokenTokenizer(args.vocab_file)
58
- else:
59
- raise NotImplementedError(
60
- "{} tokenizer is not " "implemented.".format(args.tokenizer_type)
61
- )
62
-
63
- # Add vocab size.
64
- args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size, args)
65
-
66
- return tokenizer
67
-
68
-
69
- def _vocab_size_with_padding(orig_vocab_size, args):
70
- """Pad vocab size so it is divisible by model parallel size and
71
- still having GPU friendly size."""
72
-
73
- after = orig_vocab_size
74
- multiple = args.make_vocab_size_divisible_by * args.model_parallel_size
75
- while (after % multiple) != 0:
76
- after += 1
77
- if args.rank == 0:
78
- print(
79
- " > padded vocab (size: {}) with {} dummy tokens "
80
- "(new size: {})".format(orig_vocab_size, after - orig_vocab_size, after),
81
- flush=True,
82
- )
83
- return after
84
-
85
-
86
- class AbstractTokenizer(ABC):
87
- """Abstract class for tokenizer."""
88
-
89
- def __init__(self, name):
90
- self.name = name
91
- super().__init__()
92
-
93
- @property
94
- @abstractmethod
95
- def vocab_size(self):
96
- pass
97
-
98
- @property
99
- @abstractmethod
100
- def vocab(self):
101
- """Dictionary from vocab text token to id token."""
102
- pass
103
-
104
- @property
105
- @abstractmethod
106
- def inv_vocab(self):
107
- """Dictionary from vocab id token to text token."""
108
- pass
109
-
110
- @abstractmethod
111
- def tokenize(self, text):
112
- pass
113
-
114
- def detokenize(self, token_ids):
115
- raise NotImplementedError(
116
- "detokenizer is not implemented for {} " "tokenizer".format(self.name)
117
- )
118
-
119
- @property
120
- def cls(self):
121
- raise NotImplementedError(
122
- "CLS is not provided for {} " "tokenizer".format(self.name)
123
- )
124
-
125
- @property
126
- def sep(self):
127
- raise NotImplementedError(
128
- "SEP is not provided for {} " "tokenizer".format(self.name)
129
- )
130
-
131
- @property
132
- def pad(self):
133
- raise NotImplementedError(
134
- "PAD is not provided for {} " "tokenizer".format(self.name)
135
- )
136
-
137
- @property
138
- def eod(self):
139
- raise NotImplementedError(
140
- "EOD is not provided for {} " "tokenizer".format(self.name)
141
- )
142
-
143
- @property
144
- def mask(self):
145
- raise NotImplementedError(
146
- "MASK is not provided for {} " "tokenizer".format(self.name)
147
- )
148
-
149
-
150
- class _GPT2BPETokenizer(AbstractTokenizer):
151
- """Original GPT2 BPE tokenizer."""
152
-
153
- def __init__(self, vocab_file, merge_file):
154
- name = "GPT2 BPE"
155
- super().__init__(name)
156
-
157
- self.tokenizer = GPT2Tokenizer(
158
- vocab_file, merge_file, errors="replace", special_tokens=[], max_len=None
159
- )
160
- self.eod_id = self.tokenizer.encoder["<|endoftext|>"]
161
-
162
- @property
163
- def vocab_size(self):
164
- return len(self.tokenizer.encoder)
165
-
166
- @property
167
- def vocab(self):
168
- return self.tokenizer.encoder
169
-
170
- @property
171
- def inv_vocab(self):
172
- return self.tokenizer.decoder
173
-
174
- def tokenize(self, text):
175
- return self.tokenizer.encode(text)
176
-
177
- def detokenize(self, token_ids):
178
- return self.tokenizer.decode(token_ids)
179
-
180
- @property
181
- def eod(self):
182
- return self.eod_id
183
-
184
-
185
- class SentencePieceTokenizer(AbstractTokenizer):
186
- """Designed to Integrate SP's Tokenizer."""
187
-
188
- def __init__(self, vocab_file):
189
- name = "SPM"
190
- super().__init__(name)
191
-
192
- self.tokenizer = spm.SentencePieceProcessor(model_file=vocab_file)
193
- self.eod_id = self.tokenizer.piece_to_id("<|endoftext|>")
194
-
195
- @property
196
- def vocab_size(self):
197
- return self.tokenizer.get_piece_size()
198
-
199
- @property
200
- def vocab(self):
201
- return {
202
- self.tokenizer.id_to_piece(idx): idx
203
- for idx in range(self.tokenizer.get_piece_size())
204
- }
205
-
206
- @property
207
- def inv_vocab(self):
208
- return {
209
- idx: self.tokenizer.id_to_piece(idx)
210
- for idx in range(self.tokenizer.get_piece_size())
211
- }
212
-
213
- def tokenize(self, text):
214
- return self.tokenizer.encode(text)
215
-
216
- def detokenize(self, token_ids):
217
- return self.tokenizer.decode(token_ids)
218
-
219
- @property
220
- def eod(self):
221
- return self.eod_id
222
-
223
-
224
- class HFTokenizer(AbstractTokenizer):
225
- """Designed to Integrate HF's Tokenizer library."""
226
-
227
- def __init__(self, vocab_file):
228
- name = "HFTokenizer"
229
- super().__init__(name)
230
-
231
- self.tokenizer = Tokenizer.from_file(vocab_file)
232
- self.eod_id = self.tokenizer.token_to_id("<|endoftext|>")
233
- self.pad_id = self.tokenizer.token_to_id("<|padding|>")
234
-
235
- @property
236
- def vocab_size(self):
237
- return self.tokenizer.get_vocab_size()
238
-
239
- @property
240
- def vocab(self):
241
- return self.tokenizer.get_vocab()
242
-
243
- @property
244
- def inv_vocab(self):
245
- return self.tokenizer.decoder
246
-
247
- def tokenize(self, text: str):
248
- return self.tokenizer.encode(text).ids
249
-
250
- def tokenize_batch(self, text_batch: Union[List[str], str]):
251
- return self.tokenizer.encode_batch(text_batch)
252
-
253
- def detokenize(self, token_ids):
254
- return self.tokenizer.decode(token_ids)
255
-
256
- @property
257
- def eod(self):
258
- return self.eod_id
259
-
260
-
261
- class HFGPT2Tokenizer(AbstractTokenizer):
262
- """Designed to Integrate the pretrained OpenAI GPT2 Tokenizers from HF"""
263
-
264
- def __init__(self, vocab_file=None, fast=True):
265
- name = "HFGPT2Tokenizer"
266
- if fast:
267
- name += "Fast"
268
- super().__init__(name)
269
- if vocab_file is None:
270
- vocab_file = "gpt2"
271
- if fast:
272
- self.tokenizer = GPT2TokenizerFast.from_pretrained(vocab_file)
273
- else:
274
- self.tokenizer = GPT2Tokenizer.from_pretrained(vocab_file)
275
-
276
- self.tokenizer.add_special_tokens({"pad_token": "<|padding|>"})
277
- self.eod_id = self.tokenizer.eos_token_id
278
- self.pad_id = self.tokenizer.pad_token_id
279
-
280
- @property
281
- def vocab_size(self):
282
- return len(self.tokenizer)
283
-
284
- @property
285
- def vocab(self):
286
- return self.tokenizer.get_vocab()
287
-
288
- @property
289
- def inv_vocab(self):
290
- return self.tokenizer._tokenizer.decoder
291
-
292
- def tokenize(self, text: str):
293
- return self.tokenizer.encode(text)
294
-
295
- def tokenize_batch(self, text_batch: Union[List[str], str]):
296
- if isinstance(text_batch, str):
297
- text_batch = [text_batch]
298
- return [self.tokenize(t) for t in text_batch]
299
-
300
- def detokenize(self, token_ids):
301
- return self.tokenizer.decode(token_ids)
302
-
303
- @property
304
- def eod(self):
305
- return self.eod_id
306
-
307
-
308
- class CharLevelTokenizer(AbstractTokenizer):
309
- """Character Level Tokenizer"""
310
-
311
- def __init__(self, vocab_size):
312
- name = "CharLevelTokenizer"
313
- super().__init__(name)
314
- self._vocab_size = vocab_size
315
- self.eod_id = 0
316
- self.pad_id = 1
317
-
318
- def clamp(self, n):
319
- return max(32, min(n, self.vocab_size))
320
-
321
- @property
322
- def vocab_size(self):
323
- return self._vocab_size
324
-
325
- @property
326
- def vocab(self):
327
- raise NotImplementedError
328
-
329
- @property
330
- def inv_vocab(self):
331
- raise NotImplementedError
332
-
333
- def decode_token(self, token: int):
334
- return str(chr(self.clamp(token)))
335
-
336
- def tokenize(self, text: str):
337
- return list(np.fromstring(text, dtype=np.uint8))
338
-
339
- def tokenize_batch(self, text_batch: Union[List[str], str]):
340
- if isinstance(text_batch, list):
341
- return [self.tokenize(s) for s in text_batch]
342
- else:
343
- return self.tokenize(text_batch)
344
-
345
- def detokenize(self, token_ids):
346
- return "".join(list(map(self.decode_token, token_ids)))
347
-
348
- @property
349
- def eod(self):
350
- return self.eod_id
351
-
352
-
353
- class TiktokenTokenizer(AbstractTokenizer):
354
- """Tokenizer from OpenAI's tiktoken implementation"""
355
-
356
- def __init__(self, vocab_file):
357
- try:
358
- import tiktoken
359
- except ModuleNotFoundError:
360
- print("Please install tiktoken: (https://github.com/openai/tiktoken)")
361
- raise Exception
362
-
363
- name = "TiktokenTokenizer"
364
- super().__init__(name)
365
-
366
- self.tokenizer = tiktoken.get_encoding(vocab_file)
367
- self.eod_id = self.tokenizer.eot_token
368
- self.pad_id = None
369
-
370
- @property
371
- def vocab_size(self):
372
- return self.tokenizer.n_vocab
373
-
374
- @property
375
- def vocab(self):
376
- raise NotImplementedError(
377
- "TiktokenTokenizer does not implement vocabulary access."
378
- )
379
-
380
- @property
381
- def inv_vocab(self):
382
- raise NotImplementedError(
383
- "TiktokenTokenizer does not implement vocabulary access. \
384
- To get the idx-th token in vocabulary, use tokenizer.decode([idx]) ."
385
- )
386
-
387
- def tokenize(self, text: str):
388
- return self.tokenizer.encode(text) # , allowed_special="all")
389
-
390
- def tokenize_batch(self, text_batch: List[str]):
391
- return self.tokenizer.encode_batch(text_batch, allowed_special="all")
392
-
393
- def detokenize(self, token_ids):
394
- return self.tokenizer.decode(tokens=token_ids, errors="strict")
395
-
396
- @property
397
- def eod(self):
398
- return self.eod_id
399
-
400
- @property
401
- def pad(self):
402
- raise NotImplementedError
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
vocab/gpt_neox_chinese_v1/tokenizer/train_tokenizer.py DELETED
@@ -1,126 +0,0 @@
1
- # Copyright (c) 2021, EleutherAI
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- """
16
- Assumes a dataset of jsonl files in the same format as the neox training set.
17
- """
18
-
19
- from tokenizers import Tokenizer, decoders, models, pre_tokenizers, processors, trainers
20
- from tokenizers.normalizers import NFKC
21
-
22
- from glob import glob
23
- import os
24
- import json
25
- import argparse
26
-
27
-
28
- def load_jsonl(input_path, quiet=True) -> list:
29
- """
30
- Read list of objects from a JSON lines file.
31
- """
32
- data = []
33
- with open(input_path, "r", encoding="utf-8") as f:
34
- for line in f:
35
- data.append(json.loads(line.rstrip("\n|\r")))
36
- if not quiet:
37
- print("Loaded {} records from {}".format(len(data), input_path))
38
- return data
39
-
40
-
41
- def json_iterator(input_dir, text_key="text"):
42
- all_jsonls = glob(f"{input_dir}/*.jsonl") + glob(f"{input_dir}/*.json")
43
- for j in all_jsonls:
44
- data = load_jsonl(j)
45
- for doc in data:
46
- yield doc[text_key]
47
-
48
-
49
- def train_tokenizer(
50
- input_dir: str, save_path: str, tokenizer_type: str = "BPE", vocab_size: int = 52000
51
- ):
52
- """
53
- Trains a tokenizer on all the json files in `input_dir` and saves it to `save_path`
54
-
55
- :param input_dir: input directory containing jsonl files
56
- :param save_path: path to save tokenizer to
57
- :param tokenizer_type: type of tokenizer to train.
58
- :param vocab_size: int, size of tokenizer's vocab
59
- :return:
60
- """
61
-
62
- if tokenizer_type == "BPE":
63
- model = models.BPE()
64
- else:
65
- raise NotImplementedError(f"Tokenizer type {tokenizer_type} not implemented")
66
- tokenizer = Tokenizer(model)
67
-
68
- # Customize pre-tokenization and decoding
69
- tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
70
- tokenizer.decoder = decoders.ByteLevel()
71
- tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
72
- tokenizer.normalizer = NFKC()
73
-
74
- # And then train
75
- trainer = trainers.BpeTrainer(
76
- vocab_size=vocab_size, special_tokens=["<|endoftext|>", "<|padding|>"]
77
- )
78
- tokenizer.train_from_iterator(json_iterator(input_dir), trainer)
79
-
80
- # And Save it
81
- tokenizer.save(save_path, pretty=True)
82
- print(f"Tokenizer saved at {save_path}")
83
-
84
-
85
- def parse_args():
86
- parser = argparse.ArgumentParser(
87
- description="script for training a multilingual "
88
- "HF tokenizer on CC dumps with upweighting for low resource languages"
89
- )
90
- parser.add_argument(
91
- "--json_input_dir",
92
- type=str,
93
- help="Path to folder containing tokenizer training data in jsonl format",
94
- )
95
- parser.add_argument(
96
- "--tokenizer_output_path",
97
- type=str,
98
- help="Path to which your trained tokenizer will be saved (should end in .json)",
99
- )
100
- parser.add_argument(
101
- "--tokenizer_type",
102
- type=str,
103
- help="type of tokenizer to train, currently only BPE is supported",
104
- choices=["BPE"],
105
- default=["BPE"],
106
- )
107
- parser.add_argument(
108
- "-v",
109
- "--vocab_size",
110
- help="vocabulary size of tokenizer, default=52k",
111
- type=int,
112
- default=52000,
113
- )
114
- return parser.parse_args()
115
-
116
-
117
- if __name__ == "__main__":
118
-
119
- args = parse_args()
120
-
121
- train_tokenizer(
122
- args.json_input_dir,
123
- save_path=args.tokenizer_output_path,
124
- tokenizer_type=args.tokenizer_type,
125
- vocab_size=args.vocab_size,
126
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
vocab/gpt_neox_chinese_v1/trouble-shooting.md DELETED
@@ -1,22 +0,0 @@
1
-
2
-
3
- ## Exception: data did not match any variant of untagged enum ModelWrapper at line 108219 column 3
4
-
5
-
6
-
7
-
8
- ## The OrderedVocab you are attempting to save contains a hole for index 50254, your vocabulary could be corrupted !
9
-
10
-
11
- ```
12
- The OrderedVocab you are attempting to save contains a hole for index 50254, your vocabulary could be corrupted !
13
- The OrderedVocab you are attempting to save contains a hole for index 50255, your vocabulary could be corrupted !
14
- The OrderedVocab you are attempting to save contains a hole for index 50256, your vocabulary could be corrupted !
15
- ```
16
-
17
-
18
- 原因:50254 这些token并未在vocab中定义,只在 `added_tokens` 里定义了。
19
-
20
- ## ss
21
-
22
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
vocab/moss/__init__.py CHANGED
@@ -1,6 +1,6 @@
1
 
2
  import os
3
- from transformers import AutoTokenizer, BloomTokenizerFast
4
 
5
  CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
6
  TOKENIZER_DIR = os.path.join(CURRENT_DIR, "moss-moon-003-sft")
 
1
 
2
  import os
3
+ from transformers import AutoTokenizer
4
 
5
  CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
6
  TOKENIZER_DIR = os.path.join(CURRENT_DIR, "moss-moon-003-sft")
vocab/text_davinci_003/__init__.py CHANGED
@@ -1,70 +1,25 @@
1
  """
2
- TODO
3
- """
4
-
5
- import tiktoken
6
- from tiktoken import Encoding
7
- from utils.log_util import logger
8
-
9
- tokenizer = tiktoken.encoding_for_model('text-davinci-003')
10
- tokenizer.vocab_size = tokenizer.n_vocab
11
-
12
- tokenizer.comments = ""
13
- tokenizer.reversible = True
14
-
15
 
16
 
17
 
18
- def decode(self, tokens, errors="replace", skip_special_tokens=False):
19
- """
20
- 默认的decode,可能会报错,详见 decode_test.py
21
- skip_special_tokens 是为了兼容 hf_tokenizer
22
- """
23
- try:
24
- decode_str = self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
25
- except:
26
- decode_str = "null"
27
- return decode_str
28
 
29
- def convert_ids_to_tokens(self, tokens, skip_special_tokens=False):
30
- """
31
- 为什么没有这个方法?
32
- """
33
- try:
34
- return tokenizer.decode_tokens_bytes(tokens)
35
- except:
36
- # 什么要返回None?见zh_util.py
37
- # 16个空闲id, 100256 100261-100275
38
- return [None for token in tokens]
39
 
40
- def get_vocab(self, token_type="str"):
41
- """Returns vocab as a dict
42
- :param token_type: ["str", "byte"]
43
- :return:
44
- """
45
- vocab = {}
46
- key_error_list = []
47
- unicode_decode_error_list = []
48
- for i in range(self.vocab_size):
49
- try:
50
- token_byte = self.convert_ids_to_tokens([i])[0]
51
- if token_byte is None:
52
- continue
53
- # token_str = token_byte.decode("utf-8")
54
- vocab[token_byte] = i
55
 
56
- except UnicodeDecodeError: # 773 UnicodeDecodeError
57
- unicode_decode_error_list.append((i, str(token_byte)))
58
- vocab[token_byte] = i
59
 
60
- # vocab.update(self.added_tokens_encoder)
61
- logger.info(f"text-davinci-003 {len(key_error_list)} KeyError: {key_error_list}")
62
- logger.info(f"text-davinci-003 {len(unicode_decode_error_list)} UnicodeDecodeError: {unicode_decode_error_list[:5]}")
63
- return vocab
64
 
 
 
65
 
 
 
66
 
67
- # tiktoken patch
68
- Encoding.decode = decode
69
- Encoding.convert_ids_to_tokens = convert_ids_to_tokens
70
- Encoding.get_vocab = get_vocab
 
1
  """
2
+ ,请
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
 
5
 
6
+ ## tiktoken API
 
 
 
 
 
 
 
 
 
7
 
8
+ tokens = enc.encode("hello world")
9
+ assert enc.decode(tokens) == "hello world"
10
+ assert enc.decode_bytes(tokens) == b"hello world"
11
+ assert enc.decode_tokens_bytes(tokens) == [b"hello", b" world"]
 
 
 
 
 
 
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
+ decode_single_token_bytes
15
+ """
 
16
 
17
+ import tiktoken
18
+ import tokenizer.tiktoken_patch
 
 
19
 
20
+ tokenizer = tiktoken.encoding_for_model('text-davinci-003')
21
+ tokenizer.vocab_size = tokenizer.n_vocab
22
 
23
+ tokenizer.comments = ""
24
+ tokenizer.reversible = True
25