mtasic85 committed on
Commit
aa0559e
·
1 Parent(s): 8c82fc3
scripts/train_tokenizer.py CHANGED
@@ -1,89 +1,139 @@
 
1
  import string
2
 
3
  from datasets import load_dataset
4
- from tokenizers import Tokenizer, models, pre_tokenizers, trainers
5
- from tokenizers.processors import TemplateProcessing
6
  from transformers import PreTrainedTokenizerFast
7
-
8
- # dataset_0 = (
9
- # load_dataset('wikimedia/wikisource', lang, split='train')
10
- # for lang in ['20231201.ar', '20231201.as', '20231201.az', '20231201.ban', '20231201.be', '20231201.bg', '20231201.bn', '20231201.br', '20231201.bs', '20231201.ca', '20231201.cs', '20231201.cy', '20231201.da', '20231201.de', '20231201.el', '20231201.en', '20231201.eo', '20231201.es', '20231201.et', '20231201.eu', '20231201.fa', '20231201.fi', '20231201.fo', '20231201.fr', '20231201.gl', '20231201.gu', '20231201.he', '20231201.hi', '20231201.hr', '20231201.hu', '20231201.hy', '20231201.id', '20231201.is', '20231201.it', '20231201.ja', '20231201.jv', '20231201.kn', '20231201.ko', '20231201.la', '20231201.li', '20231201.lij', '20231201.lt', '20231201.mk', '20231201.ml', '20231201.mr', '20231201.nap', '20231201.nl', '20231201.no', '20231201.or', '20231201.pa', '20231201.pl', '20231201.pms', '20231201.pt', '20231201.ro', '20231201.ru', '20231201.sa', '20231201.sah', '20231201.sk', '20231201.sl', '20231201.sr', '20231201.su', '20231201.sv', '20231201.ta', '20231201.te', '20231201.th', '20231201.tr', '20231201.uk', '20231201.vec', '20231201.vi', '20231201.wa', '20231201.yi', '20231201.zh', '20231201.zh-min-nan']
11
- # )
12
-
13
- dataset_1 = (
14
- load_dataset('xu-song/cc100-samples', lang, split='train')
15
- for lang in ['am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'bn_rom', 'br', 'bs', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'ff', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl', 'gn', 'gu', 'ha', 'he', 'hi', 'hi_rom', 'hr', 'ht', 'hu', 'hy', 'id', 'ig', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 'km', 'kn', 'ko', 'ku', 'ky', 'la', 'lg', 'li', 'ln', 'lo', 'lt', 'lv', 'mg', 'mk', 'ml', 'mn', 'mr', 'ms', 'my', 'my_zaw', 'ne', 'nl', 'no', 'ns', 'om', 'or', 'pa', 'pl', 'ps', 'pt', 'qu', 'rm', 'ro', 'ru', 'sa', 'si', 'sc', 'sd', 'sk', 'sl', 'so', 'sq', 'sr', 'ss', 'su', 'sv', 'sw', 'ta', 'ta_rom', 'te', 'te_rom', 'th', 'tl', 'tn', 'tr', 'ug', 'uk', 'ur', 'ur_rom', 'uz', 'vi', 'wo', 'xh', 'yi', 'yo', 'zh-Hans', 'zh-Hant', 'zu']
16
- )
17
-
18
- dataset_2 = (
19
- load_dataset('csebuetnlp/xlsum', lang, split='train')
20
- for lang in ['amharic', 'arabic', 'azerbaijani', 'bengali', 'burmese', 'chinese_simplified', 'chinese_traditional', 'english', 'french', 'gujarati', 'hausa', 'hindi', 'igbo', 'indonesian', 'japanese', 'kirundi', 'korean', 'kyrgyz', 'marathi', 'nepali', 'oromo', 'pashto', 'persian', 'pidgin', 'portuguese', 'punjabi', 'russian', 'scottish_gaelic', 'serbian_cyrillic', 'serbian_latin', 'sinhala', 'somali', 'spanish', 'swahili', 'tamil', 'telugu', 'thai', 'tigrinya', 'turkish', 'ukrainian', 'urdu', 'uzbek', 'vietnamese', 'welsh', 'yoruba']
21
- )
22
-
23
- # dataset_3 = load_dataset('recursal/SuperWikiNEXT-32B', split='train')
24
- dataset_4 = load_dataset('m-a-p/CodeFeedback-Filtered-Instruction', split='train')
25
- dataset_5 = load_dataset('nampdn-ai/tiny-codes', split='train')
26
- # dataset_6 = load_dataset('ajibawa-2023/Maths-College', split='train')
27
- dataset_7 = load_dataset('microsoft/orca-math-word-problems-200k', split='train')
28
- dataset_8 = load_dataset('mlabonne/FineTome-100k', split='train')
29
- dataset_9 = load_dataset('arcee-ai/agent-data', split='train')
30
- dataset_10 = [
31
- load_dataset('cognitivecomputations/SystemChat-2.0', data_files='SystemChat_filtered.jsonl', split='train'),
32
- load_dataset('cognitivecomputations/SystemChat-2.0', data_files='SystemChat_multilingual.jsonl', split='train'),
33
- ]
34
- dataset_11 = load_dataset('badrex/llm-emoji-dataset', split='train')
35
 
36
 
37
  def batch_iterator():
 
 
 
 
 
 
 
 
38
  # for d in dataset_0:
39
  # for row in d['text']:
40
  # yield row
 
 
 
 
 
 
 
 
41
 
42
  for d in dataset_1:
43
  for row in d['text']:
44
  yield row
45
 
46
- for d in dataset_2:
47
- for row in d['text']:
48
- yield row
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  # for row in dataset_3['text']:
51
  # yield row
 
 
 
 
 
52
 
53
  for row in dataset_4:
54
  yield row['query'] + '\n' + row['answer']
55
 
56
- for row in dataset_5:
57
- yield row['prompt'] + '\n' + row['response']
 
 
 
 
 
 
 
 
58
 
 
 
59
  # for row in dataset_6:
60
  # yield row['instruction'] + '\n' + row['output']
 
 
 
 
 
61
 
62
  for row in dataset_7:
63
  yield row['question'] + '\n' + row['answer']
64
 
65
- for row in dataset_8['conversations']:
66
- yield '\n'.join(n['value'] for n in row)
67
-
68
- for row in dataset_9['conversations']:
69
- yield '\n'.join(n['value'] for n in row)
70
 
71
- for d in dataset_10:
72
- for row in d['messages']:
73
- yield '\n'.join(n['content'] for n in row)
74
 
75
- for row in dataset_11:
76
- yield f'{row["character"]}\n{row["unicode"]}\n{row["short description"]}\n{row["tags"]}\n{row["LLM description"]}'
77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
  special_tokens = [
 
80
  '<s>',
81
  '</s>',
82
- '<pad>',
83
- '<unk>',
84
- '<mask>',
85
- '<|im_start|>',
86
  '<|im_end|>',
 
87
  '<tools>',
88
  '</tools>',
89
  '<tool_call>',
@@ -101,32 +151,43 @@ for i in range(2, 25):
101
  for i in range(64 - len(special_tokens)):
102
  special_tokens.append(f'<|reserved_{i}|>')
103
 
104
- ascii_chars = string.ascii_letters + string.ascii_lowercase + string.ascii_uppercase + string.digits + string.punctuation
 
 
 
 
 
 
105
 
106
- #
107
- # tokenizer
108
- #
109
- tokenizer = Tokenizer(models.BPE())
110
 
111
- # set up pre-tokenizer to split on whitespace and punctuation
112
- tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
113
- pre_tokenizers.WhitespaceSplit(),
114
- pre_tokenizers.Punctuation(),
 
115
  ])
116
 
117
- # trainer
118
- trainer = trainers.BpeTrainer(
 
 
 
 
 
119
  vocab_size=32064,
 
120
  special_tokens=special_tokens,
121
- initial_alphabet=list(ascii_chars),
122
  )
123
 
124
- # train the tokenizer
125
- tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)
 
126
 
127
- #
128
- # fast_tokenizer
129
- #
130
  CHATML_CHAT_TEMPLATE = (
131
  "{% for message in messages %}"
132
  "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
@@ -140,10 +201,12 @@ fast_tokenizer = PreTrainedTokenizerFast(
140
  tokenizer_object=tokenizer,
141
  chat_template=CHATML_CHAT_TEMPLATE,
142
  bos_token='<s>',
143
- eos_token='</s>',
144
  unk_token='<unk>',
145
- pad_token='<pad>',
146
- mask_token='<mask>',
 
 
147
  )
148
 
149
- fast_tokenizer.save_pretrained('..')
 
1
+ import gc
2
  import string
3
 
4
  from datasets import load_dataset
 
 
5
  from transformers import PreTrainedTokenizerFast
6
+ from tokenizers import Tokenizer, normalizers, decoders
7
+ from tokenizers.models import BPE
8
+ from tokenizers.trainers import BpeTrainer
9
+ from tokenizers.processors import TemplateProcessing
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
 
12
  def batch_iterator():
13
+ # yield 'На вр брда врба мрда.\nНема струје због олује.'
14
+ # return
15
+
16
+ # dataset_0 = (
17
+ # load_dataset('wikimedia/wikisource', lang, split='train')
18
+ # for lang in ['20231201.ar', '20231201.as', '20231201.az', '20231201.ban', '20231201.be', '20231201.bg', '20231201.bn', '20231201.br', '20231201.bs', '20231201.ca', '20231201.cs', '20231201.cy', '20231201.da', '20231201.de', '20231201.el', '20231201.en', '20231201.eo', '20231201.es', '20231201.et', '20231201.eu', '20231201.fa', '20231201.fi', '20231201.fo', '20231201.fr', '20231201.gl', '20231201.gu', '20231201.he', '20231201.hi', '20231201.hr', '20231201.hu', '20231201.hy', '20231201.id', '20231201.is', '20231201.it', '20231201.ja', '20231201.jv', '20231201.kn', '20231201.ko', '20231201.la', '20231201.li', '20231201.lij', '20231201.lt', '20231201.mk', '20231201.ml', '20231201.mr', '20231201.nap', '20231201.nl', '20231201.no', '20231201.or', '20231201.pa', '20231201.pl', '20231201.pms', '20231201.pt', '20231201.ro', '20231201.ru', '20231201.sa', '20231201.sah', '20231201.sk', '20231201.sl', '20231201.sr', '20231201.su', '20231201.sv', '20231201.ta', '20231201.te', '20231201.th', '20231201.tr', '20231201.uk', '20231201.vec', '20231201.vi', '20231201.wa', '20231201.yi', '20231201.zh', '20231201.zh-min-nan']
19
+ # )
20
+ #
21
  # for d in dataset_0:
22
  # for row in d['text']:
23
  # yield row
24
+ #
25
+ # del dataset_0
26
+ # gc.collect()
27
+
28
+ dataset_1 = (
29
+ load_dataset('xu-song/cc100-samples', lang, split='train')
30
+ for lang in ['am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'bn_rom', 'br', 'bs', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'ff', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl', 'gn', 'gu', 'ha', 'he', 'hi', 'hi_rom', 'hr', 'ht', 'hu', 'hy', 'id', 'ig', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 'km', 'kn', 'ko', 'ku', 'ky', 'la', 'lg', 'li', 'ln', 'lo', 'lt', 'lv', 'mg', 'mk', 'ml', 'mn', 'mr', 'ms', 'my', 'my_zaw', 'ne', 'nl', 'no', 'ns', 'om', 'or', 'pa', 'pl', 'ps', 'pt', 'qu', 'rm', 'ro', 'ru', 'sa', 'si', 'sc', 'sd', 'sk', 'sl', 'so', 'sq', 'sr', 'ss', 'su', 'sv', 'sw', 'ta', 'ta_rom', 'te', 'te_rom', 'th', 'tl', 'tn', 'tr', 'ug', 'uk', 'ur', 'ur_rom', 'uz', 'vi', 'wo', 'xh', 'yi', 'yo', 'zh-Hans', 'zh-Hant', 'zu']
31
+ )
32
 
33
  for d in dataset_1:
34
  for row in d['text']:
35
  yield row
36
 
37
+ del dataset_1
38
+ gc.collect()
 
39
 
40
+ # dataset_2 = (
41
+ # load_dataset('csebuetnlp/xlsum', lang, split='train')
42
+ # for lang in ['amharic', 'arabic', 'azerbaijani', 'bengali', 'burmese', 'chinese_simplified', 'chinese_traditional', 'english', 'french', 'gujarati', 'hausa', 'hindi', 'igbo', 'indonesian', 'japanese', 'kirundi', 'korean', 'kyrgyz', 'marathi', 'nepali', 'oromo', 'pashto', 'persian', 'pidgin', 'portuguese', 'punjabi', 'russian', 'scottish_gaelic', 'serbian_cyrillic', 'serbian_latin', 'sinhala', 'somali', 'spanish', 'swahili', 'tamil', 'telugu', 'thai', 'tigrinya', 'turkish', 'ukrainian', 'urdu', 'uzbek', 'vietnamese', 'welsh', 'yoruba']
43
+ # )
44
+ #
45
+ # for d in dataset_2:
46
+ # for row in d['text']:
47
+ # yield row
48
+ #
49
+ # del dataset_2
50
+ # gc.collect()
51
+
52
+ # dataset_3 = load_dataset('recursal/SuperWikiNEXT-32B', split='train')
53
+ #
54
  # for row in dataset_3['text']:
55
  # yield row
56
+ #
57
+ # del dataset_3
58
+ # gc.collect()
59
+
60
+ dataset_4 = load_dataset('m-a-p/CodeFeedback-Filtered-Instruction', split='train')
61
 
62
  for row in dataset_4:
63
  yield row['query'] + '\n' + row['answer']
64
 
65
+ del dataset_4
66
+ gc.collect()
67
+
68
+ # dataset_5 = load_dataset('nampdn-ai/tiny-codes', split='train')
69
+ #
70
+ # for row in dataset_5:
71
+ # yield row['prompt'] + '\n' + row['response']
72
+ #
73
+ # del dataset_5
74
+ # gc.collect()
75
 
76
+ # dataset_6 = load_dataset('ajibawa-2023/Maths-College', split='train')
77
+ #
78
  # for row in dataset_6:
79
  # yield row['instruction'] + '\n' + row['output']
80
+ #
81
+ # del dataset_6
82
+ # gc.collect()
83
+
84
+ dataset_7 = load_dataset('microsoft/orca-math-word-problems-200k', split='train')
85
 
86
  for row in dataset_7:
87
  yield row['question'] + '\n' + row['answer']
88
 
89
+ del dataset_7
90
+ gc.collect()
 
 
 
91
 
92
+ dataset_8 = load_dataset('mlabonne/FineTome-100k', split='train')
 
 
93
 
94
+ for row in dataset_8['conversations']:
95
+ yield '\n'.join(n['value'] for n in row)
96
 
97
+ del dataset_8
98
+ gc.collect()
99
+
100
+ # dataset_9 = load_dataset('arcee-ai/agent-data', split='train')
101
+ #
102
+ # for row in dataset_9['conversations']:
103
+ # yield '\n'.join(n['value'] for n in row)
104
+ #
105
+ # del dataset_9
106
+ # gc.collect()
107
+
108
+ # dataset_10 = (
109
+ # load_dataset('cognitivecomputations/SystemChat-2.0', data_files='SystemChat_filtered.jsonl', split='train'),
110
+ # load_dataset('cognitivecomputations/SystemChat-2.0', data_files='SystemChat_multilingual.jsonl', split='train'),
111
+ # )
112
+ #
113
+ # for d in dataset_10:
114
+ # for row in d['messages']:
115
+ # yield '\n'.join(n['content'] for n in row)
116
+ #
117
+ # del dataset_10
118
+ # gc.collect()
119
+
120
+ # dataset_11 = load_dataset('badrex/llm-emoji-dataset', split='train')
121
+ #
122
+ # for row in dataset_11:
123
+ # yield f'{row["character"]}\n{row["unicode"]}\n{row["short description"]}\n{row["tags"]}\n{row["LLM description"]}'
124
+ #
125
+ # del dataset_11
126
+ # gc.collect()
127
+
128
+ bpe = BPE(unk_token='<unk>', fuse_unk=True, byte_fallback=True)
129
+ tokenizer = Tokenizer(bpe)
130
 
131
  special_tokens = [
132
+ '<unk>',
133
  '<s>',
134
  '</s>',
 
 
 
 
135
  '<|im_end|>',
136
+ '<|im_start|>',
137
  '<tools>',
138
  '</tools>',
139
  '<tool_call>',
 
151
  for i in range(64 - len(special_tokens)):
152
  special_tokens.append(f'<|reserved_{i}|>')
153
 
154
+ tokenizer.add_special_tokens(special_tokens)
155
+
156
+ ascii_chars = list(string.ascii_letters + string.ascii_lowercase + string.ascii_uppercase + string.digits + string.punctuation)
157
+
158
+ dataset_11 = load_dataset('badrex/llm-emoji-dataset', split='train')
159
+ emoji_chars = [row['character'] for row in dataset_11 if len(row['character']) == 1]
160
+ del dataset_11
161
 
162
+ tokenizer.normalizer = normalizers.Sequence([
163
+ normalizers.Prepend("▁"),
164
+ normalizers.Replace(" ", "▁"),
165
+ ])
166
 
167
+ tokenizer.decoder = decoders.Sequence([
168
+ decoders.Replace("▁", " "),
169
+ decoders.ByteFallback(),
170
+ decoders.Fuse(),
171
+ decoders.Strip(' ', 1, 0),
172
  ])
173
 
174
+ tokenizer.post_processor = TemplateProcessing(
175
+ single='$A:0', # $A stands for the first (only) input sequence; :0 assigns its tokens type ID 0
176
+ pair='$A:0 $B:1', # For pairs, $A is the first sequence (type ID 0) and $B is the second (type ID 1)
177
+ special_tokens=[],
178
+ )
179
+
180
+ trainer = BpeTrainer(
181
  vocab_size=32064,
182
+ min_frequency=2,
183
  special_tokens=special_tokens,
184
+ initial_alphabet=ascii_chars + emoji_chars,
185
  )
186
 
187
+ tokenizer.train_from_iterator(batch_iterator(), trainer)
188
+ tokenizer.save('../tokenizer.json')
189
+ tokenizer.model.save('../')
190
 
 
 
 
191
  CHATML_CHAT_TEMPLATE = (
192
  "{% for message in messages %}"
193
  "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
 
201
  tokenizer_object=tokenizer,
202
  chat_template=CHATML_CHAT_TEMPLATE,
203
  bos_token='<s>',
204
+ eos_token='<|im_end|>',
205
  unk_token='<unk>',
206
+ pad_token='</s>',
207
+ clean_up_tokenization_spaces=False,
208
+ spaces_between_special_tokens=False,
209
+ use_default_system_prompt=False,
210
  )
211
 
212
+ fast_tokenizer.save_pretrained('../')
special_tokens_map.json DELETED
@@ -1,7 +0,0 @@
1
- {
2
- "bos_token": "<s>",
3
- "eos_token": "</s>",
4
- "mask_token": "<mask>",
5
- "pad_token": "<pad>",
6
- "unk_token": "<unk>"
7
- }
 
 
 
 
 
 
 
 
tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json DELETED
@@ -1,525 +0,0 @@
1
- {
2
- "added_tokens_decoder": {
3
- "0": {
4
- "content": "<s>",
5
- "lstrip": false,
6
- "normalized": false,
7
- "rstrip": false,
8
- "single_word": false,
9
- "special": true
10
- },
11
- "1": {
12
- "content": "</s>",
13
- "lstrip": false,
14
- "normalized": false,
15
- "rstrip": false,
16
- "single_word": false,
17
- "special": true
18
- },
19
- "2": {
20
- "content": "<pad>",
21
- "lstrip": false,
22
- "normalized": false,
23
- "rstrip": false,
24
- "single_word": false,
25
- "special": true
26
- },
27
- "3": {
28
- "content": "<unk>",
29
- "lstrip": false,
30
- "normalized": false,
31
- "rstrip": false,
32
- "single_word": false,
33
- "special": true
34
- },
35
- "4": {
36
- "content": "<mask>",
37
- "lstrip": false,
38
- "normalized": false,
39
- "rstrip": false,
40
- "single_word": false,
41
- "special": true
42
- },
43
- "5": {
44
- "content": "<|im_start|>",
45
- "lstrip": false,
46
- "normalized": false,
47
- "rstrip": false,
48
- "single_word": false,
49
- "special": true
50
- },
51
- "6": {
52
- "content": "<|im_end|>",
53
- "lstrip": false,
54
- "normalized": false,
55
- "rstrip": false,
56
- "single_word": false,
57
- "special": true
58
- },
59
- "7": {
60
- "content": "<tools>",
61
- "lstrip": false,
62
- "normalized": false,
63
- "rstrip": false,
64
- "single_word": false,
65
- "special": true
66
- },
67
- "8": {
68
- "content": "</tools>",
69
- "lstrip": false,
70
- "normalized": false,
71
- "rstrip": false,
72
- "single_word": false,
73
- "special": true
74
- },
75
- "9": {
76
- "content": "<tool_call>",
77
- "lstrip": false,
78
- "normalized": false,
79
- "rstrip": false,
80
- "single_word": false,
81
- "special": true
82
- },
83
- "10": {
84
- "content": "</tool_call>",
85
- "lstrip": false,
86
- "normalized": false,
87
- "rstrip": false,
88
- "single_word": false,
89
- "special": true
90
- },
91
- "11": {
92
- "content": "<tool_response>",
93
- "lstrip": false,
94
- "normalized": false,
95
- "rstrip": false,
96
- "single_word": false,
97
- "special": true
98
- },
99
- "12": {
100
- "content": "</tool_response>",
101
- "lstrip": false,
102
- "normalized": false,
103
- "rstrip": false,
104
- "single_word": false,
105
- "special": true
106
- },
107
- "13": {
108
- "content": "system",
109
- "lstrip": false,
110
- "normalized": false,
111
- "rstrip": false,
112
- "single_word": false,
113
- "special": true
114
- },
115
- "14": {
116
- "content": "user",
117
- "lstrip": false,
118
- "normalized": false,
119
- "rstrip": false,
120
- "single_word": false,
121
- "special": true
122
- },
123
- "15": {
124
- "content": "assistant",
125
- "lstrip": false,
126
- "normalized": false,
127
- "rstrip": false,
128
- "single_word": false,
129
- "special": true
130
- },
131
- "16": {
132
- "content": " ",
133
- "lstrip": false,
134
- "normalized": false,
135
- "rstrip": false,
136
- "single_word": false,
137
- "special": true
138
- },
139
- "17": {
140
- "content": " ",
141
- "lstrip": false,
142
- "normalized": false,
143
- "rstrip": false,
144
- "single_word": false,
145
- "special": true
146
- },
147
- "18": {
148
- "content": " ",
149
- "lstrip": false,
150
- "normalized": false,
151
- "rstrip": false,
152
- "single_word": false,
153
- "special": true
154
- },
155
- "19": {
156
- "content": " ",
157
- "lstrip": false,
158
- "normalized": false,
159
- "rstrip": false,
160
- "single_word": false,
161
- "special": true
162
- },
163
- "20": {
164
- "content": " ",
165
- "lstrip": false,
166
- "normalized": false,
167
- "rstrip": false,
168
- "single_word": false,
169
- "special": true
170
- },
171
- "21": {
172
- "content": " ",
173
- "lstrip": false,
174
- "normalized": false,
175
- "rstrip": false,
176
- "single_word": false,
177
- "special": true
178
- },
179
- "22": {
180
- "content": " ",
181
- "lstrip": false,
182
- "normalized": false,
183
- "rstrip": false,
184
- "single_word": false,
185
- "special": true
186
- },
187
- "23": {
188
- "content": " ",
189
- "lstrip": false,
190
- "normalized": false,
191
- "rstrip": false,
192
- "single_word": false,
193
- "special": true
194
- },
195
- "24": {
196
- "content": " ",
197
- "lstrip": false,
198
- "normalized": false,
199
- "rstrip": false,
200
- "single_word": false,
201
- "special": true
202
- },
203
- "25": {
204
- "content": " ",
205
- "lstrip": false,
206
- "normalized": false,
207
- "rstrip": false,
208
- "single_word": false,
209
- "special": true
210
- },
211
- "26": {
212
- "content": " ",
213
- "lstrip": false,
214
- "normalized": false,
215
- "rstrip": false,
216
- "single_word": false,
217
- "special": true
218
- },
219
- "27": {
220
- "content": " ",
221
- "lstrip": false,
222
- "normalized": false,
223
- "rstrip": false,
224
- "single_word": false,
225
- "special": true
226
- },
227
- "28": {
228
- "content": " ",
229
- "lstrip": false,
230
- "normalized": false,
231
- "rstrip": false,
232
- "single_word": false,
233
- "special": true
234
- },
235
- "29": {
236
- "content": " ",
237
- "lstrip": false,
238
- "normalized": false,
239
- "rstrip": false,
240
- "single_word": false,
241
- "special": true
242
- },
243
- "30": {
244
- "content": " ",
245
- "lstrip": false,
246
- "normalized": false,
247
- "rstrip": false,
248
- "single_word": false,
249
- "special": true
250
- },
251
- "31": {
252
- "content": " ",
253
- "lstrip": false,
254
- "normalized": false,
255
- "rstrip": false,
256
- "single_word": false,
257
- "special": true
258
- },
259
- "32": {
260
- "content": " ",
261
- "lstrip": false,
262
- "normalized": false,
263
- "rstrip": false,
264
- "single_word": false,
265
- "special": true
266
- },
267
- "33": {
268
- "content": " ",
269
- "lstrip": false,
270
- "normalized": false,
271
- "rstrip": false,
272
- "single_word": false,
273
- "special": true
274
- },
275
- "34": {
276
- "content": " ",
277
- "lstrip": false,
278
- "normalized": false,
279
- "rstrip": false,
280
- "single_word": false,
281
- "special": true
282
- },
283
- "35": {
284
- "content": " ",
285
- "lstrip": false,
286
- "normalized": false,
287
- "rstrip": false,
288
- "single_word": false,
289
- "special": true
290
- },
291
- "36": {
292
- "content": " ",
293
- "lstrip": false,
294
- "normalized": false,
295
- "rstrip": false,
296
- "single_word": false,
297
- "special": true
298
- },
299
- "37": {
300
- "content": " ",
301
- "lstrip": false,
302
- "normalized": false,
303
- "rstrip": false,
304
- "single_word": false,
305
- "special": true
306
- },
307
- "38": {
308
- "content": " ",
309
- "lstrip": false,
310
- "normalized": false,
311
- "rstrip": false,
312
- "single_word": false,
313
- "special": true
314
- },
315
- "39": {
316
- "content": "<|reserved_0|>",
317
- "lstrip": false,
318
- "normalized": false,
319
- "rstrip": false,
320
- "single_word": false,
321
- "special": true
322
- },
323
- "40": {
324
- "content": "<|reserved_1|>",
325
- "lstrip": false,
326
- "normalized": false,
327
- "rstrip": false,
328
- "single_word": false,
329
- "special": true
330
- },
331
- "41": {
332
- "content": "<|reserved_2|>",
333
- "lstrip": false,
334
- "normalized": false,
335
- "rstrip": false,
336
- "single_word": false,
337
- "special": true
338
- },
339
- "42": {
340
- "content": "<|reserved_3|>",
341
- "lstrip": false,
342
- "normalized": false,
343
- "rstrip": false,
344
- "single_word": false,
345
- "special": true
346
- },
347
- "43": {
348
- "content": "<|reserved_4|>",
349
- "lstrip": false,
350
- "normalized": false,
351
- "rstrip": false,
352
- "single_word": false,
353
- "special": true
354
- },
355
- "44": {
356
- "content": "<|reserved_5|>",
357
- "lstrip": false,
358
- "normalized": false,
359
- "rstrip": false,
360
- "single_word": false,
361
- "special": true
362
- },
363
- "45": {
364
- "content": "<|reserved_6|>",
365
- "lstrip": false,
366
- "normalized": false,
367
- "rstrip": false,
368
- "single_word": false,
369
- "special": true
370
- },
371
- "46": {
372
- "content": "<|reserved_7|>",
373
- "lstrip": false,
374
- "normalized": false,
375
- "rstrip": false,
376
- "single_word": false,
377
- "special": true
378
- },
379
- "47": {
380
- "content": "<|reserved_8|>",
381
- "lstrip": false,
382
- "normalized": false,
383
- "rstrip": false,
384
- "single_word": false,
385
- "special": true
386
- },
387
- "48": {
388
- "content": "<|reserved_9|>",
389
- "lstrip": false,
390
- "normalized": false,
391
- "rstrip": false,
392
- "single_word": false,
393
- "special": true
394
- },
395
- "49": {
396
- "content": "<|reserved_10|>",
397
- "lstrip": false,
398
- "normalized": false,
399
- "rstrip": false,
400
- "single_word": false,
401
- "special": true
402
- },
403
- "50": {
404
- "content": "<|reserved_11|>",
405
- "lstrip": false,
406
- "normalized": false,
407
- "rstrip": false,
408
- "single_word": false,
409
- "special": true
410
- },
411
- "51": {
412
- "content": "<|reserved_12|>",
413
- "lstrip": false,
414
- "normalized": false,
415
- "rstrip": false,
416
- "single_word": false,
417
- "special": true
418
- },
419
- "52": {
420
- "content": "<|reserved_13|>",
421
- "lstrip": false,
422
- "normalized": false,
423
- "rstrip": false,
424
- "single_word": false,
425
- "special": true
426
- },
427
- "53": {
428
- "content": "<|reserved_14|>",
429
- "lstrip": false,
430
- "normalized": false,
431
- "rstrip": false,
432
- "single_word": false,
433
- "special": true
434
- },
435
- "54": {
436
- "content": "<|reserved_15|>",
437
- "lstrip": false,
438
- "normalized": false,
439
- "rstrip": false,
440
- "single_word": false,
441
- "special": true
442
- },
443
- "55": {
444
- "content": "<|reserved_16|>",
445
- "lstrip": false,
446
- "normalized": false,
447
- "rstrip": false,
448
- "single_word": false,
449
- "special": true
450
- },
451
- "56": {
452
- "content": "<|reserved_17|>",
453
- "lstrip": false,
454
- "normalized": false,
455
- "rstrip": false,
456
- "single_word": false,
457
- "special": true
458
- },
459
- "57": {
460
- "content": "<|reserved_18|>",
461
- "lstrip": false,
462
- "normalized": false,
463
- "rstrip": false,
464
- "single_word": false,
465
- "special": true
466
- },
467
- "58": {
468
- "content": "<|reserved_19|>",
469
- "lstrip": false,
470
- "normalized": false,
471
- "rstrip": false,
472
- "single_word": false,
473
- "special": true
474
- },
475
- "59": {
476
- "content": "<|reserved_20|>",
477
- "lstrip": false,
478
- "normalized": false,
479
- "rstrip": false,
480
- "single_word": false,
481
- "special": true
482
- },
483
- "60": {
484
- "content": "<|reserved_21|>",
485
- "lstrip": false,
486
- "normalized": false,
487
- "rstrip": false,
488
- "single_word": false,
489
- "special": true
490
- },
491
- "61": {
492
- "content": "<|reserved_22|>",
493
- "lstrip": false,
494
- "normalized": false,
495
- "rstrip": false,
496
- "single_word": false,
497
- "special": true
498
- },
499
- "62": {
500
- "content": "<|reserved_23|>",
501
- "lstrip": false,
502
- "normalized": false,
503
- "rstrip": false,
504
- "single_word": false,
505
- "special": true
506
- },
507
- "63": {
508
- "content": "<|reserved_24|>",
509
- "lstrip": false,
510
- "normalized": false,
511
- "rstrip": false,
512
- "single_word": false,
513
- "special": true
514
- }
515
- },
516
- "bos_token": "<s>",
517
- "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
518
- "clean_up_tokenization_spaces": true,
519
- "eos_token": "</s>",
520
- "mask_token": "<mask>",
521
- "model_max_length": 1000000000000000019884624838656,
522
- "pad_token": "<pad>",
523
- "tokenizer_class": "PreTrainedTokenizerFast",
524
- "unk_token": "<unk>"
525
- }