mtasic85 committed
Commit aa0559e
1 Parent(s): 8c82fc3
scripts/train_tokenizer.py CHANGED
@@ -1,89 +1,139 @@
import string

from datasets import load_dataset
- from tokenizers import Tokenizer, models, pre_tokenizers, trainers
- from tokenizers.processors import TemplateProcessing
from transformers import PreTrainedTokenizerFast
-
- # dataset_0 = (
- # load_dataset('wikimedia/wikisource', lang, split='train')
- # for lang in ['20231201.ar', '20231201.as', '20231201.az', '20231201.ban', '20231201.be', '20231201.bg', '20231201.bn', '20231201.br', '20231201.bs', '20231201.ca', '20231201.cs', '20231201.cy', '20231201.da', '20231201.de', '20231201.el', '20231201.en', '20231201.eo', '20231201.es', '20231201.et', '20231201.eu', '20231201.fa', '20231201.fi', '20231201.fo', '20231201.fr', '20231201.gl', '20231201.gu', '20231201.he', '20231201.hi', '20231201.hr', '20231201.hu', '20231201.hy', '20231201.id', '20231201.is', '20231201.it', '20231201.ja', '20231201.jv', '20231201.kn', '20231201.ko', '20231201.la', '20231201.li', '20231201.lij', '20231201.lt', '20231201.mk', '20231201.ml', '20231201.mr', '20231201.nap', '20231201.nl', '20231201.no', '20231201.or', '20231201.pa', '20231201.pl', '20231201.pms', '20231201.pt', '20231201.ro', '20231201.ru', '20231201.sa', '20231201.sah', '20231201.sk', '20231201.sl', '20231201.sr', '20231201.su', '20231201.sv', '20231201.ta', '20231201.te', '20231201.th', '20231201.tr', '20231201.uk', '20231201.vec', '20231201.vi', '20231201.wa', '20231201.yi', '20231201.zh', '20231201.zh-min-nan']
- # )
-
- dataset_1 = (
- load_dataset('xu-song/cc100-samples', lang, split='train')
- for lang in ['am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'bn_rom', 'br', 'bs', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'ff', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl', 'gn', 'gu', 'ha', 'he', 'hi', 'hi_rom', 'hr', 'ht', 'hu', 'hy', 'id', 'ig', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 'km', 'kn', 'ko', 'ku', 'ky', 'la', 'lg', 'li', 'ln', 'lo', 'lt', 'lv', 'mg', 'mk', 'ml', 'mn', 'mr', 'ms', 'my', 'my_zaw', 'ne', 'nl', 'no', 'ns', 'om', 'or', 'pa', 'pl', 'ps', 'pt', 'qu', 'rm', 'ro', 'ru', 'sa', 'si', 'sc', 'sd', 'sk', 'sl', 'so', 'sq', 'sr', 'ss', 'su', 'sv', 'sw', 'ta', 'ta_rom', 'te', 'te_rom', 'th', 'tl', 'tn', 'tr', 'ug', 'uk', 'ur', 'ur_rom', 'uz', 'vi', 'wo', 'xh', 'yi', 'yo', 'zh-Hans', 'zh-Hant', 'zu']
- )
-
- dataset_2 = (
- load_dataset('csebuetnlp/xlsum', lang, split='train')
- for lang in ['amharic', 'arabic', 'azerbaijani', 'bengali', 'burmese', 'chinese_simplified', 'chinese_traditional', 'english', 'french', 'gujarati', 'hausa', 'hindi', 'igbo', 'indonesian', 'japanese', 'kirundi', 'korean', 'kyrgyz', 'marathi', 'nepali', 'oromo', 'pashto', 'persian', 'pidgin', 'portuguese', 'punjabi', 'russian', 'scottish_gaelic', 'serbian_cyrillic', 'serbian_latin', 'sinhala', 'somali', 'spanish', 'swahili', 'tamil', 'telugu', 'thai', 'tigrinya', 'turkish', 'ukrainian', 'urdu', 'uzbek', 'vietnamese', 'welsh', 'yoruba']
- )
-
- # dataset_3 = load_dataset('recursal/SuperWikiNEXT-32B', split='train')
- dataset_4 = load_dataset('m-a-p/CodeFeedback-Filtered-Instruction', split='train')
- dataset_5 = load_dataset('nampdn-ai/tiny-codes', split='train')
- # dataset_6 = load_dataset('ajibawa-2023/Maths-College', split='train')
- dataset_7 = load_dataset('microsoft/orca-math-word-problems-200k', split='train')
- dataset_8 = load_dataset('mlabonne/FineTome-100k', split='train')
- dataset_9 = load_dataset('arcee-ai/agent-data', split='train')
- dataset_10 = [
- load_dataset('cognitivecomputations/SystemChat-2.0', data_files='SystemChat_filtered.jsonl', split='train'),
- load_dataset('cognitivecomputations/SystemChat-2.0', data_files='SystemChat_multilingual.jsonl', split='train'),
- ]
- dataset_11 = load_dataset('badrex/llm-emoji-dataset', split='train')


def batch_iterator():
# for d in dataset_0:
# for row in d['text']:
# yield row

for d in dataset_1:
for row in d['text']:
yield row

- for d in dataset_2:
- for row in d['text']:
- yield row

# for row in dataset_3['text']:
# yield row

for row in dataset_4:
yield row['query'] + '\n' + row['answer']

- for row in dataset_5:
- yield row['prompt'] + '\n' + row['response']

# for row in dataset_6:
# yield row['instruction'] + '\n' + row['output']

for row in dataset_7:
yield row['question'] + '\n' + row['answer']

- for row in dataset_8['conversations']:
- yield '\n'.join(n['value'] for n in row)
-
- for row in dataset_9['conversations']:
- yield '\n'.join(n['value'] for n in row)

- for d in dataset_10:
- for row in d['messages']:
- yield '\n'.join(n['content'] for n in row)

- for row in dataset_11:
- yield f'{row["character"]}\n{row["unicode"]}\n{row["short description"]}\n{row["tags"]}\n{row["LLM description"]}'


special_tokens = [
'<s>',
'</s>',
- '<pad>',
- '<unk>',
- '<mask>',
- '<|im_start|>',
'<|im_end|>',
'<tools>',
'</tools>',
'<tool_call>',
@@ -101,32 +151,43 @@ for i in range(2, 25):
for i in range(64 - len(special_tokens)):
special_tokens.append(f'<|reserved_{i}|>')

- ascii_chars = string.ascii_letters + string.ascii_lowercase + string.ascii_uppercase + string.digits + string.punctuation

- #
- # tokenizer
- #
- tokenizer = Tokenizer(models.BPE())

- # set up pre-tokenizer to split on whitespace and punctuation
- tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
- pre_tokenizers.WhitespaceSplit(),
- pre_tokenizers.Punctuation(),
])

- # trainer
- trainer = trainers.BpeTrainer(
vocab_size=32064,
special_tokens=special_tokens,
- initial_alphabet=list(ascii_chars),
)

- # train the tokenizer
- tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)

- #
- # fast_tokenizer
- #
CHATML_CHAT_TEMPLATE = (
"{% for message in messages %}"
"{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
@@ -140,10 +201,12 @@ fast_tokenizer = PreTrainedTokenizerFast(
tokenizer_object=tokenizer,
chat_template=CHATML_CHAT_TEMPLATE,
bos_token='<s>',
- eos_token='</s>',
unk_token='<unk>',
- pad_token='<pad>',
- mask_token='<mask>',
)

- fast_tokenizer.save_pretrained('..')
+ import gc
import string

from datasets import load_dataset
from transformers import PreTrainedTokenizerFast
+ from tokenizers import Tokenizer, normalizers, decoders
+ from tokenizers.models import BPE
+ from tokenizers.trainers import BpeTrainer
+ from tokenizers.processors import TemplateProcessing


def batch_iterator():
+ # yield 'На вр брда врба мрда.\nНема струје због олује.'
+ # return
+
+ # dataset_0 = (
+ # load_dataset('wikimedia/wikisource', lang, split='train')
+ # for lang in ['20231201.ar', '20231201.as', '20231201.az', '20231201.ban', '20231201.be', '20231201.bg', '20231201.bn', '20231201.br', '20231201.bs', '20231201.ca', '20231201.cs', '20231201.cy', '20231201.da', '20231201.de', '20231201.el', '20231201.en', '20231201.eo', '20231201.es', '20231201.et', '20231201.eu', '20231201.fa', '20231201.fi', '20231201.fo', '20231201.fr', '20231201.gl', '20231201.gu', '20231201.he', '20231201.hi', '20231201.hr', '20231201.hu', '20231201.hy', '20231201.id', '20231201.is', '20231201.it', '20231201.ja', '20231201.jv', '20231201.kn', '20231201.ko', '20231201.la', '20231201.li', '20231201.lij', '20231201.lt', '20231201.mk', '20231201.ml', '20231201.mr', '20231201.nap', '20231201.nl', '20231201.no', '20231201.or', '20231201.pa', '20231201.pl', '20231201.pms', '20231201.pt', '20231201.ro', '20231201.ru', '20231201.sa', '20231201.sah', '20231201.sk', '20231201.sl', '20231201.sr', '20231201.su', '20231201.sv', '20231201.ta', '20231201.te', '20231201.th', '20231201.tr', '20231201.uk', '20231201.vec', '20231201.vi', '20231201.wa', '20231201.yi', '20231201.zh', '20231201.zh-min-nan']
+ # )
+ #
# for d in dataset_0:
# for row in d['text']:
# yield row
+ #
+ # del dataset_0
+ # gc.collect()
+
+ dataset_1 = (
+ load_dataset('xu-song/cc100-samples', lang, split='train')
+ for lang in ['am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'bn_rom', 'br', 'bs', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'ff', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl', 'gn', 'gu', 'ha', 'he', 'hi', 'hi_rom', 'hr', 'ht', 'hu', 'hy', 'id', 'ig', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 'km', 'kn', 'ko', 'ku', 'ky', 'la', 'lg', 'li', 'ln', 'lo', 'lt', 'lv', 'mg', 'mk', 'ml', 'mn', 'mr', 'ms', 'my', 'my_zaw', 'ne', 'nl', 'no', 'ns', 'om', 'or', 'pa', 'pl', 'ps', 'pt', 'qu', 'rm', 'ro', 'ru', 'sa', 'si', 'sc', 'sd', 'sk', 'sl', 'so', 'sq', 'sr', 'ss', 'su', 'sv', 'sw', 'ta', 'ta_rom', 'te', 'te_rom', 'th', 'tl', 'tn', 'tr', 'ug', 'uk', 'ur', 'ur_rom', 'uz', 'vi', 'wo', 'xh', 'yi', 'yo', 'zh-Hans', 'zh-Hant', 'zu']
+ )

for d in dataset_1:
for row in d['text']:
yield row

+ del dataset_1
+ gc.collect()

+ # dataset_2 = (
+ # load_dataset('csebuetnlp/xlsum', lang, split='train')
+ # for lang in ['amharic', 'arabic', 'azerbaijani', 'bengali', 'burmese', 'chinese_simplified', 'chinese_traditional', 'english', 'french', 'gujarati', 'hausa', 'hindi', 'igbo', 'indonesian', 'japanese', 'kirundi', 'korean', 'kyrgyz', 'marathi', 'nepali', 'oromo', 'pashto', 'persian', 'pidgin', 'portuguese', 'punjabi', 'russian', 'scottish_gaelic', 'serbian_cyrillic', 'serbian_latin', 'sinhala', 'somali', 'spanish', 'swahili', 'tamil', 'telugu', 'thai', 'tigrinya', 'turkish', 'ukrainian', 'urdu', 'uzbek', 'vietnamese', 'welsh', 'yoruba']
+ # )
+ #
+ # for d in dataset_2:
+ # for row in d['text']:
+ # yield row
+ #
+ # del dataset_2
+ # gc.collect()
+
+ # dataset_3 = load_dataset('recursal/SuperWikiNEXT-32B', split='train')
+ #
# for row in dataset_3['text']:
# yield row
+ #
+ # del dataset_3
+ # gc.collect()
+
+ dataset_4 = load_dataset('m-a-p/CodeFeedback-Filtered-Instruction', split='train')

for row in dataset_4:
yield row['query'] + '\n' + row['answer']

+ del dataset_4
+ gc.collect()
+
+ # dataset_5 = load_dataset('nampdn-ai/tiny-codes', split='train')
+ #
+ # for row in dataset_5:
+ # yield row['prompt'] + '\n' + row['response']
+ #
+ # del dataset_5
+ # gc.collect()

+ # dataset_6 = load_dataset('ajibawa-2023/Maths-College', split='train')
+ #
# for row in dataset_6:
# yield row['instruction'] + '\n' + row['output']
+ #
+ # del dataset_6
+ # gc.collect()
+
+ dataset_7 = load_dataset('microsoft/orca-math-word-problems-200k', split='train')

for row in dataset_7:
yield row['question'] + '\n' + row['answer']

+ del dataset_7
+ gc.collect()

+ dataset_8 = load_dataset('mlabonne/FineTome-100k', split='train')

+ for row in dataset_8['conversations']:
+ yield '\n'.join(n['value'] for n in row)

+ del dataset_8
+ gc.collect()
+
+ # dataset_9 = load_dataset('arcee-ai/agent-data', split='train')
+ #
+ # for row in dataset_9['conversations']:
+ # yield '\n'.join(n['value'] for n in row)
+ #
+ # del dataset_9
+ # gc.collect()
+
+ # dataset_10 = (
+ # load_dataset('cognitivecomputations/SystemChat-2.0', data_files='SystemChat_filtered.jsonl', split='train'),
+ # load_dataset('cognitivecomputations/SystemChat-2.0', data_files='SystemChat_multilingual.jsonl', split='train'),
+ # )
+ #
+ # for d in dataset_10:
+ # for row in d['messages']:
+ # yield '\n'.join(n['content'] for n in row)
+ #
+ # del dataset_10
+ # gc.collect()
+
+ # dataset_11 = load_dataset('badrex/llm-emoji-dataset', split='train')
+ #
+ # for row in dataset_11:
+ # yield f'{row["character"]}\n{row["unicode"]}\n{row["short description"]}\n{row["tags"]}\n{row["LLM description"]}'
+ #
+ # del dataset_11
+ # gc.collect()
+
+ bpe = BPE(unk_token='<unk>', fuse_unk=True, byte_fallback=True)
+ tokenizer = Tokenizer(bpe)

special_tokens = [
+ '<unk>',
'<s>',
'</s>',
'<|im_end|>',
+ '<|im_start|>',
'<tools>',
'</tools>',
'<tool_call>',

for i in range(64 - len(special_tokens)):
special_tokens.append(f'<|reserved_{i}|>')

+ tokenizer.add_special_tokens(special_tokens)
+
+ ascii_chars = list(string.ascii_letters + string.ascii_lowercase + string.ascii_uppercase + string.digits + string.punctuation)
+
+ dataset_11 = load_dataset('badrex/llm-emoji-dataset', split='train')
+ emoji_chars = [row['character'] for row in dataset_11 if len(row['character']) == 1]
+ del dataset_11

+ tokenizer.normalizer = normalizers.Sequence([
+ normalizers.Prepend("▁"),
+ normalizers.Replace(" ", "▁"),
+ ])

+ tokenizer.decoder = decoders.Sequence([
+ decoders.Replace("▁", " "),
+ decoders.ByteFallback(),
+ decoders.Fuse(),
+ decoders.Strip(' ', 1, 0),
])

+ tokenizer.post_processor = TemplateProcessing(
+ single='$A:0', # $A represents the token, :0 specifies the type ID for single sequences
+ pair='$A:0 $B:1', # For pairs, we specify type IDs for both tokens
+ special_tokens=[],
+ )
+
+ trainer = BpeTrainer(
vocab_size=32064,
+ min_frequency=2,
special_tokens=special_tokens,
+ initial_alphabet=ascii_chars + emoji_chars,
)

+ tokenizer.train_from_iterator(batch_iterator(), trainer)
+ tokenizer.save('../tokenizer.json')
+ tokenizer.model.save('../')

CHATML_CHAT_TEMPLATE = (
"{% for message in messages %}"
"{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"

tokenizer_object=tokenizer,
chat_template=CHATML_CHAT_TEMPLATE,
bos_token='<s>',
+ eos_token='<|im_end|>',
unk_token='<unk>',
+ pad_token='</s>',
+ clean_up_tokenization_spaces=False,
+ spaces_between_special_tokens=False,
+ use_default_system_prompt=False,
)

+ fast_tokenizer.save_pretrained('../')
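
A quick way to sanity-check the retrained tokenizer is to load the artifacts this script writes and render the ChatML template. The following is a minimal sketch, not part of the commit; it assumes the script has already been run so that ../tokenizer.json and the save_pretrained('../') files exist, and that the installed transformers version provides apply_chat_template.

# Minimal sketch: load the artifacts written by scripts/train_tokenizer.py and
# exercise the ChatML chat template. Paths and the sample text are assumptions.
from transformers import PreTrainedTokenizerFast

tok = PreTrainedTokenizerFast.from_pretrained('../')  # reads tokenizer.json + tokenizer_config.json

sample = 'def add(a, b):\n    return a + b'
ids = tok.encode(sample)
print(len(ids), tok.decode(ids))  # with the "▁" normalizer/decoder pair this should round-trip plain ASCII input

messages = [
    {'role': 'system', 'content': 'You are a helpful assistant.'},
    {'role': 'user', 'content': 'Hello!'},
]
print(tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
# Expected shape: <|im_start|>system\n...<|im_end|>\n<|im_start|>user\n...<|im_end|>\n<|im_start|>assistant\n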
special_tokens_map.json DELETED
@@ -1,7 +0,0 @@
- {
- "bos_token": "<s>",
- "eos_token": "</s>",
- "mask_token": "<mask>",
- "pad_token": "<pad>",
- "unk_token": "<unk>"
- }
tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json DELETED
@@ -1,525 +0,0 @@
- {
- "added_tokens_decoder": {
- "0": {
- "content": "<s>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "1": {
- "content": "</s>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "2": {
- "content": "<pad>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "3": {
- "content": "<unk>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "4": {
- "content": "<mask>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "5": {
- "content": "<|im_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "6": {
- "content": "<|im_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "7": {
- "content": "<tools>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "8": {
- "content": "</tools>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "9": {
- "content": "<tool_call>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "10": {
- "content": "</tool_call>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "11": {
- "content": "<tool_response>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "12": {
- "content": "</tool_response>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "13": {
- "content": "system",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "14": {
- "content": "user",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "15": {
- "content": "assistant",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "16": {
- "content": "  ",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "17": {
- "content": "   ",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "18": {
- "content": "    ",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "19": {
- "content": "     ",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "20": {
- "content": "      ",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "21": {
- "content": "       ",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "22": {
- "content": "        ",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "23": {
- "content": "         ",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "24": {
- "content": "          ",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "25": {
- "content": "           ",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "26": {
- "content": "            ",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "27": {
- "content": "             ",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "28": {
- "content": "              ",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "29": {
- "content": "               ",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "30": {
- "content": "                ",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "31": {
- "content": "                 ",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "32": {
- "content": "                  ",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "33": {
- "content": "                   ",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "34": {
- "content": "                    ",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "35": {
- "content": "                     ",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "36": {
- "content": "                      ",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "37": {
- "content": "                       ",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "38": {
- "content": "                        ",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "39": {
- "content": "<|reserved_0|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "40": {
- "content": "<|reserved_1|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "41": {
- "content": "<|reserved_2|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "42": {
- "content": "<|reserved_3|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "43": {
- "content": "<|reserved_4|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "44": {
- "content": "<|reserved_5|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "45": {
- "content": "<|reserved_6|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "46": {
- "content": "<|reserved_7|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "47": {
- "content": "<|reserved_8|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "48": {
- "content": "<|reserved_9|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "49": {
- "content": "<|reserved_10|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "50": {
- "content": "<|reserved_11|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "51": {
- "content": "<|reserved_12|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "52": {
- "content": "<|reserved_13|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "53": {
- "content": "<|reserved_14|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "54": {
- "content": "<|reserved_15|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "55": {
- "content": "<|reserved_16|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "56": {
- "content": "<|reserved_17|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "57": {
- "content": "<|reserved_18|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "58": {
- "content": "<|reserved_19|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "59": {
- "content": "<|reserved_20|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "60": {
- "content": "<|reserved_21|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "61": {
- "content": "<|reserved_22|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "62": {
- "content": "<|reserved_23|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "63": {
- "content": "<|reserved_24|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- }
- },
- "bos_token": "<s>",
- "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
- "clean_up_tokenization_spaces": true,
- "eos_token": "</s>",
- "mask_token": "<mask>",
- "model_max_length": 1000000000000000019884624838656,
- "pad_token": "<pad>",
- "tokenizer_class": "PreTrainedTokenizerFast",
- "unk_token": "<unk>"
- }
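
The three deleted files above are artifacts of the previous tokenizer; rerunning the updated scripts/train_tokenizer.py regenerates them in the repository root via tokenizer.save('../tokenizer.json') and fast_tokenizer.save_pretrained('../'). The snippet below is a minimal sketch for checking that the regenerated mapping reflects the new layout; the token names come from the diff, everything else is an assumption.

# Sketch: verify the regenerated special-token configuration. Assumes the updated
# script has been rerun and its output saved to the repository root ('../').
from transformers import PreTrainedTokenizerFast

tok = PreTrainedTokenizerFast.from_pretrained('../')
print(tok.bos_token, tok.eos_token, tok.unk_token, tok.pad_token)
# Per the new script: bos=<s>, eos=<|im_end|>, unk=<unk>, pad=</s> (no <pad>/<mask> anymore)

for t in ['<unk>', '<s>', '</s>', '<|im_end|>', '<|im_start|>']:
    print(t, tok.convert_tokens_to_ids(t))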