|
from transformers import PreTrainedTokenizerFast |
|
from tokenizers import Tokenizer, normalizers, pre_tokenizers, processors, decoders |
|
from tokenizers.models import BPE |
|
from tokenizers.trainers import BpeTrainer |
|
|
|
from utils import batch_text_iterator |
|
from pretrain_datasets import tokenizer_datasets |
|
|
|
|
|
|
|
|
|
bos_token = '<|begin_of_text|>' |
|
eos_token = '<|end_of_text|>' |
|
|
|
special_tokens = [ |
|
bos_token, |
|
eos_token, |
|
'<|start_header_id|>', |
|
'<|end_header_id|>', |
|
'<|eom_id|>', |
|
'<|eot_id|>', |
|
'system', |
|
'user', |
|
'assistant', |
|
|
|
|
|
'<tools>', |
|
'</tools>', |
|
'<tool>', |
|
'</tool>', |
|
'<tool_call>', |
|
'</tool_call>', |
|
'<tool_response>', |
|
'</tool_response>', |
|
'"name"', |
|
'"arguments"', |
|
|
|
|
|
|
|
|
|
|
|
'"$schema"', |
|
'"$id"', |
|
'"$ref"', |
|
'"$defs"', |
|
'"$anchor"', |
|
'"$dynamicAnchor"', |
|
'"$dynamicRef"', |
|
'"$vocabulary"', |
|
'"$comment"', |
|
|
|
'"null"', |
|
'"boolean"', |
|
'"object"', |
|
'"array"', |
|
'"number"', |
|
'"string"', |
|
'"integer"', |
|
|
|
'"type"', |
|
'"enum"', |
|
'"const"', |
|
'"multipleOf"', |
|
'"maximum"', |
|
'"exclusiveMaximum"', |
|
'"minimum"', |
|
'"exclusiveMinimum"', |
|
'"maxLength"', |
|
'"minLength"', |
|
'"pattern"', |
|
'"additionalItems"', |
|
'"items"', |
|
'"prefixItems"', |
|
'"contains"', |
|
'"maxItems"', |
|
'"minItems"', |
|
'"uniqueItems"', |
|
'"maxProperties"', |
|
'"minProperties"', |
|
'"required"', |
|
'"properties"', |
|
'"patternProperties"', |
|
'"additionalProperties"', |
|
'"dependentRequired"', |
|
'"dependentSchemas"', |
|
'"propertyNames"', |
|
|
|
'"if"', |
|
'"then"', |
|
'"else"', |
|
'"allOf"', |
|
'"anyOf"', |
|
'"oneOf"', |
|
'"not"', |
|
|
|
'"unevaluatedItems"', |
|
'"unevaluatedProperties"', |
|
|
|
'"title"', |
|
'"description"', |
|
'"default"', |
|
'"deprecated"', |
|
'"readOnly"', |
|
'"writeOnly"', |
|
'"examples"', |
|
|
|
'"contentEncoding"', |
|
'"contentMediaType"', |
|
'"contentSchema"', |
|
|
|
'"next"', |
|
'"value"', |
|
|
|
|
|
'<input>', |
|
'</input>', |
|
'<output>', |
|
'</output>', |
|
'<query>', |
|
'</query>', |
|
'<key>', |
|
'</key>', |
|
'<value>', |
|
'</value>', |
|
'<text>', |
|
'</text>', |
|
'<code>', |
|
'</code>', |
|
'<image>', |
|
'</image>', |
|
'<file>', |
|
'</file>', |
|
|
|
|
|
'<question>', |
|
'</question>', |
|
'<answer>', |
|
'</answer>', |
|
|
|
|
|
'<thought>', |
|
'</thought>', |
|
'<plan>', |
|
'</plan>', |
|
'<vote>', |
|
'</vote>', |
|
'<passage>', |
|
'</passage>', |
|
|
|
|
|
'<reasoning>', |
|
'</reasoning>', |
|
'<acting>', |
|
'</acting>', |
|
'<action>', |
|
'</action>', |
|
'<observation>', |
|
'</observation>', |
|
'<claim>', |
|
'</claim>', |
|
|
|
|
|
'<thinking>', |
|
'</thinking>', |
|
'<reflection>', |
|
'</reflection>', |
|
'<step>', |
|
'</step>', |
|
|
|
|
|
'<graph>', |
|
'</graph>', |
|
'<edge>', |
|
'</edge>', |
|
'<source>', |
|
'</source>', |
|
'<destination>', |
|
'</destination>', |
|
'<relation>', |
|
'</relation>', |
|
|
|
|
|
] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Byte-fallback entries <0x00>..<0xFF>: with BPE(byte_fallback=True) any byte
# not covered by a learned merge still maps onto one of these 256 tokens.
special_tokens.extend(f'<0x{_byte:02X}>' for _byte in range(256))

# Reserved slots so future special tokens can be added without re-training.
special_tokens.extend(f'<|reserved_special_token_{_slot}|>' for _slot in range(64))
|
|
|
|
|
|
|
|
|
# BPE model with byte fallback: unknown bytes decompose into <0xNN> tokens
# rather than a single unknown token (hence unk_token=None).
tokenizer = Tokenizer(BPE(unk_token=None, byte_fallback=True))

# No normalization — training text is consumed byte-for-byte.
tokenizer.normalizer = None

# Byte-level pre-tokenization; no forced leading space on the first word.
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True)

# Byte-level post-processing and decoding to mirror the pre-tokenizer.
# NOTE(review): add_prefix_space/trim_offsets differ between the three
# ByteLevel components — presumably deliberate; confirm before changing.
tokenizer.post_processor = processors.ByteLevel(add_prefix_space=True, trim_offsets=False, use_regex=True)

tokenizer.decoder = decoders.ByteLevel(add_prefix_space=True, trim_offsets=True, use_regex=True)
|
|
|
|
|
|
|
|
|
# Trainer configuration: 2**16 vocabulary, merges must occur at least 3 times,
# and no learned token may exceed 24 characters.
bpe_trainer = BpeTrainer(
    special_tokens=special_tokens,
    vocab_size=65536,
    min_frequency=3,
    max_token_length=24,
)

# Learn the BPE merges from the configured pretraining corpora.
tokenizer.train_from_iterator(
    batch_text_iterator(tokenizer_datasets),
    trainer=bpe_trainer,
)

# Persist the full tokenizer.json plus the underlying BPE model files.
tokenizer.save('../tokenizer.json')
tokenizer.model.save('../')
|
|
|
|
|
|
|
|
|
# Jinja chat template. Each message renders as
# <|start_header_id|>{role}<|end_header_id|>{content}<|eot_id|>; the prompt
# ends with an assistant header when generation is requested, otherwise with
# the end-of-text token.
CHAT_TEMPLATE = ''.join((
    "{{ bos_token }}",
    "{% for message in messages %}",
    "{{'<|start_header_id|>' + message['role'] + '<|end_header_id|>' + message['content'] + '<|eot_id|>'}}",
    "{% endfor %}",
    "{% if add_generation_prompt %}",
    "{{ '<|start_header_id|>assistant<|end_header_id|>' }}",
    "{% else %}",
    "{{ eos_token }}",
    "{% endif %}",
))
|
|
|
# Wrap the trained tokenizer for the transformers library and save the
# resulting tokenizer files next to the raw tokenizer output.
fast_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    bos_token=bos_token,
    eos_token=eos_token,
    chat_template=CHAT_TEMPLATE,
    clean_up_tokenization_spaces=False,
)

fast_tokenizer.save_pretrained('../')
|
|