from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("/workspace/dolphin-2.6-mistral-7b-hf")
# 1. Register "<|im_end|>" as a special token. from_pretrained() has no `vocab`
#    argument and mutating the dict returned by get_vocab() has no effect, so the
#    supported route is add_special_tokens() (a no-op if the checkpoint already
#    ships "<|im_end|>" as an added token). Note that "</s>" stays in the
#    vocabulary; only the eos/pad pointers change below.
tokenizer.add_special_tokens({"additional_special_tokens": ["<|im_end|>"]})

# 2. Point the EOS and PAD tokens at "<|im_end|>" instead of the default "</s>".
tokenizer.eos_token = "<|im_end|>"
tokenizer.pad_token = "<|im_end|>"

# If a new vocabulary entry was actually added, the model's embedding matrix must
# be resized to match: model.resize_token_embeddings(len(tokenizer)).
# 3. Save the modified tokenizer
tokenizer.save_pretrained("/workspace/dolphin-new-tokenizer/")
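
As a quick sanity check (a minimal sketch, assuming the save above succeeded and the output path is unchanged), reload the saved tokenizer and confirm that "<|im_end|>" now drives EOS/PAD and encodes as a single id:

from transformers import AutoTokenizer

reloaded = AutoTokenizer.from_pretrained("/workspace/dolphin-new-tokenizer/")

# The eos/pad pointers should now resolve to "<|im_end|>"
print(reloaded.eos_token, reloaded.pad_token)  # expected: <|im_end|> <|im_end|>

# eos_token_id should match the id the tokenizer assigns to "<|im_end|>"
print(reloaded.eos_token_id == reloaded.convert_tokens_to_ids("<|im_end|>"))  # expected: True

# "<|im_end|>" should encode to a single token id rather than being split into pieces
print(reloaded.encode("<|im_end|>", add_special_tokens=False))  # expected: a one-element list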