File size: 517 Bytes
4b6b09c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
from transformers import AutoTokenizer

# Replace the tokenizer's EOS token ("</s>") with ChatML's "<|im_end|>"
# and save the result. Dolphin models are fine-tuned on the ChatML chat
# format, where each turn ends with "<|im_end|>" instead of Mistral's
# native "</s>".
MODEL_DIR = "/workspace/dolphin-2.6-mistral-7b-hf"
OUTPUT_DIR = "/workspace/dolphin-new-tokenizer/"

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

# BUG FIX: the previous version mutated the dict returned by
# tokenizer.get_vocab() (a copy — mutations never reach the tokenizer)
# and then passed it back via AutoTokenizer.from_pretrained(..., vocab=...),
# which is not a supported override; the second from_pretrained() simply
# reloaded the original tokenizer. The "</s>" -> "<|im_end|>" swap was a
# no-op.
#
# add_special_tokens() is the supported API: it registers "<|im_end|>" in
# the vocabulary (keeping its existing id if the checkpoint already has
# it, appending a new id otherwise) and points eos_token/pad_token at it
# in a single call.
#
# NOTE(review): this does NOT reassign "<|im_end|>" to id 2 in place of
# "</s>". If the model's embeddings genuinely expect "<|im_end|>" at id 2,
# the serialized vocab/tokenizer.json must be edited on disk before
# loading — confirm which behavior the downstream model needs.
tokenizer.add_special_tokens(
    {"eos_token": "<|im_end|>", "pad_token": "<|im_end|>"}
)

# Persist the modified tokenizer for downstream use.
tokenizer.save_pretrained(OUTPUT_DIR)