Upload folder using huggingface_hub
Browse files
xDAN L3 100B : xDAN-APUS4-MoE-v3.1-0410
- check.py +67 -0
- config.json +34 -0
- model-00001-of-00024.safetensors +3 -0
- model-00002-of-00024.safetensors +3 -0
- model-00003-of-00024.safetensors +3 -0
- model-00004-of-00024.safetensors +3 -0
- model-00005-of-00024.safetensors +3 -0
- model-00006-of-00024.safetensors +3 -0
- model-00007-of-00024.safetensors +3 -0
- model-00008-of-00024.safetensors +3 -0
- model-00009-of-00024.safetensors +3 -0
- model-00010-of-00024.safetensors +3 -0
- model-00011-of-00024.safetensors +3 -0
- model-00012-of-00024.safetensors +3 -0
- model-00013-of-00024.safetensors +3 -0
- model-00014-of-00024.safetensors +3 -0
- model-00015-of-00024.safetensors +3 -0
- model-00016-of-00024.safetensors +3 -0
- model-00017-of-00024.safetensors +3 -0
- model-00018-of-00024.safetensors +3 -0
- model-00019-of-00024.safetensors +3 -0
- model-00020-of-00024.safetensors +3 -0
- model-00021-of-00024.safetensors +3 -0
- model-00022-of-00024.safetensors +3 -0
- model-00023-of-00024.safetensors +3 -0
- model-00024-of-00024.safetensors +3 -0
- model.safetensors.index.json +0 -0
- save_tokenizer/added_tokens.json +4 -0
- save_tokenizer/special_tokens_map.json +23 -0
- save_tokenizer/tokenizer.model +3 -0
- save_tokenizer/tokenizer_config.json +42 -0
- special_tokens_map.json +27 -0
- tokenizer.json +0 -0
- tokenizer.model +3 -0
- tokenizer_config.json +65 -0
check.py
ADDED
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import argparse
|
3 |
+
from transformers import LlamaTokenizer
|
4 |
+
import sentencepiece as spm
|
5 |
+
|
6 |
+
# Set the environment variable that selects the pure-Python protobuf implementation.
|
7 |
+
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
|
8 |
+
|
9 |
+
def update_tokenizer(
    set_chatml,
    output_hf_dir="./tokenizer.model",
    save_dir="./save_tokenizer",
):
    """Load a Llama tokenizer, optionally switch BOS/EOS to ChatML, report on it, and save it.

    Args:
        set_chatml: When true, BOS is set to ``<|im_start|>`` and EOS to
            ``<|im_end|>``; otherwise the tokenizer's existing BOS/EOS are kept.
        output_hf_dir: Path passed to ``LlamaTokenizer.from_pretrained``. Despite
            its name this is the location the tokenizer is loaded FROM; the name
            is kept so existing callers continue to work.
        save_dir: Directory the tokenizer is written to via ``save_pretrained``.
            Defaults to ``./save_tokenizer``, the location that was previously
            hard-coded.

    Returns:
        None. Results are printed and the tokenizer files are written to
        ``save_dir``.
    """
    # Reload the tokenizer from the pretrained path.
    custom_tokenizer = LlamaTokenizer.from_pretrained(output_hf_dir)

    # Report the BOS/EOS tokens as loaded, before any ChatML override.
    # The "id:" field now shows the numeric id rather than repeating the token text.
    print(f"当前bos标记: {custom_tokenizer.bos_token} (id: {custom_tokenizer.bos_token_id})")
    print(f"当前eos标记: {custom_tokenizer.eos_token} (id: {custom_tokenizer.eos_token_id})")

    # Total vocabulary size as reported by len() on the tokenizer.
    vocab_size = len(custom_tokenizer)
    print(f"总词汇量: {vocab_size}")

    # Set bos_token / eos_token according to the --set_chatml flag.
    if set_chatml:
        print("设置Chatml格式为EOS/BOS")
        custom_tokenizer.bos_token = "<|im_start|>"
        custom_tokenizer.eos_token = "<|im_end|>"
        print("BOS and EOS tokens set to '<|im_start|>' and '<|im_end|>' for ChatML.")
    else:
        print(f"Default BOS token: {custom_tokenizer.bos_token}, EOS token: {custom_tokenizer.eos_token}")

    # Show which ids the alternative start/end markers map to.
    startoftext_id = custom_tokenizer.convert_tokens_to_ids('<|startoftext|>')
    endoftext_id = custom_tokenizer.convert_tokens_to_ids('<|endoftext|>')

    print(f"当前 <|startoftext|> 索引: {startoftext_id}")
    print(f"当前 <|endoftext|> 索引: {endoftext_id}")

    startofs_id = custom_tokenizer.convert_tokens_to_ids('<s>')
    endofs_id = custom_tokenizer.convert_tokens_to_ids('</s>')

    print(f"当前 <s> 索引: {startofs_id}")
    print(f"当前 </s>索引: {endofs_id}")

    # Resolve the final BOS/EOS ids back to token strings as a round-trip check.
    # Skip the conversion when an id is None (token unset) instead of passing
    # None to convert_ids_to_tokens.
    bos_id = custom_tokenizer.bos_token_id
    eos_id = custom_tokenizer.eos_token_id
    bos_token = None if bos_id is None else custom_tokenizer.convert_ids_to_tokens(bos_id)
    eos_token = None if eos_id is None else custom_tokenizer.convert_ids_to_tokens(eos_id)
    print(f"最终bos标记: {bos_token} (id: {bos_id})")
    print(f"最终eos标记: {eos_token} (id: {eos_id})")

    print("all_special_tokens: \n\n", custom_tokenizer.all_special_tokens)
    print("all_special_ids: \n\n", custom_tokenizer.all_special_ids)
    print("special_tokens_map: \n\n", custom_tokenizer.special_tokens_map)

    # Write to save_dir rather than overwriting the load-path parameter.
    custom_tokenizer.save_pretrained(save_dir)
    print(f" Tokenizer has been saved to {save_dir}")
|
59 |
+
|
60 |
+
if __name__ == "__main__":
    # Command-line entry point: a single optional flag selects ChatML BOS/EOS tokens.
    cli = argparse.ArgumentParser(
        description="更新Tokenizer,根据需要设置BOS/EOS为ChatML格式",
    )
    cli.add_argument(
        "--set_chatml",
        action="store_true",
        help="如果设置,将BOS和EOS标记用于ChatML。",
    )
    options = cli.parse_args()

    # Update the tokenizer according to the parsed command-line flag.
    update_tokenizer(options.set_chatml)
|
67 |
+
|
config.json
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "xDAN2099/xDAN-L2-RL-Chat-Base-v1.2",
|
3 |
+
"architectures": [
|
4 |
+
"MixtralForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_bias": false,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 6,
|
9 |
+
"eos_token_id": 7,
|
10 |
+
"hidden_act": "silu",
|
11 |
+
"hidden_size": 7168,
|
12 |
+
"initializer_range": 0.02,
|
13 |
+
"intermediate_size": 20480,
|
14 |
+
"max_position_embeddings": 32768,
|
15 |
+
"model_type": "mixtral",
|
16 |
+
"num_attention_heads": 56,
|
17 |
+
"num_experts_per_tok": 2,
|
18 |
+
"num_hidden_layers": 60,
|
19 |
+
"num_key_value_heads": 8,
|
20 |
+
"num_local_experts": 4,
|
21 |
+
"output_router_logits": false,
|
22 |
+
"pad_token_id": 0,
|
23 |
+
"pretraining_tp": 1,
|
24 |
+
"rms_norm_eps": 1e-05,
|
25 |
+
"rope_scaling": null,
|
26 |
+
"rope_theta": 10000000.0,
|
27 |
+
"router_aux_loss_coef": 0.001,
|
28 |
+
"sliding_window": null,
|
29 |
+
"tie_word_embeddings": false,
|
30 |
+
"torch_dtype": "bfloat16",
|
31 |
+
"transformers_version": "4.39.2",
|
32 |
+
"use_cache": false,
|
33 |
+
"vocab_size": 64000
|
34 |
+
}
|
model-00001-of-00024.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0e7f30c6f4cfcbd40950421fc33fde3a72840bdc88539121c1ba1627e63a971e
|
3 |
+
size 9879789120
|
model-00002-of-00024.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dbbaea93743fd6c576eaad3224015d8bdfee7b03368b89e7583b658befc12ad2
|
3 |
+
size 9865065968
|
model-00003-of-00024.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ad4c01858a055cc7960995778cc71925d3dc94599492a0ad0ef47f6a229594b4
|
3 |
+
size 9806374952
|
model-00004-of-00024.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:75408df210a2387b5ca567940b458d59798a17989eb382f3e334b91bfc856feb
|
3 |
+
size 9997201336
|
model-00005-of-00024.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0ada54dac2b9de4f5947af329d2077e20a29ec5c68764e9a11abff7218fd3f1a
|
3 |
+
size 9967841040
|
model-00006-of-00024.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:482cf8bcd637dc6657337a14c62e64aceed3b1c7cafe7fd460af39e117f0d0e6
|
3 |
+
size 9806375000
|
model-00007-of-00024.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0159ffe8ac1c41f361dc155512870276f9d49441b5fb2fe5ce2747d63b7b358a
|
3 |
+
size 9865066016
|
model-00008-of-00024.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6d09f79b3f16853164d224cce9f9c653acca3772888ca881bce46fb5c0144c8e
|
3 |
+
size 9806375000
|
model-00009-of-00024.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1a76dde992ac9fff0872fb85e30cd2b3780ebf30f1d7296147c7260609160a8a
|
3 |
+
size 9806375000
|
model-00010-of-00024.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6bf5e1663e4c5ae5bd4e2fda2361e9ac837e5ad13d28be1eaaf68fae732f8d72
|
3 |
+
size 9865066016
|
model-00011-of-00024.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ca16eb813b303abf74b01b310707cd9127945ffd6ba1583ffe54c1d90d8d1da8
|
3 |
+
size 9806375000
|
model-00012-of-00024.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9f35c0721bd26c13dfe69144c2e4d9312f6d47fc1bcdd952367c00443ca8d0e9
|
3 |
+
size 9997201392
|
model-00013-of-00024.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e895d9257b9903c04e7cd18c4e325addd8e1b7a99c92380deb34c19cc40757e7
|
3 |
+
size 9967841040
|
model-00014-of-00024.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8e607969e5a2d24eeb17189810c0a498d1df9db1a9ce117b6705691cdd61ed78
|
3 |
+
size 9806375000
|
model-00015-of-00024.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:47c9dd3508df9cfdb3b84226814c184853df94b281282aec9724147da62aa4c9
|
3 |
+
size 9865066016
|
model-00016-of-00024.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a941fb43fac5cc2cd089dae7dafbb6ed99d7a6851878b664914cbcec0037f0d7
|
3 |
+
size 9806375000
|
model-00017-of-00024.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:541a1009d2097d35995921abcbab4a336c027d85b7ea7ba003d47f18c0b92565
|
3 |
+
size 9806375000
|
model-00018-of-00024.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f0fd2cd18bd17bb902929954b6151404fbda56710a03980ce482a54d069d5b0c
|
3 |
+
size 9865066016
|
model-00019-of-00024.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bbfd0c19d0b26d3ffabbecfc96e1a0a9d2d6843f956b92530addd9253f891c67
|
3 |
+
size 9806375000
|
model-00020-of-00024.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b7c37309b1d4372ac6791eabc7ddbb018ba9865b32b318bb2901c4b959510286
|
3 |
+
size 9997201392
|
model-00021-of-00024.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2e6bb8c4310d4bf19a6e91c467b694679b16f4b8bac0897a632a383e71645cf0
|
3 |
+
size 9967841040
|
model-00022-of-00024.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:acd802426f28663efbdff50c5d0198e5493fcbc2f2b469e441e35b75c3c69e1a
|
3 |
+
size 9806375000
|
model-00023-of-00024.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:93d93d5686d2b06c1ae8e5130ac68dfb93b742bd2529155633d0bba1f9319253
|
3 |
+
size 9865066016
|
model-00024-of-00024.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b4f709f13dfe9c5f17a223874b58feab51f88041cd1bf81b3abea7280a0528e3
|
3 |
+
size 297048896
|
model.safetensors.index.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
save_tokenizer/added_tokens.json
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"</s>": 64001,
|
3 |
+
"<s>": 64000
|
4 |
+
}
|
save_tokenizer/special_tokens_map.json
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bos_token": {
|
3 |
+
"content": "<s>",
|
4 |
+
"lstrip": false,
|
5 |
+
"normalized": false,
|
6 |
+
"rstrip": false,
|
7 |
+
"single_word": false
|
8 |
+
},
|
9 |
+
"eos_token": {
|
10 |
+
"content": "</s>",
|
11 |
+
"lstrip": false,
|
12 |
+
"normalized": false,
|
13 |
+
"rstrip": false,
|
14 |
+
"single_word": false
|
15 |
+
},
|
16 |
+
"unk_token": {
|
17 |
+
"content": "<unk>",
|
18 |
+
"lstrip": false,
|
19 |
+
"normalized": false,
|
20 |
+
"rstrip": false,
|
21 |
+
"single_word": false
|
22 |
+
}
|
23 |
+
}
|
save_tokenizer/tokenizer.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:386c49cf943d71aa110361135338c50e38beeff0a66593480421f37b319e1a39
|
3 |
+
size 1033105
|
save_tokenizer/tokenizer_config.json
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"add_bos_token": true,
|
3 |
+
"add_eos_token": false,
|
4 |
+
"add_prefix_space": true,
|
5 |
+
"added_tokens_decoder": {
|
6 |
+
"0": {
|
7 |
+
"content": "<unk>",
|
8 |
+
"lstrip": false,
|
9 |
+
"normalized": false,
|
10 |
+
"rstrip": false,
|
11 |
+
"single_word": false,
|
12 |
+
"special": true
|
13 |
+
},
|
14 |
+
"64000": {
|
15 |
+
"content": "<s>",
|
16 |
+
"lstrip": false,
|
17 |
+
"normalized": false,
|
18 |
+
"rstrip": false,
|
19 |
+
"single_word": false,
|
20 |
+
"special": true
|
21 |
+
},
|
22 |
+
"64001": {
|
23 |
+
"content": "</s>",
|
24 |
+
"lstrip": false,
|
25 |
+
"normalized": false,
|
26 |
+
"rstrip": false,
|
27 |
+
"single_word": false,
|
28 |
+
"special": true
|
29 |
+
}
|
30 |
+
},
|
31 |
+
"bos_token": "<s>",
|
32 |
+
"clean_up_tokenization_spaces": false,
|
33 |
+
"eos_token": "</s>",
|
34 |
+
"legacy": true,
|
35 |
+
"model_max_length": 1000000000000000019884624838656,
|
36 |
+
"pad_token": null,
|
37 |
+
"sp_model_kwargs": {},
|
38 |
+
"spaces_between_special_tokens": false,
|
39 |
+
"tokenizer_class": "LlamaTokenizer",
|
40 |
+
"unk_token": "<unk>",
|
41 |
+
"use_default_system_prompt": false
|
42 |
+
}
|
special_tokens_map.json
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"additional_special_tokens": [
|
3 |
+
"<|im_start|>"
|
4 |
+
],
|
5 |
+
"bos_token": {
|
6 |
+
"content": "<|im_start|>",
|
7 |
+
"lstrip": false,
|
8 |
+
"normalized": false,
|
9 |
+
"rstrip": false,
|
10 |
+
"single_word": false
|
11 |
+
},
|
12 |
+
"eos_token": {
|
13 |
+
"content": "<|im_end|>",
|
14 |
+
"lstrip": false,
|
15 |
+
"normalized": false,
|
16 |
+
"rstrip": false,
|
17 |
+
"single_word": false
|
18 |
+
},
|
19 |
+
"pad_token": "<|im_start|>",
|
20 |
+
"unk_token": {
|
21 |
+
"content": "<unk>",
|
22 |
+
"lstrip": false,
|
23 |
+
"normalized": false,
|
24 |
+
"rstrip": false,
|
25 |
+
"single_word": false
|
26 |
+
}
|
27 |
+
}
|
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
tokenizer.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:386c49cf943d71aa110361135338c50e38beeff0a66593480421f37b319e1a39
|
3 |
+
size 1033105
|
tokenizer_config.json
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"add_bos_token": false,
|
3 |
+
"add_eos_token": false,
|
4 |
+
"add_prefix_space": true,
|
5 |
+
"added_tokens_decoder": {
|
6 |
+
"0": {
|
7 |
+
"content": "<unk>",
|
8 |
+
"lstrip": false,
|
9 |
+
"normalized": false,
|
10 |
+
"rstrip": false,
|
11 |
+
"single_word": false,
|
12 |
+
"special": true
|
13 |
+
},
|
14 |
+
"1": {
|
15 |
+
"content": "<|startoftext|>",
|
16 |
+
"lstrip": false,
|
17 |
+
"normalized": false,
|
18 |
+
"rstrip": false,
|
19 |
+
"single_word": false,
|
20 |
+
"special": true
|
21 |
+
},
|
22 |
+
"2": {
|
23 |
+
"content": "<|endoftext|>",
|
24 |
+
"lstrip": false,
|
25 |
+
"normalized": false,
|
26 |
+
"rstrip": false,
|
27 |
+
"single_word": false,
|
28 |
+
"special": true
|
29 |
+
},
|
30 |
+
"6": {
|
31 |
+
"content": "<|im_start|>",
|
32 |
+
"lstrip": false,
|
33 |
+
"normalized": false,
|
34 |
+
"rstrip": false,
|
35 |
+
"single_word": false,
|
36 |
+
"special": true
|
37 |
+
},
|
38 |
+
"7": {
|
39 |
+
"content": "<|im_end|>",
|
40 |
+
"lstrip": false,
|
41 |
+
"normalized": false,
|
42 |
+
"rstrip": false,
|
43 |
+
"single_word": false,
|
44 |
+
"special": true
|
45 |
+
}
|
46 |
+
},
|
47 |
+
"additional_special_tokens": [
|
48 |
+
"<|im_start|>"
|
49 |
+
],
|
50 |
+
"bos_token": "<|im_start|>",
|
51 |
+
"chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\\n' + system_message + '<|im_end|>\\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}",
|
52 |
+
"clean_up_tokenization_spaces": false,
|
53 |
+
"eos_token": "<|im_end|>",
|
54 |
+
"legacy": true,
|
55 |
+
"model_max_length": 200000,
|
56 |
+
"pad_token": "<|im_start|>",
|
57 |
+
"padding_side": "left",
|
58 |
+
"sp_model_kwargs": {},
|
59 |
+
"spaces_between_special_tokens": false,
|
60 |
+
"split_special_tokens": false,
|
61 |
+
"tokenizer_class": "LlamaTokenizer",
|
62 |
+
"unk_token": "<unk>",
|
63 |
+
"use_default_system_prompt": false,
|
64 |
+
"use_fast": true
|
65 |
+
}
|