empgces committed on
Commit
c6b4fb6
1 Parent(s): 24272bf

Upload tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +2 -2
  2. tokenizer.json +29 -100
  3. tokenizer_config.json +4 -92
special_tokens_map.json CHANGED
@@ -7,14 +7,14 @@
7
  "single_word": false
8
  },
9
  "eos_token": {
10
- "content": "<|endoftext|>",
11
  "lstrip": false,
12
  "normalized": false,
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
  "pad_token": {
17
- "content": "<|placeholder6|>",
18
  "lstrip": false,
19
  "normalized": false,
20
  "rstrip": false,
 
7
  "single_word": false
8
  },
9
  "eos_token": {
10
+ "content": "</s>",
11
  "lstrip": false,
12
  "normalized": false,
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
  "pad_token": {
17
+ "content": "<unk>",
18
  "lstrip": false,
19
  "normalized": false,
20
  "rstrip": false,
tokenizer.json CHANGED
@@ -26,108 +26,9 @@
26
  "content": "</s>",
27
  "single_word": false,
28
  "lstrip": false,
29
- "rstrip": true,
30
- "normalized": false,
31
- "special": false
32
- },
33
- {
34
- "id": 32000,
35
- "content": "<|endoftext|>",
36
- "single_word": false,
37
- "lstrip": false,
38
- "rstrip": false,
39
- "normalized": false,
40
- "special": true
41
- },
42
- {
43
- "id": 32001,
44
- "content": "<|assistant|>",
45
- "single_word": false,
46
- "lstrip": false,
47
- "rstrip": true,
48
- "normalized": false,
49
- "special": true
50
- },
51
- {
52
- "id": 32002,
53
- "content": "<|placeholder1|>",
54
- "single_word": false,
55
- "lstrip": false,
56
- "rstrip": true,
57
- "normalized": false,
58
- "special": true
59
- },
60
- {
61
- "id": 32003,
62
- "content": "<|placeholder2|>",
63
- "single_word": false,
64
- "lstrip": false,
65
- "rstrip": true,
66
- "normalized": false,
67
- "special": true
68
- },
69
- {
70
- "id": 32004,
71
- "content": "<|placeholder3|>",
72
- "single_word": false,
73
- "lstrip": false,
74
- "rstrip": true,
75
- "normalized": false,
76
- "special": true
77
- },
78
- {
79
- "id": 32005,
80
- "content": "<|placeholder4|>",
81
- "single_word": false,
82
- "lstrip": false,
83
- "rstrip": true,
84
- "normalized": false,
85
- "special": true
86
- },
87
- {
88
- "id": 32006,
89
- "content": "<|system|>",
90
- "single_word": false,
91
- "lstrip": false,
92
- "rstrip": true,
93
- "normalized": false,
94
- "special": true
95
- },
96
- {
97
- "id": 32007,
98
- "content": "<|end|>",
99
- "single_word": false,
100
- "lstrip": false,
101
- "rstrip": true,
102
- "normalized": false,
103
- "special": true
104
- },
105
- {
106
- "id": 32008,
107
- "content": "<|placeholder5|>",
108
- "single_word": false,
109
- "lstrip": false,
110
- "rstrip": true,
111
- "normalized": false,
112
- "special": true
113
- },
114
- {
115
- "id": 32009,
116
- "content": "<|placeholder6|>",
117
- "single_word": false,
118
- "lstrip": false,
119
  "rstrip": false,
120
  "normalized": false,
121
  "special": true
122
- },
123
- {
124
- "id": 32010,
125
- "content": "<|user|>",
126
- "single_word": false,
127
- "lstrip": false,
128
- "rstrip": true,
129
- "normalized": false,
130
- "special": true
131
  }
132
  ],
133
  "normalizer": {
@@ -150,6 +51,12 @@
150
  "post_processor": {
151
  "type": "TemplateProcessing",
152
  "single": [
 
 
 
 
 
 
153
  {
154
  "Sequence": {
155
  "id": "A",
@@ -158,12 +65,24 @@
158
  }
159
  ],
160
  "pair": [
 
 
 
 
 
 
161
  {
162
  "Sequence": {
163
  "id": "A",
164
  "type_id": 0
165
  }
166
  },
 
 
 
 
 
 
167
  {
168
  "Sequence": {
169
  "id": "B",
@@ -171,7 +90,17 @@
171
  }
172
  }
173
  ],
174
- "special_tokens": {}
 
 
 
 
 
 
 
 
 
 
175
  },
176
  "decoder": {
177
  "type": "Sequence",
 
26
  "content": "</s>",
27
  "single_word": false,
28
  "lstrip": false,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  "rstrip": false,
30
  "normalized": false,
31
  "special": true
 
 
 
 
 
 
 
 
 
32
  }
33
  ],
34
  "normalizer": {
 
51
  "post_processor": {
52
  "type": "TemplateProcessing",
53
  "single": [
54
+ {
55
+ "SpecialToken": {
56
+ "id": "<s>",
57
+ "type_id": 0
58
+ }
59
+ },
60
  {
61
  "Sequence": {
62
  "id": "A",
 
65
  }
66
  ],
67
  "pair": [
68
+ {
69
+ "SpecialToken": {
70
+ "id": "<s>",
71
+ "type_id": 0
72
+ }
73
+ },
74
  {
75
  "Sequence": {
76
  "id": "A",
77
  "type_id": 0
78
  }
79
  },
80
+ {
81
+ "SpecialToken": {
82
+ "id": "<s>",
83
+ "type_id": 1
84
+ }
85
+ },
86
  {
87
  "Sequence": {
88
  "id": "B",
 
90
  }
91
  }
92
  ],
93
+ "special_tokens": {
94
+ "<s>": {
95
+ "id": "<s>",
96
+ "ids": [
97
+ 1
98
+ ],
99
+ "tokens": [
100
+ "<s>"
101
+ ]
102
+ }
103
+ }
104
  },
105
  "decoder": {
106
  "type": "Sequence",
tokenizer_config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "add_bos_token": false,
3
  "add_eos_token": false,
4
  "add_prefix_space": null,
5
  "added_tokens_decoder": {
@@ -23,106 +23,18 @@
23
  "content": "</s>",
24
  "lstrip": false,
25
  "normalized": false,
26
- "rstrip": true,
27
- "single_word": false,
28
- "special": false
29
- },
30
- "32000": {
31
- "content": "<|endoftext|>",
32
- "lstrip": false,
33
- "normalized": false,
34
  "rstrip": false,
35
  "single_word": false,
36
  "special": true
37
- },
38
- "32001": {
39
- "content": "<|assistant|>",
40
- "lstrip": false,
41
- "normalized": false,
42
- "rstrip": true,
43
- "single_word": false,
44
- "special": true
45
- },
46
- "32002": {
47
- "content": "<|placeholder1|>",
48
- "lstrip": false,
49
- "normalized": false,
50
- "rstrip": true,
51
- "single_word": false,
52
- "special": true
53
- },
54
- "32003": {
55
- "content": "<|placeholder2|>",
56
- "lstrip": false,
57
- "normalized": false,
58
- "rstrip": true,
59
- "single_word": false,
60
- "special": true
61
- },
62
- "32004": {
63
- "content": "<|placeholder3|>",
64
- "lstrip": false,
65
- "normalized": false,
66
- "rstrip": true,
67
- "single_word": false,
68
- "special": true
69
- },
70
- "32005": {
71
- "content": "<|placeholder4|>",
72
- "lstrip": false,
73
- "normalized": false,
74
- "rstrip": true,
75
- "single_word": false,
76
- "special": true
77
- },
78
- "32006": {
79
- "content": "<|system|>",
80
- "lstrip": false,
81
- "normalized": false,
82
- "rstrip": true,
83
- "single_word": false,
84
- "special": true
85
- },
86
- "32007": {
87
- "content": "<|end|>",
88
- "lstrip": false,
89
- "normalized": false,
90
- "rstrip": true,
91
- "single_word": false,
92
- "special": true
93
- },
94
- "32008": {
95
- "content": "<|placeholder5|>",
96
- "lstrip": false,
97
- "normalized": false,
98
- "rstrip": true,
99
- "single_word": false,
100
- "special": true
101
- },
102
- "32009": {
103
- "content": "<|placeholder6|>",
104
- "lstrip": false,
105
- "normalized": false,
106
- "rstrip": false,
107
- "single_word": false,
108
- "special": true
109
- },
110
- "32010": {
111
- "content": "<|user|>",
112
- "lstrip": false,
113
- "normalized": false,
114
- "rstrip": true,
115
- "single_word": false,
116
- "special": true
117
  }
118
  },
119
  "bos_token": "<s>",
120
- "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
121
  "clean_up_tokenization_spaces": false,
122
- "eos_token": "<|endoftext|>",
123
  "legacy": false,
124
  "model_max_length": 4096,
125
- "pad_token": "<|placeholder6|>",
126
  "padding_side": "left",
127
  "sp_model_kwargs": {},
128
  "tokenizer_class": "LlamaTokenizer",
 
1
  {
2
+ "add_bos_token": true,
3
  "add_eos_token": false,
4
  "add_prefix_space": null,
5
  "added_tokens_decoder": {
 
23
  "content": "</s>",
24
  "lstrip": false,
25
  "normalized": false,
 
 
 
 
 
 
 
 
26
  "rstrip": false,
27
  "single_word": false,
28
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  }
30
  },
31
  "bos_token": "<s>",
32
+ "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
33
  "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
  "legacy": false,
36
  "model_max_length": 4096,
37
+ "pad_token": "<unk>",
38
  "padding_side": "left",
39
  "sp_model_kwargs": {},
40
  "tokenizer_class": "LlamaTokenizer",