speed committed on
Commit
efc4887
·
verified ·
1 Parent(s): 1d7b38c

Upload tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +4 -4
  2. tokenizer.json +33 -12
  3. tokenizer_config.json +10 -12
special_tokens_map.json CHANGED
@@ -7,7 +7,7 @@
7
  "single_word": false
8
  },
9
  "cls_token": {
10
- "content": "<CLS|LLM-jp>",
11
  "lstrip": false,
12
  "normalized": false,
13
  "rstrip": false,
@@ -21,21 +21,21 @@
21
  "single_word": false
22
  },
23
  "mask_token": {
24
- "content": "<MASK|LLM-jp>",
25
  "lstrip": false,
26
  "normalized": false,
27
  "rstrip": false,
28
  "single_word": false
29
  },
30
  "pad_token": {
31
- "content": "<PAD|LLM-jp>",
32
  "lstrip": false,
33
  "normalized": false,
34
  "rstrip": false,
35
  "single_word": false
36
  },
37
  "sep_token": {
38
- "content": "<SEP|LLM-jp>",
39
  "lstrip": false,
40
  "normalized": false,
41
  "rstrip": false,
 
7
  "single_word": false
8
  },
9
  "cls_token": {
10
+ "content": "<s>",
11
  "lstrip": false,
12
  "normalized": false,
13
  "rstrip": false,
 
21
  "single_word": false
22
  },
23
  "mask_token": {
24
+ "content": "<MASK>",
25
  "lstrip": false,
26
  "normalized": false,
27
  "rstrip": false,
28
  "single_word": false
29
  },
30
  "pad_token": {
31
+ "content": "<PAD>",
32
  "lstrip": false,
33
  "normalized": false,
34
  "rstrip": false,
35
  "single_word": false
36
  },
37
  "sep_token": {
38
+ "content": "</s>",
39
  "lstrip": false,
40
  "normalized": false,
41
  "rstrip": false,
tokenizer.json CHANGED
@@ -32,7 +32,7 @@
32
  },
33
  {
34
  "id": 3,
35
- "content": "<MASK|LLM-jp>",
36
  "single_word": false,
37
  "lstrip": false,
38
  "rstrip": false,
@@ -41,7 +41,7 @@
41
  },
42
  {
43
  "id": 4,
44
- "content": "<PAD|LLM-jp>",
45
  "single_word": false,
46
  "lstrip": false,
47
  "rstrip": false,
@@ -50,7 +50,7 @@
50
  },
51
  {
52
  "id": 5,
53
- "content": "<CLS|LLM-jp>",
54
  "single_word": false,
55
  "lstrip": false,
56
  "rstrip": false,
@@ -59,7 +59,7 @@
59
  },
60
  {
61
  "id": 6,
62
- "content": "<SEP|LLM-jp>",
63
  "single_word": false,
64
  "lstrip": false,
65
  "rstrip": false,
@@ -68,7 +68,7 @@
68
  },
69
  {
70
  "id": 7,
71
- "content": "<EOD|LLM-jp>",
72
  "single_word": false,
73
  "lstrip": false,
74
  "rstrip": false,
@@ -110,6 +110,12 @@
110
  "id": "A",
111
  "type_id": 0
112
  }
 
 
 
 
 
 
113
  }
114
  ],
115
  "pair": [
@@ -127,8 +133,8 @@
127
  },
128
  {
129
  "SpecialToken": {
130
- "id": "<s>",
131
- "type_id": 1
132
  }
133
  },
134
  {
@@ -136,9 +142,24 @@
136
  "id": "B",
137
  "type_id": 1
138
  }
 
 
 
 
 
 
139
  }
140
  ],
141
  "special_tokens": {
 
 
 
 
 
 
 
 
 
142
  "<s>": {
143
  "id": "<s>",
144
  "ids": [
@@ -192,23 +213,23 @@
192
  -127.5
193
  ],
194
  [
195
- "<MASK|LLM-jp>",
196
  -127.5
197
  ],
198
  [
199
- "<PAD|LLM-jp>",
200
  -127.5
201
  ],
202
  [
203
- "<CLS|LLM-jp>",
204
  -127.5
205
  ],
206
  [
207
- "<SEP|LLM-jp>",
208
  -127.5
209
  ],
210
  [
211
- "<EOD|LLM-jp>",
212
  -127.5
213
  ],
214
  [
 
32
  },
33
  {
34
  "id": 3,
35
+ "content": "<MASK>",
36
  "single_word": false,
37
  "lstrip": false,
38
  "rstrip": false,
 
41
  },
42
  {
43
  "id": 4,
44
+ "content": "<PAD>",
45
  "single_word": false,
46
  "lstrip": false,
47
  "rstrip": false,
 
50
  },
51
  {
52
  "id": 5,
53
+ "content": "<CLS>",
54
  "single_word": false,
55
  "lstrip": false,
56
  "rstrip": false,
 
59
  },
60
  {
61
  "id": 6,
62
+ "content": "<SEP>",
63
  "single_word": false,
64
  "lstrip": false,
65
  "rstrip": false,
 
68
  },
69
  {
70
  "id": 7,
71
+ "content": "<EOD>",
72
  "single_word": false,
73
  "lstrip": false,
74
  "rstrip": false,
 
110
  "id": "A",
111
  "type_id": 0
112
  }
113
+ },
114
+ {
115
+ "SpecialToken": {
116
+ "id": "</s>",
117
+ "type_id": 0
118
+ }
119
  }
120
  ],
121
  "pair": [
 
133
  },
134
  {
135
  "SpecialToken": {
136
+ "id": "</s>",
137
+ "type_id": 0
138
  }
139
  },
140
  {
 
142
  "id": "B",
143
  "type_id": 1
144
  }
145
+ },
146
+ {
147
+ "SpecialToken": {
148
+ "id": "<s>",
149
+ "type_id": 1
150
+ }
151
  }
152
  ],
153
  "special_tokens": {
154
+ "</s>": {
155
+ "id": "</s>",
156
+ "ids": [
157
+ 2
158
+ ],
159
+ "tokens": [
160
+ "</s>"
161
+ ]
162
+ },
163
  "<s>": {
164
  "id": "<s>",
165
  "ids": [
 
213
  -127.5
214
  ],
215
  [
216
+ "<MASK>",
217
  -127.5
218
  ],
219
  [
220
+ "<PAD>",
221
  -127.5
222
  ],
223
  [
224
+ "<CLS>",
225
  -127.5
226
  ],
227
  [
228
+ "<SEP>",
229
  -127.5
230
  ],
231
  [
232
+ "<EOD>",
233
  -127.5
234
  ],
235
  [
tokenizer_config.json CHANGED
@@ -1,6 +1,4 @@
1
  {
2
- "add_bos_token": true,
3
- "add_eos_token": false,
4
  "added_tokens_decoder": {
5
  "0": {
6
  "content": "<unk>",
@@ -27,7 +25,7 @@
27
  "special": true
28
  },
29
  "3": {
30
- "content": "<MASK|LLM-jp>",
31
  "lstrip": false,
32
  "normalized": false,
33
  "rstrip": false,
@@ -35,7 +33,7 @@
35
  "special": true
36
  },
37
  "4": {
38
- "content": "<PAD|LLM-jp>",
39
  "lstrip": false,
40
  "normalized": false,
41
  "rstrip": false,
@@ -43,7 +41,7 @@
43
  "special": true
44
  },
45
  "5": {
46
- "content": "<CLS|LLM-jp>",
47
  "lstrip": false,
48
  "normalized": false,
49
  "rstrip": false,
@@ -51,7 +49,7 @@
51
  "special": true
52
  },
53
  "6": {
54
- "content": "<SEP|LLM-jp>",
55
  "lstrip": false,
56
  "normalized": false,
57
  "rstrip": false,
@@ -59,7 +57,7 @@
59
  "special": true
60
  },
61
  "7": {
62
- "content": "<EOD|LLM-jp>",
63
  "lstrip": false,
64
  "normalized": false,
65
  "rstrip": false,
@@ -69,14 +67,14 @@
69
  },
70
  "bos_token": "<s>",
71
  "clean_up_tokenization_spaces": false,
72
- "cls_token": "<CLS|LLM-jp>",
73
- "eod_token": "</s>",
74
  "eos_token": "</s>",
75
  "extra_ids": 0,
76
- "mask_token": "<MASK|LLM-jp>",
77
  "model_max_length": 1000000000000000019884624838656,
78
- "pad_token": "<PAD|LLM-jp>",
79
- "sep_token": "<SEP|LLM-jp>",
80
  "sp_model_kwargs": {},
81
  "tokenizer_class": "PreTrainedTokenizerFast",
82
  "unk_token": "<unk>"
 
1
  {
 
 
2
  "added_tokens_decoder": {
3
  "0": {
4
  "content": "<unk>",
 
25
  "special": true
26
  },
27
  "3": {
28
+ "content": "<MASK>",
29
  "lstrip": false,
30
  "normalized": false,
31
  "rstrip": false,
 
33
  "special": true
34
  },
35
  "4": {
36
+ "content": "<PAD>",
37
  "lstrip": false,
38
  "normalized": false,
39
  "rstrip": false,
 
41
  "special": true
42
  },
43
  "5": {
44
+ "content": "<CLS>",
45
  "lstrip": false,
46
  "normalized": false,
47
  "rstrip": false,
 
49
  "special": true
50
  },
51
  "6": {
52
+ "content": "<SEP>",
53
  "lstrip": false,
54
  "normalized": false,
55
  "rstrip": false,
 
57
  "special": true
58
  },
59
  "7": {
60
+ "content": "<EOD>",
61
  "lstrip": false,
62
  "normalized": false,
63
  "rstrip": false,
 
67
  },
68
  "bos_token": "<s>",
69
  "clean_up_tokenization_spaces": false,
70
+ "cls_token": "<s>",
71
+ "eod_token": "<EOD>",
72
  "eos_token": "</s>",
73
  "extra_ids": 0,
74
+ "mask_token": "<MASK>",
75
  "model_max_length": 1000000000000000019884624838656,
76
+ "pad_token": "<PAD>",
77
+ "sep_token": "</s>",
78
  "sp_model_kwargs": {},
79
  "tokenizer_class": "PreTrainedTokenizerFast",
80
  "unk_token": "<unk>"