Yuto Oikawa committed on
Commit
2674b45
1 Parent(s): a1957f9

Upload tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +7 -0
  2. tokenizer_config.json +22 -0
  3. vocab.txt +0 -0
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "do_lower_case": false,
4
+ "do_nfkc": false,
5
+ "do_subword_tokenize": true,
6
+ "do_word_tokenize": true,
7
+ "mask_token": "[MASK]",
8
+ "model_max_length": 512,
9
+ "name_or_path": "megagonlabs/transformers-ud-japanese-electra-base-discriminator",
10
+ "pad_token": "[PAD]",
11
+ "sep_token": "[SEP]",
12
+ "special_tokens_map_file": "/root/.cache/huggingface/hub/models--megagonlabs--transformers-ud-japanese-electra-base-discriminator/snapshots/96a3711b754c2caf0d1e22b30cbb893a37fa46c2/special_tokens_map.json",
13
+ "subword_tokenizer_type": "wordpiece",
14
+ "sudachipy_kwargs": {
15
+ "dict_type": "core",
16
+ "split_mode": "A"
17
+ },
18
+ "tokenizer_class": "ElectraSudachipyTokenizer",
19
+ "unk_token": "[UNK]",
20
+ "word_form_type": "dictionary_and_surface",
21
+ "word_tokenizer_type": "sudachipy"
22
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff