alvarodt committed
Commit 1c4f6e3
1 Parent(s): 9808a14

Upload tokenizer

Files changed (5)
  1. merges.txt +0 -0
  2. special_tokens_map.json +7 -5
  3. tokenizer.json +0 -0
  4. tokenizer_config.json +11 -11
  5. vocab.json +0 -0
merges.txt CHANGED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json CHANGED
@@ -1,7 +1,9 @@
  {
- "cls_token": "[CLS]",
- "mask_token": "[MASK]",
- "pad_token": "[PAD]",
- "sep_token": "[SEP]",
- "unk_token": "[UNK]"
+ "bos_token": "<|BOS|>",
+ "cls_token": "<|CLS|>",
+ "eos_token": "<|EOS|>",
+ "mask_token": "<|MASK|>",
+ "pad_token": "<|PAD|>",
+ "sep_token": "<|SEP|>",
+ "unk_token": "<|UNK|>"
  }
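
The new map swaps the BERT-style bracket tokens for GPT-2-style `<|...|>` markers and adds `bos_token`/`eos_token`. A minimal sketch (not part of this commit) of how such a map is typically produced with `transformers`; the save path is a placeholder:

```python
from transformers import AutoTokenizer

# Start from the base GPT-2 tokenizer (matches "name_or_path": "gpt2" below).
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Register the <|...|> special tokens from the new map; GPT-2 has no
# pad/cls/sep/mask tokens by default, so they must be added explicitly.
tokenizer.add_special_tokens({
    "bos_token": "<|BOS|>",
    "eos_token": "<|EOS|>",
    "unk_token": "<|UNK|>",
    "pad_token": "<|PAD|>",
    "cls_token": "<|CLS|>",
    "sep_token": "<|SEP|>",
    "mask_token": "<|MASK|>",
})

# save_pretrained writes (among others) special_tokens_map.json,
# tokenizer_config.json, vocab.json, merges.txt and tokenizer.json --
# the five files touched in this commit. "./tokenizer" is a placeholder path.
tokenizer.save_pretrained("./tokenizer")
```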
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,14 +1,14 @@
  {
- "cls_token": "[CLS]",
- "do_lower_case": true,
- "mask_token": "[MASK]",
- "model_max_length": 66.0,
- "name_or_path": "distilbert-base-uncased",
- "pad_token": "[PAD]",
- "sep_token": "[SEP]",
+ "add_prefix_space": false,
+ "bos_token": "<|BOS|>",
+ "cls_token": "<|CLS|>",
+ "eos_token": "<|EOS|>",
+ "mask_token": "<|MASK|>",
+ "model_max_length": 1024,
+ "name_or_path": "gpt2",
+ "pad_token": "<|PAD|>",
+ "sep_token": "<|SEP|>",
  "special_tokens_map_file": null,
- "strip_accents": null,
- "tokenize_chinese_chars": true,
- "tokenizer_class": "DistilBertTokenizer",
- "unk_token": "[UNK]"
+ "tokenizer_class": "GPT2Tokenizer",
+ "unk_token": "<|UNK|>"
  }
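
The new config switches `tokenizer_class` from `DistilBertTokenizer` to `GPT2Tokenizer`, bases the vocabulary on `gpt2`, and sets `model_max_length` to GPT-2's 1024 (replacing the earlier 66.0). A minimal sketch of reloading the saved tokenizer and checking the config; it reuses the local placeholder path from the sketch above, since the Hub repository id is not shown in this commit:

```python
from transformers import AutoTokenizer

# Reload the tokenizer saved above; with the actual Hub repository this would be
# AutoTokenizer.from_pretrained("<user>/<repo>") -- the repo id is not shown here.
tok = AutoTokenizer.from_pretrained("./tokenizer")

print(type(tok).__name__)      # GPT2TokenizerFast (from "tokenizer_class": "GPT2Tokenizer")
print(tok.model_max_length)    # 1024
print(tok.special_tokens_map)  # {'bos_token': '<|BOS|>', 'eos_token': '<|EOS|>', ...}

# Any model paired with this tokenizer must resize its embeddings to cover the
# added special tokens, e.g. model.resize_token_embeddings(len(tok)).
```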
vocab.json CHANGED
The diff for this file is too large to render. See raw diff