hadidev committed on
Commit
ff35605
1 Parent(s): b727688

add tokenizer

Browse files
Files changed (4) hide show
  1. merges.txt +1 -1
  2. special_tokens_map.json +3 -5
  3. tokenizer_config.json +3 -29
  4. vocab.json +2 -2
merges.txt CHANGED
@@ -1,4 +1,4 @@
1
- #version: 0.2
2
  Ġ Ø
3
  Û Į
4
  Ø §
 
1
+ #version: 0.2 - Trained by `huggingface/tokenizers`
2
  Ġ Ø
3
  Û Į
4
  Ø §
special_tokens_map.json CHANGED
@@ -1,7 +1,5 @@
1
  {
2
- "bos_token": "<s>",
3
- "eos_token": "</s>",
4
- "mask_token": "<mask>",
5
- "pad_token": "<pad>",
6
- "unk_token": "<unk>"
7
  }
 
1
  {
2
+ "bos_token": "<|endoftext|>",
3
+ "eos_token": "<|endoftext|>",
4
+ "unk_token": "<|endoftext|>"
 
 
5
  }
tokenizer_config.json CHANGED
@@ -1,33 +1,7 @@
1
  {
2
- "add_bos_token": false,
3
  "add_prefix_space": false,
4
- "bos_token": {
5
- "__type": "AddedToken",
6
- "content": "<|endoftext|>",
7
- "lstrip": false,
8
- "normalized": true,
9
- "rstrip": false,
10
- "single_word": false
11
- },
12
- "eos_token": {
13
- "__type": "AddedToken",
14
- "content": "<|endoftext|>",
15
- "lstrip": false,
16
- "normalized": true,
17
- "rstrip": false,
18
- "single_word": false
19
- },
20
- "errors": "replace",
21
- "name_or_path": "hadidev/gpt2-urdu-tokenizer",
22
- "pad_token": null,
23
- "special_tokens_map_file": "/root/.cache/huggingface/transformers/fd05696fd7c524ed400d964c4d1fa66c6435bc0d588c55f7ac98c7c850c7cc5a.b7f8742f1d370b815979aeabc401aed45c79760724667d2725ac7503c242b97f",
24
  "tokenizer_class": "GPT2Tokenizer",
25
- "unk_token": {
26
- "__type": "AddedToken",
27
- "content": "<|endoftext|>",
28
- "lstrip": false,
29
- "normalized": true,
30
- "rstrip": false,
31
- "single_word": false
32
- }
33
  }
 
1
  {
 
2
  "add_prefix_space": false,
3
+ "bos_token": "<|endoftext|>",
4
+ "eos_token": "<|endoftext|>",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  "tokenizer_class": "GPT2Tokenizer",
6
+ "unk_token": "<|endoftext|>"
 
 
 
 
 
 
 
7
  }
vocab.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ab6656bcf981ae51e1548a3879bf0ddc1167b1605dbaaccb71c7dcb5f8240e31
3
- size 10897434
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e6a8e0b20ac6772046974b5e3f8998106d98de43dad56aa82f2552754875006
3
+ size 9733956