samikhan121 committed on
Commit
382ac5e
·
verified ·
1 Parent(s): 8f7c7ab

Upload tokenizer

Browse files
Files changed (5) hide show
  1. README.md +0 -1
  2. added_tokens.json +5 -0
  3. special_tokens_map.json +14 -2
  4. tokenizer_config.json +44 -2
  5. vocab.json +13 -0
README.md CHANGED
@@ -1,4 +1,3 @@
1
-
2
  ---
3
  license: cc-by-nc-4.0
4
  tags:
 
 
1
  ---
2
  license: cc-by-nc-4.0
3
  tags:
added_tokens.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "ড়": 87,
3
+ "ঢ়": 88,
4
+ "য়": 89
5
+ }
special_tokens_map.json CHANGED
@@ -1,4 +1,16 @@
1
  {
2
- "pad_token": "6",
3
- "unk_token": "<unk>"
 
 
 
 
 
 
 
 
 
 
 
 
4
  }
 
1
  {
2
+ "pad_token": {
3
+ "content": "6",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "unk_token": {
10
+ "content": "<unk>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ }
16
  }
tokenizer_config.json CHANGED
@@ -1,10 +1,52 @@
1
  {
2
- "add_blank": true,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  "clean_up_tokenization_spaces": true,
4
  "is_uroman": false,
5
  "language": "ben",
6
  "model_max_length": 1000000000000000019884624838656,
7
- "normalize": true,
8
  "pad_token": "6",
9
  "phonemize": false,
10
  "tokenizer_class": "VitsTokenizer",
 
1
  {
2
+ "add_blank": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "6",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "74": {
13
+ "content": "<unk>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "87": {
21
+ "content": "ড়",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": false
27
+ },
28
+ "88": {
29
+ "content": "ঢ়",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": false
35
+ },
36
+ "89": {
37
+ "content": "য়",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": false
43
+ }
44
+ },
45
  "clean_up_tokenization_spaces": true,
46
  "is_uroman": false,
47
  "language": "ben",
48
  "model_max_length": 1000000000000000019884624838656,
49
+ "normalize": false,
50
  "pad_token": "6",
51
  "phonemize": false,
52
  "tokenizer_class": "VitsTokenizer",
vocab.json CHANGED
@@ -40,7 +40,9 @@
40
  "ট": 55,
41
  "ঠ": 73,
42
  "ড": 21,
 
43
  "ঢ": 22,
 
44
  "ণ": 51,
45
  "ত": 42,
46
  "থ": 14,
@@ -53,6 +55,7 @@
53
  "ভ": 16,
54
  "ম": 41,
55
  "য": 38,
 
56
  "র": 69,
57
  "ল": 10,
58
  "শ": 48,
@@ -72,5 +75,15 @@
72
  "ৌ": 44,
73
  "্": 36,
74
  "ৎ": 52,
 
 
 
 
 
 
 
 
 
 
75
  "—": 32
76
  }
 
40
  "ট": 55,
41
  "ঠ": 73,
42
  "ড": 21,
43
+ "ড়": 84,
44
  "ঢ": 22,
45
+ "ঢ়": 85,
46
  "ণ": 51,
47
  "ত": 42,
48
  "থ": 14,
 
55
  "ভ": 16,
56
  "ম": 41,
57
  "য": 38,
58
+ "য়": 86,
59
  "র": 69,
60
  "ল": 10,
61
  "শ": 48,
 
75
  "ৌ": 44,
76
  "্": 36,
77
  "ৎ": 52,
78
+ "০": 74,
79
+ "১": 75,
80
+ "২": 76,
81
+ "৩": 77,
82
+ "৪": 78,
83
+ "৫": 79,
84
+ "৬": 80,
85
+ "৭": 81,
86
+ "৮": 82,
87
+ "৯": 83,
88
  "—": 32
89
  }