HamzaSidhu786 committed on
Commit
515814a
1 Parent(s): 4887973

Upload tokenizer

Browse files
added_tokens.json CHANGED
@@ -1,6 +1,4 @@
1
  {
2
- "</s>": 54,
3
- "<pad>": 56,
4
- "<s>": 53,
5
- "<unk>": 55
6
  }
 
1
  {
2
+ "</s>": 62,
3
+ "<s>": 61
 
 
4
  }
special_tokens_map.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "bos_token": "<s>",
3
  "eos_token": "</s>",
4
- "pad_token": "<pad>",
5
- "unk_token": "<unk>"
6
  }
 
1
  {
2
  "bos_token": "<s>",
3
  "eos_token": "</s>",
4
+ "pad_token": "[PAD]",
5
+ "unk_token": "[UNK]"
6
  }
tokenizer_config.json CHANGED
@@ -40,7 +40,7 @@
40
  "single_word": false,
41
  "special": false
42
  },
43
- "53": {
44
  "content": "<s>",
45
  "lstrip": false,
46
  "normalized": false,
@@ -48,29 +48,13 @@
48
  "single_word": false,
49
  "special": true
50
  },
51
- "54": {
52
  "content": "</s>",
53
  "lstrip": false,
54
  "normalized": false,
55
  "rstrip": false,
56
  "single_word": false,
57
  "special": true
58
- },
59
- "55": {
60
- "content": "<unk>",
61
- "lstrip": false,
62
- "normalized": false,
63
- "rstrip": false,
64
- "single_word": false,
65
- "special": true
66
- },
67
- "56": {
68
- "content": "<pad>",
69
- "lstrip": false,
70
- "normalized": false,
71
- "rstrip": false,
72
- "single_word": false,
73
- "special": true
74
  }
75
  },
76
  "bos_token": "<s>",
@@ -78,10 +62,10 @@
78
  "do_lower_case": false,
79
  "eos_token": "</s>",
80
  "model_max_length": 1000000000000000019884624838656,
81
- "pad_token": "<pad>",
82
  "replace_word_delimiter_char": " ",
83
  "target_lang": null,
84
  "tokenizer_class": "Wav2Vec2CTCTokenizer",
85
- "unk_token": "<unk>",
86
  "word_delimiter_token": "|"
87
  }
 
40
  "single_word": false,
41
  "special": false
42
  },
43
+ "61": {
44
  "content": "<s>",
45
  "lstrip": false,
46
  "normalized": false,
 
48
  "single_word": false,
49
  "special": true
50
  },
51
+ "62": {
52
  "content": "</s>",
53
  "lstrip": false,
54
  "normalized": false,
55
  "rstrip": false,
56
  "single_word": false,
57
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  }
59
  },
60
  "bos_token": "<s>",
 
62
  "do_lower_case": false,
63
  "eos_token": "</s>",
64
  "model_max_length": 1000000000000000019884624838656,
65
+ "pad_token": "[PAD]",
66
  "replace_word_delimiter_char": " ",
67
  "target_lang": null,
68
  "tokenizer_class": "Wav2Vec2CTCTokenizer",
69
+ "unk_token": "[UNK]",
70
  "word_delimiter_token": "|"
71
  }
vocab.json CHANGED
@@ -1,55 +1,63 @@
1
  {
2
- " ": 58,
3
  "[CLS]": 2,
4
  "[MASK]": 4,
5
  "[PAD]": 0,
6
  "[SEP]": 3,
7
  "[UNK]": 1,
8
- "ء": 43,
9
- "آ": 46,
10
- "أ": 34,
11
- "ؤ": 44,
12
- "إ": 45,
13
- "ئ": 39,
14
- "ا": 59,
15
- "ب": 55,
16
- "ة": 40,
17
- "ت": 7,
18
- "ث": 8,
19
- "ج": 9,
20
- "ح": 10,
21
- "خ": 11,
22
- "د": 12,
23
- "ذ": 13,
24
- "ر": 14,
25
- "ز": 15,
26
- "س": 56,
27
- "ش": 17,
28
- "ص": 18,
29
- "ض": 19,
30
- "ط": 20,
31
- "ظ": 21,
32
- "ع": 22,
33
- "غ": 23,
34
- "ف": 24,
35
- "ق": 25,
36
- "ك": 26,
37
- "ل": 61,
38
- "م": 57,
39
- "ن": 29,
40
- "ه": 62,
41
- "و": 31,
 
42
  "ى": 41,
43
- "ي": 32,
44
- "َ": 48,
45
- "ُ": 49,
46
- "ِ": 50,
47
- "ّ": 51,
48
- "ْ": 54,
49
- "ٔ": 47,
50
- "۝": 64,
51
- "۟": 53,
52
- "۩": 63,
53
- "": 66,
54
- "﴿": 65
 
 
 
 
 
 
 
55
  }
 
1
  {
2
+ " ": 5,
3
  "[CLS]": 2,
4
  "[MASK]": 4,
5
  "[PAD]": 0,
6
  "[SEP]": 3,
7
  "[UNK]": 1,
8
+ "ء": 6,
9
+ "آ": 7,
10
+ "أ": 8,
11
+ "ؤ": 9,
12
+ "إ": 10,
13
+ "ئ": 11,
14
+ "ا": 12,
15
+ "ب": 13,
16
+ "ة": 14,
17
+ "ت": 15,
18
+ "ث": 16,
19
+ "ج": 17,
20
+ "ح": 18,
21
+ "خ": 19,
22
+ "د": 20,
23
+ "ذ": 21,
24
+ "ر": 22,
25
+ "ز": 23,
26
+ "س": 24,
27
+ "ش": 25,
28
+ "ص": 26,
29
+ "ض": 27,
30
+ "ط": 28,
31
+ "ظ": 29,
32
+ "ع": 30,
33
+ "غ": 31,
34
+ "ـ": 32,
35
+ "ف": 33,
36
+ "ق": 34,
37
+ "ك": 35,
38
+ "ل": 36,
39
+ "م": 37,
40
+ "ن": 38,
41
+ "ه": 39,
42
+ "و": 40,
43
  "ى": 41,
44
+ "ي": 42,
45
+ "ً": 43,
46
+ "ٌ": 44,
47
+ "ٍ": 45,
48
+ "َ": 46,
49
+ "ُ": 47,
50
+ "ِ": 48,
51
+ "ّ": 49,
52
+ "ْ": 50,
53
+ "ٓ": 51,
54
+ "ٰ": 52,
55
+ "ۖ": 53,
56
+ "ۗ": 54,
57
+ "ۘ": 55,
58
+ "ۙ": 56,
59
+ "ۚ": 57,
60
+ "ۛ": 58,
61
+ "ۜ": 59,
62
+ "۩": 60
63
  }