goldfish-models committed on
Commit
71888fa
1 Parent(s): dd8b0bb

Upload knc_arab_10mb tokenizer.

Browse files
added_tokens.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"[XXXXX69]": 26541, "[XXXXX11]": 26483, "[XXXXX49]": 26521, "[XXXXX35]": 26507, "[XXXXX79]": 26551, "[XXXXX67]": 26539, "[XXXXX139]": 26611, "[XXXXX21]": 26493, "[XXXXX48]": 26520, "[XXXXX81]": 26553, "[XXXXX63]": 26535, "[XXXXX83]": 26555, "[XXXXX133]": 26605, "[XXXXX76]": 26548, "[XXXXX112]": 26584, "[CLS]": 26468, "[XXXXX60]": 26532, "[XXXXX131]": 26603, "[XXXXX126]": 26598, "[XXXXX37]": 26509, "[XXXXX46]": 26518, "[XXXXX62]": 26534, "[XXXXX106]": 26578, "[XXXXX58]": 26530, "[XXXXX72]": 26544, "[XXXXX130]": 26602, "[XXXXX100]": 26572, "[XXXXX74]": 26546, "[XXXXX18]": 26490, "[XXXXX140]": 26612, "[XXXXX121]": 26593, "[XXXXX111]": 26583, "[SEP]": 26469, "[XXXXX96]": 26568, "[XXXXX107]": 26579, "[XXXXX132]": 26604, "[XXXXX33]": 26505, "[XXXXX108]": 26580, "[XXXXX147]": 26619, "[XXXXX15]": 26487, "[XXXXX30]": 26502, "[XXXXX119]": 26591, "[XXXXX50]": 26522, "[XXXXX145]": 26617, "[XXXXX45]": 26517, "[XXXXX68]": 26540, "[XXXXX138]": 26610, "[XXXXX0]": 26472, "[XXXXX23]": 26495, "[XXXXX80]": 26552, "[XXXXX151]": 26623, "[XXXXX22]": 26494, "[XXXXX93]": 26565, "[XXXXX97]": 26569, "[XXXXX9]": 26481, "[XXXXX29]": 26501, "[XXXXX24]": 26496, "[XXXXX95]": 26567, "[XXXXX4]": 26476, "[XXXXX27]": 26499, "[XXXXX14]": 26486, "[XXXXX105]": 26577, "[XXXXX59]": 26531, "[XXXXX143]": 26615, "[XXXXX38]": 26510, "[XXXXX3]": 26475, "[XXXXX146]": 26618, "[XXXXX31]": 26503, "[XXXXX104]": 26576, "[XXXXX77]": 26549, "[XXXXX125]": 26597, "[XXXXX19]": 26491, "[XXXXX7]": 26479, "[XXXXX103]": 26575, "[XXXXX28]": 26500, "[XXXXX42]": 26514, "[XXXXX82]": 26554, "[XXXXX44]": 26516, "[XXXXX98]": 26570, "[XXXXX118]": 26590, "[XXXXX52]": 26524, "[XXXXX2]": 26474, "[XXXXX144]": 26616, "[XXXXX141]": 26613, "[MASK]": 26471, "[XXXXX136]": 26608, "[XXXXX99]": 26571, "[XXXXX53]": 26525, "[XXXXX110]": 26582, "[XXXXX116]": 26588, "[XXXXX12]": 26484, "[XXXXX117]": 26589, "[XXXXX91]": 26563, "[XXXXX36]": 26508, "[XXXXX85]": 26557, "[XXXXX56]": 26528, "[XXXXX73]": 26545, "[XXXXX101]": 26573, "[XXXXX75]": 26547, 
"[XXXXX92]": 26564, "[XXXXX20]": 26492, "[XXXXX6]": 26478, "[XXXXX113]": 26585, "[XXXXX65]": 26537, "[XXXXX26]": 26498, "[XXXXX109]": 26581, "[XXXXX70]": 26542, "[XXXXX40]": 26512, "[XXXXX78]": 26550, "[XXXXX86]": 26558, "[XXXXX134]": 26606, "[XXXXX123]": 26595, "[XXXXX39]": 26511, "[XXXXX84]": 26556, "[XXXXX89]": 26561, "[XXXXX10]": 26482, "[XXXXX137]": 26609, "[XXXXX64]": 26536, "[XXXXX142]": 26614, "[XXXXX148]": 26620, "[XXXXX114]": 26586, "[XXXXX55]": 26527, "[XXXXX94]": 26566, "[XXXXX88]": 26560, "[XXXXX66]": 26538, "[XXXXX71]": 26543, "[XXXXX32]": 26504, "[XXXXX129]": 26601, "[XXXXX13]": 26485, "[XXXXX34]": 26506, "[XXXXX102]": 26574, "[XXXXX47]": 26519, "[XXXXX61]": 26533, "[XXXXX135]": 26607, "<pad>": 26470, "[XXXXX120]": 26592, "[XXXXX17]": 26489, "[XXXXX127]": 26599, "[XXXXX90]": 26562, "[XXXXX5]": 26477, "[XXXXX51]": 26523, "[XXXXX57]": 26529, "[XXXXX122]": 26594, "[XXXXX41]": 26513, "[XXXXX25]": 26497, "[XXXXX1]": 26473, "[XXXXX124]": 26596, "[XXXXX87]": 26559, "[XXXXX54]": 26526, "[XXXXX8]": 26480, "[XXXXX149]": 26621, "[XXXXX150]": 26622, "[XXXXX43]": 26515, "[XXXXX115]": 26587, "[XXXXX128]": 26600, "[XXXXX16]": 26488}
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "<unk>", "sep_token": "[SEP]", "pad_token": "<pad>", "cls_token": "[CLS]", "mask_token": {"content": "[MASK]", "single_word": false, "lstrip": true, "rstrip": false, "normalized": false}, "additional_special_tokens": ["[XXXXX0]", "[XXXXX1]", "[XXXXX2]", "[XXXXX3]", "[XXXXX4]", "[XXXXX5]", "[XXXXX6]", "[XXXXX7]", "[XXXXX8]", "[XXXXX9]", "[XXXXX10]", "[XXXXX11]", "[XXXXX12]", "[XXXXX13]", "[XXXXX14]", "[XXXXX15]", "[XXXXX16]", "[XXXXX17]", "[XXXXX18]", "[XXXXX19]", "[XXXXX20]", "[XXXXX21]", "[XXXXX22]", "[XXXXX23]", "[XXXXX24]", "[XXXXX25]", "[XXXXX26]", "[XXXXX27]", "[XXXXX28]", "[XXXXX29]", "[XXXXX30]", "[XXXXX31]", "[XXXXX32]", "[XXXXX33]", "[XXXXX34]", "[XXXXX35]", "[XXXXX36]", "[XXXXX37]", "[XXXXX38]", "[XXXXX39]", "[XXXXX40]", "[XXXXX41]", "[XXXXX42]", "[XXXXX43]", "[XXXXX44]", "[XXXXX45]", "[XXXXX46]", "[XXXXX47]", "[XXXXX48]", "[XXXXX49]", "[XXXXX50]", "[XXXXX51]", "[XXXXX52]", "[XXXXX53]", "[XXXXX54]", "[XXXXX55]", "[XXXXX56]", "[XXXXX57]", "[XXXXX58]", "[XXXXX59]", "[XXXXX60]", "[XXXXX61]", "[XXXXX62]", "[XXXXX63]", "[XXXXX64]", "[XXXXX65]", "[XXXXX66]", "[XXXXX67]", "[XXXXX68]", "[XXXXX69]", "[XXXXX70]", "[XXXXX71]", "[XXXXX72]", "[XXXXX73]", "[XXXXX74]", "[XXXXX75]", "[XXXXX76]", "[XXXXX77]", "[XXXXX78]", "[XXXXX79]", "[XXXXX80]", "[XXXXX81]", "[XXXXX82]", "[XXXXX83]", "[XXXXX84]", "[XXXXX85]", "[XXXXX86]", "[XXXXX87]", "[XXXXX88]", "[XXXXX89]", "[XXXXX90]", "[XXXXX91]", "[XXXXX92]", "[XXXXX93]", "[XXXXX94]", "[XXXXX95]", "[XXXXX96]", "[XXXXX97]", "[XXXXX98]", "[XXXXX99]", "[XXXXX100]", "[XXXXX101]", "[XXXXX102]", "[XXXXX103]", "[XXXXX104]", "[XXXXX105]", "[XXXXX106]", "[XXXXX107]", "[XXXXX108]", "[XXXXX109]", "[XXXXX110]", "[XXXXX111]", "[XXXXX112]", "[XXXXX113]", "[XXXXX114]", "[XXXXX115]", "[XXXXX116]", "[XXXXX117]", "[XXXXX118]", "[XXXXX119]", "[XXXXX120]", "[XXXXX121]", "[XXXXX122]", "[XXXXX123]", "[XXXXX124]", "[XXXXX125]", "[XXXXX126]", "[XXXXX127]", "[XXXXX128]", "[XXXXX129]", 
"[XXXXX130]", "[XXXXX131]", "[XXXXX132]", "[XXXXX133]", "[XXXXX134]", "[XXXXX135]", "[XXXXX136]", "[XXXXX137]", "[XXXXX138]", "[XXXXX139]", "[XXXXX140]", "[XXXXX141]", "[XXXXX142]", "[XXXXX143]", "[XXXXX144]", "[XXXXX145]", "[XXXXX146]", "[XXXXX147]", "[XXXXX148]", "[XXXXX149]", "[XXXXX150]", "[XXXXX151]"]}
spiece.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:911cb4d3835bfc5777d41e391e26e23591a3bb7fa438f333a41be3afcae71f9a
3
+ size 662830
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"do_lower_case": false, "remove_space": true, "keep_accents": true, "bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "<unk>", "sep_token": "[SEP]", "pad_token": "<pad>", "cls_token": "[CLS]", "mask_token": {"content": "[MASK]", "single_word": false, "lstrip": true, "rstrip": false, "normalized": false, "__type": "AddedToken"}, "sp_model_kwargs": {}, "name_or_path": "models/10mb/knc_arab_10mb", "model_input_names": ["input_ids", "attention_mask"], "special_tokens_map_file": "models/10mb/knc_arab_10mb/special_tokens_map.json", "tokenizer_class": "AlbertTokenizer"}