hezarai
/

distilbert-fa-pos-lscp-500k

arxyzan committed on Oct 1, 2023

Commit

aa0f586

•

1 Parent(s): d7c677f

Hezar: Upload tokenizer_config.yaml

Files changed (1) hide show

preprocessor/tokenizer_config.yaml CHANGED Viewed

@@ -1,6 +1,5 @@
 name: wordpiece_tokenizer
 config_type: preprocessor
-pretrained_path: hezarai/distilbert-base-fa
 max_length: 512
 truncation_strategy: longest_first
 truncation_direction: right
@@ -8,16 +7,12 @@ stride: 0
 padding_strategy: longest
 padding_direction: right
 pad_to_multiple_of: 0
-pad_token_id: 0
-pad_token: '[PAD]'
 pad_token_type_id: 0
 unk_token: '[UNK]'
-special_tokens:
-- '[UNK]'
-- '[SEP]'
-- '[CLS]'
-- '[PAD]'
-- '[MASK]'
 wordpieces_prefix: '##'
 vocab_size: 30000
 min_frequency: 2

 name: wordpiece_tokenizer
 config_type: preprocessor
 max_length: 512
 truncation_strategy: longest_first
 truncation_direction: right
 padding_strategy: longest
 padding_direction: right
 pad_to_multiple_of: 0
 pad_token_type_id: 0
 unk_token: '[UNK]'
+sep_token: '[SEP]'
+pad_token: '[PAD]'
+cls_token: '[CLS]'
+mask_token: '[MASK]'
 wordpieces_prefix: '##'
 vocab_size: 30000
 min_frequency: 2