# Tokenizer preprocessor configuration.
# Top-level keys hold the identification, truncation, padding and
# special-token settings; `train_config` holds a nested mapping of
# training settings.
name: wordpiece_tokenizer
config_type: preprocessor
pretrained_path: hezarai/bert-base-fa

# Truncation settings.
max_length: 512
truncation_strategy: longest_first
truncation_direction: right
stride: 0

# Padding settings.
padding_strategy: longest
padding_direction: right
# NOTE(review): 0 presumably means "no multiple constraint" -- confirm
# against the consumer of this config.
pad_to_multiple_of: 0
pad_token_id: 0
pad_token: '[PAD]'
pad_token_type_id: 0

# Special tokens. Values are quoted because a plain scalar starting with
# `[` would be parsed as a flow sequence, and `#` can start a comment.
unk_token: '[UNK]'
special_tokens:
- '[UNK]'
- '[SEP]'
- '[CLS]'
- '[PAD]'
- '[MASK]'
wordpieces_prefix: '##'

# Nested training settings.
# NOTE(review): `name` and `config_type` repeat the top-level keys, so the
# entries below are assumed to belong under `train_config`; left at column 0
# they would be duplicate top-level keys and `train_config` would be null.
# Confirm the intended nesting.
train_config:
  name: wordpiece_tokenizer
  config_type: preprocessor
  vocab_size: 30000
  min_frequency: 2
  limit_alphabet: 1000
  initial_alphabet: []
  show_progress: true