Upload preprocessor with huggingface_hub
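A commit like this is typically produced by pushing a local preprocessor directory to the Hub with huggingface_hub. A minimal sketch of that upload step (the repo id below is a hypothetical placeholder, not taken from this commit):

from huggingface_hub import upload_folder

# Push the local "preprocessor" directory (tokenizer.json and
# tokenizer_config.yaml) to a Hub repository. The repo_id is a
# placeholder -- substitute the actual target repository.
upload_folder(
    folder_path="preprocessor",
    path_in_repo="preprocessor",
    repo_id="your-username/your-repo",
    commit_message="Upload preprocessor with huggingface_hub",
)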
preprocessor/tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.
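tokenizer.json holds the full serialized tokenizer (vocabulary plus processing pipeline), which is why its diff is too large to render. Assuming it follows the standard Hugging Face tokenizers serialization, it can be inspected from a local checkout like this (the path mirrors the file added in this commit):

from tokenizers import Tokenizer

# Load the serialized tokenizer; the path assumes a local clone of this repo.
tok = Tokenizer.from_file("preprocessor/tokenizer.json")
print(tok.get_vocab_size())
print(tok.encode("سلام دنیا").tokens)  # sample Persian input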
preprocessor/tokenizer_config.yaml
ADDED
@@ -0,0 +1,29 @@
+name: wordpiece_tokenizer
+config_type: preprocessor
+pretrained_path: hezar-ai/distilbert-fa
+max_length: 512
+truncation_strategy: longest_first
+truncation_direction: right
+stride: 0
+padding_strategy: longest
+padding_direction: right
+pad_to_multiple_of: 0
+pad_token_id: 0
+pad_token: '[PAD]'
+pad_token_type_id: 0
+unk_token: '[UNK]'
+special_tokens:
+- '[UNK]'
+- '[SEP]'
+- '[CLS]'
+- '[PAD]'
+- '[MASK]'
+wordpieces_prefix: '##'
+train_config:
+  name: wordpiece_tokenizer
+  config_type: preprocessor
+  vocab_size: 30000
+  min_frequency: 2
+  limit_alphabet: 1000
+  initial_alphabet: []
+  show_progress: true
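For reference, this config is what Hezar reads back when the preprocessor is loaded from the Hub. A minimal loading sketch, assuming Hezar's Preprocessor.load API; the repo id is taken from pretrained_path above and may differ from the repo this commit actually belongs to:

from hezar.preprocessors import Preprocessor

# Load the WordPiece tokenizer from the Hub; max_length, padding and
# truncation behavior come from tokenizer_config.yaml above.
tokenizer = Preprocessor.load("hezar-ai/distilbert-fa")
inputs = tokenizer(["سلام دنیا"])  # tokenize a sample Persian sentence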