Upload preprocessor with huggingface_hub

Files changed (2)

preprocessor/tokenizer.json CHANGED

The diff for this file is too large to render. See the raw diff.

preprocessor/tokenizer_config.yaml CHANGED

@@ -1,12 +1,19 @@
-name: bpe_tokenizer
 config_type: preprocessor
-truncation_strategy: no_truncation
-padding_strategy: no_padding
-continuing_subword_prefix: ''
-end_of_word_suffix: ''
-fuse_unk: false
 train_config:
-  name: bpe_tokenizer
   config_type: preprocessor
   vocab_size: 30000
   min_frequency: 2

+name: wordpiece_tokenizer
 config_type: preprocessor
+max_length: 512
+truncation_strategy: longest_first
+truncation_direction: right
+stride: 0
+padding_strategy: longest
+padding_direction: right
+pad_to_multiple_of: 0
+pad_token_id: 0
+pad_token: '[PAD]'
+pad_token_type_id: 0
+unk_token: '[UNK]'
+wordpieces_prefix: '##'
 train_config:
+  name: wordpiece_tokenizer
   config_type: preprocessor
   vocab_size: 30000
   min_frequency: 2