# asr-crdnn-rnnlm-commonvoice-10.0-de / tokenizer_hyperparams.yaml
# Generated 2022-08-16 from:
# /netscratch/sagar/thesis/speechbrain/recipes/CommonVoice_de/Tokenizer/hparams/1K_unigram_subword_bpe.yaml
# yamllint disable
# ############################################################################
# Tokenizer: SentencePiece subword tokenizer (unigram model, 5K tokens)
# Training data: German CommonVoice, 1,211 hours
# Authors: Abdel Heba 2021
# ############################################################################
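# Usage sketch (not part of the generated hparams): SpeechBrain hparams files
# like this one are loaded with hyperpyyaml, which resolves the !name: tag
# below into a callable. The snippet is kept as comments so the YAML stays
# valid; the file name passed to open() is an assumption.
#
#   from hyperpyyaml import load_hyperpyyaml
#
#   with open("tokenizer_hyperparams.yaml") as fin:
#       hparams = load_hyperpyyaml(fin)
#   # hparams["tokenizer"] is a functools.partial wrapping
#   # speechbrain.tokenizers.SentencePiece.SentencePiece with the keyword
#   # arguments defined at the bottom of this file.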
token_type: unigram # ["unigram", "bpe", "char"]
output_folder: results/unigram
train_log: results/unigram/train_log.txt
# Data files
data_folder: ../CommonVoice/
csv_dir: ../cv_de_acc
train_tsv_file: ../CommonVoice/train.tsv
dev_tsv_file: ../CommonVoice/dev.tsv
test_tsv_file: ../CommonVoice/test.tsv
accented_letters: true
language: de
skip_prep: false
train_csv: ../cv_de_acc/train.csv
valid_csv: ../cv_de_acc/dev.csv
# Training parameters
token_output: 5000 # index(blank/eos/bos/unk) = 0
character_coverage: 1.0
csv_read: wrd
tokenizer: !name:speechbrain.tokenizers.SentencePiece.SentencePiece
    model_dir: results/unigram
    vocab_size: 5000
    annotation_train: ../cv_de_acc/train.csv
    annotation_read: wrd
    model_type: unigram # ["unigram", "bpe", "char"]
    character_coverage: 1.0
    annotation_list_to_check: [../cv_de_acc/train.csv, ../cv_de_acc/dev.csv]
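
# Continuing the sketch above (an assumption about how the recipe's training
# script uses this file): calling the partial builds the SentencePiece
# tokenizer, training a model inside model_dir if none exists yet, and
# exposes the underlying sentencepiece processor as .sp. The German sentence
# is purely illustrative.
#
#   spm = hparams["tokenizer"]()
#   ids = spm.sp.encode_as_ids("das ist ein beispiel")
#   pieces = spm.sp.encode_as_pieces("das ist ein beispiel")
#   print(ids)
#   print(pieces)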