# This config contains the default values for training a large LSTM-CTC ASR model (~170M parameters for the bidirectional and ~130M for the unidirectional variant) with CTC loss and sub-word encoding.

# Architecture and training config:
# Default learning parameters in this config are set for an effective batch size of 2K. To train it with smaller effective
# batch sizes, you may need to re-tune the learning parameters or use a higher accumulate_grad_batches.
# The architecture follows the one suggested in the following paper:
# 'STREAMING END-TO-END SPEECH RECOGNITION FOR MOBILE DEVICES' by Yanzhang He et al. (https://arxiv.org/pdf/1811.06621.pdf)

# You may find more info about LSTM-CTC here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#lstm-transducer
# Pre-trained models of LSTM-CTC can be found here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/results.html

name: "LSTM-CTC-BPE"

model:
  sample_rate: 16000
  log_prediction: true # enables logging sample predictions in the output during training
  ctc_reduction: 'mean_batch'
  skip_nan_grad: false

  train_ds:
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    batch_size: 16 # you may increase batch_size if your memory allows
    shuffle: true
    num_workers: 4
    pin_memory: true
    use_start_end_token: false
    trim_silence: false
    max_duration: 16.7 # it is set for LibriSpeech, you may need to update it for your dataset
    min_duration: 0.1
    # tarred datasets
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    # bucketing params
    bucketing_strategy: "synced_randomized"
    bucketing_batch_size: null

  validation_ds:
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    batch_size: 16 # you may increase batch_size if your memory allows
    shuffle: false
    num_workers: 4
    pin_memory: true
    use_start_end_token: false

  test_ds:
    manifest_filepath: null
    sample_rate: ${model.sample_rate}
    batch_size: 16 # you may increase batch_size if your memory allows
    shuffle: false
    num_workers: 4
    pin_memory: true
    use_start_end_token: false
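  # A tokenizer must be built before training. The lines below sketch how one might build a BPE tokenizer with the
  # NeMo script referenced in the tokenizer section; the exact flags, vocab size and paths are assumptions and may
  # differ between NeMo versions:
  #   python scripts/tokenizers/process_asr_text_tokenizer.py \
  #     --manifest=<path to train manifest> \
  #     --data_root=<output directory for the tokenizer> \
  #     --vocab_size=1024 \
  #     --tokenizer=spe \
  #     --spe_type=bpe
  # The resulting tokenizer directory is then passed as model.tokenizer.dir below.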
  # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py
  tokenizer:
    dir: ???  # path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe)
    type: bpe  # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer)

  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    sample_rate: ${model.sample_rate}
    normalize: "per_feature"
    window_size: 0.025
    window_stride: 0.01
    window: "hann"
    features: 80
    n_fft: 512
    frame_splicing: 1
    dither: 0.00001
    pad_to: 0

  spec_augment:
    _target_: nemo.collections.asr.modules.SpectrogramAugmentation
    freq_masks: 2 # set to zero to disable it
    time_masks: 10 # set to zero to disable it
    freq_width: 27
    time_width: 0.05

  encoder:
    _target_: nemo.collections.asr.modules.RNNEncoder
    feat_in: ${model.preprocessor.features}
    n_layers: 8
    d_model: 2048
    proj_size: 640 # set it if you need an output size different from the default d_model
    rnn_type: "lstm" # it can be lstm, gru or rnn
    bidirectional: true # set it to false if you want to make the model causal

    # Sub-sampling params
    subsampling: stacking # stacking, vggnet or striding
    subsampling_factor: 4
    subsampling_conv_channels: -1 # set to -1 to make it equal to the d_model

    ### regularization
    dropout: 0.2 # the dropout used in the encoder layers

  decoder:
    _target_: nemo.collections.asr.modules.ConvASRDecoder
    feat_in: null
    num_classes: -1
    vocabulary: []

  optim:
    name: adamw
    lr: 5.0
    # optimizer arguments
    betas: [0.9, 0.98]
    weight_decay: 1e-2

    # scheduler setup
    sched:
      name: NoamAnnealing
      d_model: ${model.encoder.d_model}
      # scheduler config override
      warmup_steps: 10000
      warmup_ratio: null
      min_lr: 1e-6

trainer:
  devices: -1 # number of GPUs, -1 would use all available GPUs
  num_nodes: 1
  max_epochs: 500
  max_steps: -1 # computed at runtime if not set
  val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
  accelerator: gpu
  strategy: ddp
  accumulate_grad_batches: 1
  gradient_clip_val: 0.3
  precision: 32 # Should be set to 16 for O1 and O2 to enable AMP.
  log_every_n_steps: 10 # Interval of logging.
  enable_progress_bar: True
  resume_from_checkpoint: null # The path to a checkpoint file to continue the training from; restores the whole state including the epoch, step, LR schedulers, apex, etc.
  num_sanity_val_steps: 0 # number of validation steps to run before training as a sanity check of the validation process; setting it to 0 disables it
  check_val_every_n_epoch: 1 # run validation every n epochs
  sync_batchnorm: true
  enable_checkpointing: False # Provided by exp_manager
  logger: false # Provided by exp_manager
  benchmark: false # needs to be false for models with variable-length speech input as it slows down training

exp_manager:
  exp_dir: null
  name: ${name}
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    # in case of multiple validation sets, the first one is used
    monitor: "val_wer"
    mode: "min"
    save_top_k: 5
    always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints

  # set these two to true to resume training from an existing checkpoint
  resume_if_exists: false
  resume_ignore_no_checkpoint: false

  # You may use this section to create a W&B logger
  create_wandb_logger: false
  wandb_logger_kwargs:
    name: null
    project: null
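
# As noted at the top of this file, the default learning parameters assume an effective batch size of ~2K
# (effective batch size = batch_size * number of GPUs * accumulate_grad_batches).
# The lines below sketch a typical launch command using Hydra overrides; the training script path and override
# values are assumptions and may differ between NeMo versions and setups:
#   python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \
#     --config-path=<directory containing this config> --config-name=<this file's name without .yaml> \
#     model.train_ds.manifest_filepath=<path to train manifest> \
#     model.validation_ds.manifest_filepath=<path to val manifest> \
#     model.tokenizer.dir=<path to tokenizer dir> \
#     trainer.devices=-1 trainer.accumulate_grad_batches=8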