# NeMo/examples/asr/conf/contextnet_rnnt/contextnet_rnnt_multilang.yaml
# camenduru's picture
# thanks to NVIDIA ❤
# 7934b29
# This config contains the default values for training a modified ContextNet model with Transducer loss and BPE-based vocabulary.
# It also uses the AggregateTokenizer, so that the model is trained on more than one language, one language per tokenizer
# In contrast to original ContextNet, the same number of filters is used throughout the model.
# Default learning parameters in this config are set for effective batch size of 1k on 32 GPUs.
# To train it with smaller batch sizes, you may need to re-tune the learning parameters or use higher accumulate_grad_batches.
# It contains the default values for training a ContextNet ASR model, large size (~144M) with Transducer loss and sub-word encoding.
# Architecture and training config:
# Default learning parameters in this config are set for effective batch size of 1K. To train it with smaller effective
# batch sizes, you may need to re-tune the learning parameters or use higher accumulate_grad_batches.
# Here are the recommended configs for different variants of ContextNet, other parameters are the same as in this config file.
#
# +--------------+---------+------------+
# | Model        | filters | time_masks |
# +==============+=========+============+
# | Small (14M)  |   256   |     2      |
# +--------------+---------+------------+
# | Medium (40M) |   512   |     5      |
# +--------------+---------+------------+
# | Large (145M) |  1024   |     10     |
# +--------------+---------+------------+
# Experiment name. The `&name` anchor lets `exp_manager.name` reuse this value through the `*name` alias.
name: &name "ContextNet-8x-Stride-RNNT-mla"
# Model definition: data loaders, shared defaults, tokenizer, feature extraction,
# encoder / decoder / joint networks, decoding, loss and optimizer settings.
# Values written as ${model.*} are interpolations that point at other keys of this section.
model:
  sample_rate: 16000
  # Eval samples can be very long and exhaust memory. Disable computation of
  # transducer loss during validation/testing with this flag.
  compute_eval_loss: false

  # Training data.
  train_ds:
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    batch_size: 16  # Can be increased if memory allows or when using smaller model
    trim_silence: false
    max_duration: 16.7
    shuffle: true
    use_start_end_token: false
    num_workers: 16
    pin_memory: true
    # tarred datasets
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: "scatter"
    shuffle_n: 2048
    # bucketing params
    bucketing_strategy: "synced_randomized"
    bucketing_batch_size: null

  # Validation data.
  validation_ds:
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    batch_size: 8
    shuffle: false
    use_start_end_token: false
    num_workers: 16
    pin_memory: true

  # Test data (no manifest configured by default).
  test_ds:
    manifest_filepath: null
    sample_rate: ${model.sample_rate}
    batch_size: 8
    shuffle: false
    use_start_end_token: false
    num_workers: 16
    pin_memory: true

  # Shared hyperparameters, referenced below through ${model.model_defaults.*}.
  model_defaults:
    filters: 1024
    repeat: 5
    dropout: 0.1
    separable: true
    se: true
    se_context_size: -1
    kernel_size_factor: 1.0
    # encoder / decoder / joint values
    enc_hidden: 640
    pred_hidden: 640
    joint_hidden: 640

  tokenizer:
    # The AggregateTokenizer is an ordered dict of N monolingual tokenizers, one per language id
    type: agg
    langs:
      # this language id must match the 'lang' field for english samples in the manifest
      en:
        dir: ???  # path to the en tokenizer which contains either tokenizer.model (bpe) or vocab.txt (for wpe)
        type: ???  # Can be either bpe or wpe
      # this language id must match the 'lang' field for spanish samples in the manifest
      es:
        dir: ???  # path to the es tokenizer which contains either tokenizer.model (bpe) or vocab.txt (for wpe)
        type: ???  # Can be either bpe or wpe

  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    sample_rate: ${model.sample_rate}
    normalize: "per_feature"
    window_size: 0.025
    window_stride: 0.01
    window: "hann"
    features: &n_mels 80  # anchored so that encoder.feat_in can alias the same value
    n_fft: 512
    frame_splicing: 1
    dither: 0.00001
    pad_to: 16
    stft_conv: false

  spec_augment:
    _target_: nemo.collections.asr.modules.SpectrogramAugmentation
    freq_masks: 2  # should be kept at 2
    time_masks: 10  # can be 5 for small-med models, 10 for larger models.
    freq_width: 27
    time_width: 0.05

  encoder:
    _target_: nemo.collections.asr.modules.ConvASREncoder
    feat_in: *n_mels
    activation: swish
    conv_mask: true
    init_mode: "tds_uniform"

    # 23 blocks in total. Three of them (blocks 4, 8 and 15) use stride [2];
    # 2 * 2 * 2 = 8 would account for the "8x-Stride" in the config name - TODO confirm.
    jasper:
      # Block 1: input block (single repeat, no residual, no dropout)
      - filters: ${model.model_defaults.filters}
        repeat: 1
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: 0.0
        residual: false
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}

      # Block 2
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 3
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 4: first strided block
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [2]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        stride_last: true
        residual_mode: "stride_add"
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 5
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 6
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 7
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 8: second strided block
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [2]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        stride_last: true
        residual_mode: "stride_add"
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 9
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 10
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 11
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 12
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 13
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 14
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 15: third strided block
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [2]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        stride_last: true
        residual_mode: "stride_add"
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 16
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 17
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 18
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 19
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 20
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 21
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 22
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 23: output block, filter count set from model_defaults.enc_hidden
      - filters: ${model.model_defaults.enc_hidden}
        repeat: 1
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: 0.0
        residual: false
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

  decoder:
    _target_: nemo.collections.asr.modules.RNNTDecoder
    normalization_mode: null  # Currently only null is supported for export.
    random_state_sampling: false  # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf
    blank_as_pad: true  # This flag must be set in order to support exporting of RNNT models + efficient inference.

    prednet:
      pred_hidden: ${model.model_defaults.pred_hidden}
      pred_rnn_layers: 1  # only 1 layer LSTM networks are exportable.
      # Maximum possible target seq length used for Chrono Initialization -
      # https://arxiv.org/abs/1804.11188. Disabled by default.
      t_max: null
      dropout: 0.1

  joint:
    _target_: nemo.collections.asr.modules.RNNTJoint
    log_softmax: null  # sets it according to cpu/gpu device
    preserve_memory: false  # dramatically slows down training, but might preserve some memory

    # Fuses the computation of prediction net + joint net + loss + WER calculation
    # to be run on sub-batches of size `fused_batch_size`.
    # When this flag is set to true, consider the `batch_size` of *_ds to be just `encoder` batch size.
    # `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss.
    # Using small values here will preserve a lot of memory during training, but will make training slower as well.
    # An optimal ratio of fused_batch_size : *_ds.batch_size is 1:1.
    # However, to preserve memory, this ratio can be 1:8 or even 1:16.
    # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow.
    fuse_loss_wer: true
    fused_batch_size: 16

    jointnet:
      joint_hidden: ${model.model_defaults.joint_hidden}
      activation: "relu"
      dropout: 0.1

  # RNNT decoding strategy
  decoding:
    # can be greedy, greedy_batch, beam, tsd, alsd.
    # NOTE(review): the maes_* keys under `beam` suggest a "maes" strategy is also accepted - confirm.
    strategy: "greedy_batch"

    # greedy strategy config
    greedy:
      max_symbols: 10

    # beam strategy config
    beam:
      beam_size: 4
      score_norm: true
      return_best_hypothesis: false
      softmax_temperature: 1.0  # scale the logits by some temperature prior to softmax
      tsd_max_sym_exp: 10  # for Time Synchronous Decoding, int > 0
      alsd_max_target_len: 5.0  # for Alignment-Length Synchronous Decoding, float > 1.0
      maes_num_steps: 2  # for modified Adaptive Expansion Search, int > 0
      maes_prefix_alpha: 1  # for modified Adaptive Expansion Search, int > 0
      maes_expansion_beta: 2  # for modified Adaptive Expansion Search, int >= 0
      maes_expansion_gamma: 2.3  # for modified Adaptive Expansion Search, float >= 0

  # RNNT loss config
  loss:
    loss_name: "default"

    warprnnt_numba_kwargs:
      # FastEmit regularization: https://arxiv.org/abs/2010.11148
      fastemit_lambda: 0.001  # Values can be in range [1e-4, 1e-2]. Generally, 0.001 is good start.
      clamp: -1.0  # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only.

  optim:
    name: novograd
    lr: 0.05

    # optimizer arguments
    betas: [0.9, 0.0]
    weight_decay: 0.001

    # scheduler setup
    sched:
      name: CosineAnnealing

      # scheduler config override
      warmup_steps: 5000
      warmup_ratio: null
      # Written with an explicit decimal point so that YAML 1.1 loaders also read it as a float.
      min_lr: 1.0e-6
      last_epoch: -1
# Trainer settings.
# NOTE(review): the keys look like PyTorch Lightning Trainer arguments - confirm against the training script.
trainer:
  devices: -1  # number of GPUs, -1 would use all available GPUs
  max_epochs: 100
  max_steps: -1  # computed at runtime if not set
  num_nodes: 1  # Should be set via SLURM variable `SLURM_JOB_NUM_NODES`
  accelerator: auto
  strategy: ddp
  accumulate_grad_batches: 1
  enable_checkpointing: false  # Provided by exp_manager
  logger: false  # Provided by exp_manager
  log_every_n_steps: 100  # Interval of logging.
  val_check_interval: 1.0  # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
  # RNNT decoding is slower than CTC, so eval takes longer. Increase value to speed up training slightly.
  check_val_every_n_epoch: 1
  # RNNT requires a lot of memory, so precision 16 is very important. Use very small batch size for precision 32.
  precision: 32
  gradient_clip_val: 1.0  # Gradient norm clip value
  sync_batchnorm: true
  benchmark: false  # needs to be false for models with variable-length speech input as it slows down training
# Experiment manager settings: output directory, loggers, checkpointing and resume behaviour.
exp_manager:
  exp_dir: null
  name: *name  # alias of the top-level `name` anchor
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  # With mode "min", a lower monitored "val_wer" counts as better.
  checkpoint_callback_params:
    monitor: "val_wer"
    mode: "min"
    save_top_k: 3
    always_save_nemo: true
  create_wandb_logger: false
  wandb_logger_kwargs:
    name: null
    project: null
    entity: null
  resume_if_exists: false
  resume_ignore_no_checkpoint: false