# NeMo/examples/asr/conf/contextnet_rnnt/config_rnnt_bpe.yaml
# camenduru: "thanks to NVIDIA ❤" (7934b29)
# Experiment name. The &name anchor lets a later `name: *name` entry in this
# file reuse the same string through the *name alias.
name: &name "ConvRNNTBPE5x1"
# Model definition. Values written as ${model.*} are interpolations that
# resolve against keys nested inside this mapping, so the nesting below is
# significant. `???` marks a value with no default that must be supplied.
model:
  # Referenced by the datasets and the preprocessor via ${model.sample_rate}.
  sample_rate: 16000
  compute_eval_loss: true

  tokenizer:
    # path to directory which contains either tokenizer.model (bpe) or
    # vocab.txt (for wpe)
    dir: ???
    type: ???  # Can be either bpe or wpe

  train_ds:
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    batch_size: 32
    trim_silence: true
    max_duration: 16.7
    labels: []
    shuffle: true
    num_workers: 8
    pin_memory: true
    # tarred datasets
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    # bucketing params
    bucketing_strategy: "synced_randomized"
    bucketing_batch_size: null

  validation_ds:
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    batch_size: 32
    shuffle: false
    labels: []
    num_workers: 8
    pin_memory: true

  # Same layout as validation_ds, but the manifest defaults to null.
  test_ds:
    manifest_filepath: null
    sample_rate: ${model.sample_rate}
    batch_size: 32
    shuffle: false
    labels: []
    num_workers: 8
    pin_memory: true

  # Shared hyperparameters; the encoder, decoder and joint sections pull
  # these in through ${model.model_defaults.*} interpolations.
  model_defaults:
    repeat: 5
    dropout: 0.0
    separable: true
    se: true
    se_context_size: -1
    # encoder / decoder / joint values
    enc_hidden: 1024
    pred_hidden: 320
    joint_hidden: 320

  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    normalize: "per_feature"
    window_size: 0.02
    sample_rate: ${model.sample_rate}
    window_stride: 0.01
    window: "hann"
    # The &n_mels anchor is reused as encoder.feat_in through *n_mels.
    features: &n_mels 80
    n_fft: 512
    frame_splicing: 1
    dither: 0.00001

  spec_augment:
    _target_: nemo.collections.asr.modules.SpectrogramAugmentation
    rect_freq: 50
    rect_masks: 5
    rect_time: 120

  encoder:
    _target_: nemo.collections.asr.modules.ConvASREncoder
    feat_in: *n_mels
    activation: relu
    conv_mask: true

    # One list entry per encoder block. Every entry with stride [2] also
    # sets stride_last and residual_mode: "stride_add".
    jasper:
      - filters: 128
        repeat: 1
        kernel: [11]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}

      - filters: 256
        repeat: ${model.model_defaults.repeat}
        kernel: [13]
        stride: [2]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        stride_last: true
        residual_mode: "stride_add"

      - filters: 256
        repeat: ${model.model_defaults.repeat}
        kernel: [15]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}

      - filters: 256
        repeat: ${model.model_defaults.repeat}
        kernel: [17]
        stride: [2]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        stride_last: true
        residual_mode: "stride_add"

      - filters: 256
        repeat: ${model.model_defaults.repeat}
        kernel: [19]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}

      - filters: 256
        repeat: 1
        kernel: [21]
        stride: [2]
        dilation: [1]
        dropout: 0.0
        residual: false
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        stride_last: true
        residual_mode: "stride_add"

      # Final block: its filter count comes from model_defaults.enc_hidden.
      - filters: ${model.model_defaults.enc_hidden}
        repeat: 1
        kernel: [1]
        stride: [1]
        dilation: [1]
        dropout: 0.0
        residual: false
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}

  decoder:
    _target_: nemo.collections.asr.modules.RNNTDecoder
    normalization_mode: null
    random_state_sampling: false
    blank_as_pad: true

    prednet:
      pred_hidden: ${model.model_defaults.pred_hidden}
      pred_rnn_layers: 1
      t_max: null
      dropout: 0.0

  joint:
    _target_: nemo.collections.asr.modules.RNNTJoint
    log_softmax: null  # sets it according to cpu/gpu device
    # fused mode
    fuse_loss_wer: false
    fused_batch_size: 1

    jointnet:
      joint_hidden: ${model.model_defaults.joint_hidden}
      activation: "relu"
      dropout: 0.0

  decoding:
    strategy: "greedy_batch"

    # greedy strategy config
    greedy:
      max_symbols: 10

    # beam strategy config
    beam:
      beam_size: 2
      score_norm: true
      # scale the logits by some temperature prior to softmax
      softmax_temperature: 1.0
      # for Time Synchronous Decoding, int > 0
      tsd_max_sym_exp: 10
      # for Alignment-Length Synchronous Decoding, float > 1.0
      alsd_max_target_len: 5.0
      # for modified Adaptive Expansion Search, int > 0
      maes_num_steps: 2
      # for modified Adaptive Expansion Search, int > 0
      maes_prefix_alpha: 1
      # for modified Adaptive Expansion Search, int >= 0
      maes_expansion_beta: 2
      # for modified Adaptive Expansion Search, float >= 0
      maes_expansion_gamma: 2.3

  loss:
    loss_name: "default"

    warprnnt_numba_kwargs:
      # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start.
      fastemit_lambda: 0.0
      # if > 0, applies gradient clamping in range [-clamp, clamp] for the
      # joint tensor only.
      clamp: -1.0

  optim:
    name: adam
    # _target_: nemo.core.optim.optimizers.Adam
    lr: 0.1

    # optimizer arguments
    betas: [0.9, 0.999]
    weight_decay: 0.0001

    # scheduler setup
    sched:
      name: CosineAnnealing

      # scheduler config override
      warmup_steps: null
      warmup_ratio: 0.05
      # Written with a mantissa dot so YAML 1.1 loaders also read a float
      # rather than the string "1e-6".
      min_lr: 1.0e-6
      last_epoch: -1
# Trainer settings. Checkpointing and the logger are switched off here
# because, per the inline notes, exp_manager provides them.
trainer:
  devices: 1  # number of gpus
  max_epochs: 5
  max_steps: -1  # computed at runtime if not set
  num_nodes: 1
  accelerator: gpu
  strategy: ddp
  precision: 32
  accumulate_grad_batches: 1
  enable_checkpointing: false  # Provided by exp_manager
  logger: false  # Provided by exp_manager
  log_every_n_steps: 1  # Interval of logging.
  # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
  val_check_interval: 1.0
  # needs to be false for models with variable-length speech input as it
  # slows down training
  benchmark: false
# Experiment manager settings: TensorBoard logging and checkpoint callback
# are enabled, W&B logging is disabled.
exp_manager:
  exp_dir: null
  # Alias of the &name anchor defined on the top-level `name` key.
  name: *name
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  # The checkpoint callback monitors "val_wer" in "min" mode.
  checkpoint_callback_params:
    monitor: "val_wer"
    mode: "min"
  create_wandb_logger: false
  wandb_logger_kwargs:
    name: null
    project: null