# NeMo/examples/asr/conf/config.yaml
# camenduru's picture
# thanks to NVIDIA ❤
# 7934b29
# Shared values. Each one is anchored so later sections can alias it
# (*name, *sample_rate, *repeat, *dropout, *separable, *labels).
name: &name "QuartzNet15x5"
sample_rate: &sample_rate 16000
repeat: &repeat 1
dropout: &dropout 0.0
separable: &separable true
# Vocabulary: space, the letters a-z, and the apostrophe (28 entries).
labels: &labels [
  " ", "a", "b", "c", "d", "e", "f", "g", "h", "i",
  "j", "k", "l", "m", "n", "o", "p", "q", "r", "s",
  "t", "u", "v", "w", "x", "y", "z", "'"]
# Model definition: datasets, feature extraction, augmentation, encoder,
# decoder and optimizer. Relies on the anchors declared at the top of the file.
model:
  # `???` marks a mandatory value that has to be supplied by the user.
  train_ds:
    manifest_filepath: ???
    sample_rate: *sample_rate
    labels: *labels
    batch_size: 32
    trim_silence: true
    max_duration: 16.7
    shuffle: true
    num_workers: 8
    pin_memory: true
    # tarred datasets
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    # bucketing params
    bucketing_strategy: "synced_randomized"
    bucketing_batch_size: null

  validation_ds:
    manifest_filepath: ???
    sample_rate: *sample_rate
    labels: *labels
    batch_size: 32
    shuffle: false
    num_workers: 8
    pin_memory: true

  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    normalize: "per_feature"
    window_size: 0.02
    sample_rate: *sample_rate
    window_stride: 0.01
    window: "hann"
    # Anchored so the encoder's `feat_in` can alias the same number.
    features: &n_mels 64
    n_fft: 512
    frame_splicing: 1
    dither: 0.00001
    stft_conv: false

  spec_augment:
    _target_: nemo.collections.asr.modules.SpectrogramAugmentation
    rect_freq: 50
    rect_masks: 5
    rect_time: 120

  encoder:
    _target_: nemo.collections.asr.modules.ConvASREncoder
    feat_in: *n_mels
    activation: relu
    conv_mask: true

    # List of block specifications. The first and the last two entries pin
    # `repeat` to 1; the last two also pin `dropout` to 0.0 and disable the
    # residual connection. The middle entries follow the shared anchors.
    jasper:
      - filters: 128
        repeat: 1
        kernel: [11]
        stride: [1]
        dilation: [1]
        dropout: *dropout
        residual: true
        separable: *separable
        se: true
        se_context_size: -1

      - filters: 256
        repeat: *repeat
        kernel: [13]
        stride: [1]
        dilation: [1]
        dropout: *dropout
        residual: true
        separable: *separable
        se: true
        se_context_size: -1

      - filters: 256
        repeat: *repeat
        kernel: [15]
        stride: [1]
        dilation: [1]
        dropout: *dropout
        residual: true
        separable: *separable
        se: true
        se_context_size: -1

      - filters: 256
        repeat: *repeat
        kernel: [17]
        stride: [1]
        dilation: [1]
        dropout: *dropout
        residual: true
        separable: *separable
        se: true
        se_context_size: -1

      - filters: 256
        repeat: *repeat
        kernel: [19]
        stride: [1]
        dilation: [1]
        dropout: *dropout
        residual: true
        separable: *separable
        se: true
        se_context_size: -1

      - filters: 256
        repeat: 1
        kernel: [21]
        stride: [1]
        dilation: [1]
        dropout: 0.0
        residual: false
        separable: *separable
        se: true
        se_context_size: -1

      # Final entry; its width is anchored for the decoder's `feat_in`.
      - filters: &enc_feat_out 1024
        repeat: 1
        kernel: [1]
        stride: [1]
        dilation: [1]
        dropout: 0.0
        residual: false
        separable: *separable
        se: true
        se_context_size: -1

  decoder:
    _target_: nemo.collections.asr.modules.ConvASRDecoder
    # Aliases the last encoder entry's `filters` (1024) so the two values
    # cannot drift apart.
    feat_in: *enc_feat_out
    # Equals the number of entries in the `labels` list.
    num_classes: 28
    vocabulary: *labels

  optim:
    name: novograd
    # _target_: nemo.core.optim.optimizers.Novograd
    lr: 0.01
    # optimizer arguments
    betas: [0.8, 0.5]
    weight_decay: 0.001

    # scheduler setup
    sched:
      name: CosineAnnealing
      # pytorch lightning args
      monitor: val_loss
      reduce_on_plateau: false
      # scheduler params
      warmup_steps: null
      warmup_ratio: null
      min_lr: 0.0
      last_epoch: -1
# Trainer settings. Checkpointing and logging are switched off here because
# exp_manager provides them.
trainer:
  devices: 1  # number of gpus
  max_epochs: 5
  max_steps: -1  # computed at runtime if not set
  num_nodes: 1
  accelerator: gpu
  strategy: ddp
  accumulate_grad_batches: 1
  enable_checkpointing: false  # provided by exp_manager
  logger: false  # provided by exp_manager
  log_every_n_steps: 1  # interval of logging
  # 1.0 checks once per epoch; 0.25 would check four times per epoch.
  val_check_interval: 1.0
  # Needs to be false for models with variable-length speech input, as it
  # slows down training.
  benchmark: false
# Experiment manager settings; `name` aliases the top-level `&name` anchor.
exp_manager:
  exp_dir: null
  name: *name
  create_tensorboard_logger: true
  create_checkpoint_callback: true