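# Training configuration for an RNN-Transducer ASR model with a ContextNet-style
# convolutional encoder ("ContextNet-8x-Stride-RNNT-mla") and an aggregate
# tokenizer covering the `en` and `es` languages.
# Top-level sections: name, model, trainer, exp_manager.
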
# Experiment/model name. The &name anchor lets exp_manager.name reuse this exact
# string through the *name alias instead of repeating it.
name: &name "ContextNet-8x-Stride-RNNT-mla"

# Model definition. Values shared between sub-sections are declared once and
# referenced with ${model.*} interpolations. `_target_` entries name the NeMo
# module class that each sub-section configures.
# NOTE(review): `???` looks like the OmegaConf mandatory-value placeholder, i.e.
# those entries must be supplied by the user — confirm against the launcher.
model:
  sample_rate: 16000
  compute_eval_loss: false

  # Training data.
  train_ds:
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    batch_size: 16
    trim_silence: false
    max_duration: 16.7
    shuffle: true
    use_start_end_token: false
    num_workers: 16
    pin_memory: true

    # Tarred-dataset options.
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: "scatter"
    shuffle_n: 2048

    # Bucketing options.
    bucketing_strategy: "synced_randomized"
    bucketing_batch_size: null

  # Validation data.
  validation_ds:
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    batch_size: 8
    shuffle: false
    use_start_end_token: false
    num_workers: 16
    pin_memory: true

  # Test data (no manifest configured by default).
  test_ds:
    manifest_filepath: null
    sample_rate: ${model.sample_rate}
    batch_size: 8
    shuffle: false
    use_start_end_token: false
    num_workers: 16
    pin_memory: true

  # Defaults consumed by the encoder blocks, decoder and joint below through
  # ${model.model_defaults.*} interpolations.
  model_defaults:
    filters: 1024
    repeat: 5
    dropout: 0.1
    separable: true
    se: true
    se_context_size: -1
    kernel_size_factor: 1.0

    # Hidden sizes for the encoder output, prediction network and joint network.
    enc_hidden: 640
    pred_hidden: 640
    joint_hidden: 640

  # Aggregate tokenizer: one tokenizer directory/type per language.
  tokenizer:
    type: agg
    langs:
      en:
        dir: ???
        type: ???
      es:
        dir: ???
        type: ???

  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    sample_rate: ${model.sample_rate}
    normalize: "per_feature"
    window_size: 0.025
    window_stride: 0.01
    window: "hann"
    # &n_mels is reused as the encoder's feat_in via *n_mels.
    features: &n_mels 80
    n_fft: 512
    frame_splicing: 1
    dither: 0.00001
    pad_to: 16
    stft_conv: false

  spec_augment:
    _target_: nemo.collections.asr.modules.SpectrogramAugmentation
    freq_masks: 2
    time_masks: 10
    freq_width: 27
    time_width: 0.05

  encoder:
    _target_: nemo.collections.asr.modules.ConvASREncoder
    feat_in: *n_mels
    activation: swish
    conv_mask: true
    init_mode: "tds_uniform"

    # 23 blocks in total. Blocks 3, 7 and 14 use stride 2; 2 * 2 * 2 gives the
    # 8x overall stride referred to in the config name.
    jasper:
      # Block 0: single repeat, no residual, no dropout.
      - filters: ${model.model_defaults.filters}
        repeat: 1
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: 0.0
        residual: false
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}

      # Block 1
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 2
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 3: stride 2 (first of three strided blocks).
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [2]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        stride_last: true
        residual_mode: "stride_add"
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 4
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 5
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 6
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 7: stride 2 (second of three strided blocks).
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [2]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        stride_last: true
        residual_mode: "stride_add"
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 8
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 9
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 10
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 11
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 12
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 13
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 14: stride 2 (third of three strided blocks).
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [2]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        stride_last: true
        residual_mode: "stride_add"
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 15
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 16
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 17
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 18
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 19
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 20
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 21
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 22: final block; its filter count is enc_hidden rather than the
      # shared `filters` default. Single repeat, no residual, no dropout.
      - filters: ${model.model_defaults.enc_hidden}
        repeat: 1
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: 0.0
        residual: false
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

  decoder:
    _target_: nemo.collections.asr.modules.RNNTDecoder
    normalization_mode: null
    random_state_sampling: false
    blank_as_pad: true

    prednet:
      pred_hidden: ${model.model_defaults.pred_hidden}
      pred_rnn_layers: 1
      t_max: null
      dropout: 0.1

  joint:
    _target_: nemo.collections.asr.modules.RNNTJoint
    log_softmax: null
    preserve_memory: false

    fuse_loss_wer: true
    fused_batch_size: 16

    jointnet:
      joint_hidden: ${model.model_defaults.joint_hidden}
      activation: "relu"
      dropout: 0.1

  # Decoding: `strategy` selects the active method; the `greedy` and `beam`
  # sub-sections hold the per-strategy settings.
  decoding:
    strategy: "greedy_batch"

    greedy:
      max_symbols: 10

    beam:
      beam_size: 4
      score_norm: true
      return_best_hypothesis: false
      softmax_temperature: 1.0
      tsd_max_sym_exp: 10
      alsd_max_target_len: 5.0
      maes_num_steps: 2
      maes_prefix_alpha: 1
      maes_expansion_beta: 2
      maes_expansion_gamma: 2.3

  loss:
    loss_name: "default"

    warprnnt_numba_kwargs:
      fastemit_lambda: 0.001
      clamp: -1.0

  optim:
    name: novograd
    lr: 0.05

    # Optimizer arguments.
    betas: [0.9, 0.0]
    weight_decay: 0.001

    # Learning-rate scheduler.
    sched:
      name: CosineAnnealing

      warmup_steps: 5000
      warmup_ratio: null
      # Written with an explicit decimal point so that YAML 1.1 loaders
      # (e.g. PyYAML) also resolve it as a float instead of the string "1e-6".
      min_lr: 1.0e-6
      last_epoch: -1

# Trainer settings.
# NOTE(review): the keys look like PyTorch Lightning Trainer arguments — confirm
# against the training script that consumes this section.
trainer:
  devices: -1
  max_epochs: 100
  max_steps: -1
  num_nodes: 1
  accelerator: auto
  strategy: ddp
  accumulate_grad_batches: 1
  # Checkpointing and logging are switched off here; exp_manager below enables
  # its own checkpoint callback and TensorBoard logger.
  enable_checkpointing: false
  logger: false
  log_every_n_steps: 100
  val_check_interval: 1.0
  check_val_every_n_epoch: 1
  precision: 32
  gradient_clip_val: 1.0
  sync_batchnorm: true
  benchmark: false

# Experiment manager: output directory, loggers, checkpointing and resume flags.
exp_manager:
  exp_dir: null
  # Reuses the top-level `name` value through the *name alias.
  name: *name
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  # Keep the 3 checkpoints with the lowest "val_wer".
  checkpoint_callback_params:
    monitor: "val_wer"
    mode: "min"
    save_top_k: 3
    always_save_nemo: true
  create_wandb_logger: false
  wandb_logger_kwargs:
    name: null
    project: null
    entity: null
  resume_if_exists: false
  resume_ignore_no_checkpoint: false