# Configuration for the "ConvRNNT5x1" model: dataset loaders, model
# architecture (preprocessor, encoder, decoder, joint), decoding, loss,
# optimizer/scheduler, trainer and experiment-manager settings.
#
# Values written as ${...} are interpolations that refer to other keys in
# this file. NOTE(review): `???` looks like the OmegaConf "mandatory value"
# marker, i.e. the key must be supplied by the user - confirm against the
# consuming tool.

name: &name "ConvRNNT5x1"

model:
  sample_rate: 16000
  compute_eval_loss: true
  # Output vocabulary: space, lowercase a-z and apostrophe (28 symbols).
  labels: [
    " ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
    "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"
  ]

  train_ds:
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    batch_size: 32
    trim_silence: true
    max_duration: 16.7
    labels: ${model.labels}
    shuffle: true
    num_workers: 8
    pin_memory: true
    # tarred datasets
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: "scatter"
    shuffle_n: 2048
    # bucketing params
    bucketing_strategy: "synced_randomized"
    bucketing_batch_size: null

  validation_ds:
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    batch_size: 32
    shuffle: false
    labels: ${model.labels}
    num_workers: 8
    pin_memory: true

  test_ds:
    manifest_filepath: null
    sample_rate: ${model.sample_rate}
    batch_size: 32
    shuffle: false
    labels: ${model.labels}
    num_workers: 8
    pin_memory: true

  # Shared values referenced by the encoder / decoder / joint sections below.
  model_defaults:
    repeat: 5
    dropout: 0.0
    separable: true
    se: true
    se_context_size: -1
    # encoder / decoder / joint values
    enc_hidden: 1024
    pred_hidden: 320
    joint_hidden: 320

  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    normalize: "per_feature"
    window_size: 0.02
    sample_rate: ${model.sample_rate}
    window_stride: 0.01
    window: "hann"
    # Anchored so that encoder.feat_in always matches the feature count.
    features: &n_mels 80
    n_fft: 512
    frame_splicing: 1
    dither: 0.00001

  spec_augment:
    _target_: nemo.collections.asr.modules.SpectrogramAugmentation
    rect_freq: 50
    rect_masks: 5
    rect_time: 120

  encoder:
    _target_: nemo.collections.asr.modules.ConvASREncoder
    feat_in: *n_mels
    activation: relu
    conv_mask: true

    # Seven blocks; the blocks with kernel 13, 17 and 21 use stride 2.
    jasper:
      - filters: 128
        repeat: 1
        kernel: [11]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}

      - filters: 256
        repeat: ${model.model_defaults.repeat}
        kernel: [13]
        stride: [2]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        stride_last: true
        residual_mode: "stride_add"

      - filters: 256
        repeat: ${model.model_defaults.repeat}
        kernel: [15]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}

      - filters: 256
        repeat: ${model.model_defaults.repeat}
        kernel: [17]
        stride: [2]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        stride_last: true
        residual_mode: "stride_add"

      - filters: 256
        repeat: ${model.model_defaults.repeat}
        kernel: [19]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}

      - filters: 256
        repeat: 1
        kernel: [21]
        stride: [2]
        dilation: [1]
        dropout: 0.0
        residual: false
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        stride_last: true
        residual_mode: "stride_add"

      - filters: ${model.model_defaults.enc_hidden}
        repeat: 1
        kernel: [1]
        stride: [1]
        dilation: [1]
        dropout: 0.0
        residual: false
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}

  decoder:
    _target_: nemo.collections.asr.modules.RNNTDecoder
    normalization_mode: null
    random_state_sampling: false
    blank_as_pad: true

    prednet:
      pred_hidden: ${model.model_defaults.pred_hidden}
      pred_rnn_layers: 1
      t_max: null
      dropout: 0.0

  joint:
    _target_: nemo.collections.asr.modules.RNNTJoint
    log_softmax: null  # sets it according to cpu/gpu device

    # fused mode
    fuse_loss_wer: false
    fused_batch_size: 1

    jointnet:
      joint_hidden: ${model.model_defaults.joint_hidden}
      activation: "relu"
      dropout: 0.0

  decoding:
    strategy: "greedy_batch"

    # greedy strategy config
    greedy:
      max_symbols: 30

    # beam strategy config
    beam:
      beam_size: 2
      score_norm: true
      # scale the logits by some temperature prior to softmax
      softmax_temperature: 1.0
      # for Time Synchronous Decoding, int > 0
      tsd_max_sym_exp: 10
      # for Alignment-Length Synchronous Decoding, float > 1.0
      alsd_max_target_len: 5.0
      # for modified Adaptive Expansion Search, int > 0
      maes_num_steps: 2
      # for modified Adaptive Expansion Search, int > 0
      maes_prefix_alpha: 1
      # for modified Adaptive Expansion Search, int >= 0
      maes_expansion_beta: 2
      # for modified Adaptive Expansion Search, float >= 0
      maes_expansion_gamma: 2.3

  loss:
    loss_name: "default"

    warprnnt_numba_kwargs:
      # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start.
      fastemit_lambda: 0.0
      # if > 0, applies gradient clamping in range [-clamp, clamp] for the
      # joint tensor only.
      clamp: -1.0

  optim:
    name: adam
    # _target_: nemo.core.optim.optimizers.Adam
    lr: 0.1

    # optimizer arguments
    betas: [0.9, 0.999]
    weight_decay: 0.0001

    # scheduler setup
    sched:
      name: CosineAnnealing

      # scheduler config override
      warmup_steps: null
      warmup_ratio: 0.05
      # Written with an explicit mantissa dot: YAML 1.1 loaders (e.g. PyYAML)
      # resolve a bare `1e-6` as a string rather than a float.
      min_lr: 1.0e-6
      last_epoch: -1

trainer:
  devices: 1  # number of gpus
  max_epochs: 5
  max_steps: -1  # computed at runtime if not set
  num_nodes: 1
  accelerator: gpu
  strategy: ddp
  precision: 32
  accumulate_grad_batches: 1
  enable_checkpointing: false  # Provided by exp_manager
  logger: false  # Provided by exp_manager
  log_every_n_steps: 1  # Interval of logging.
  # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
  val_check_interval: 1.0
  # needs to be false for models with variable-length speech input as it
  # slows down training
  benchmark: false

exp_manager:
  exp_dir: null
  name: *name
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    monitor: "val_wer"
    mode: "min"
  create_wandb_logger: false
  wandb_logger_kwargs:
    name: null
    project: null