NeMo

File size: 7,185 Bytes

7934b29

name: &name "ConvRNNTBPE5x1"

model:
  sample_rate: 16000
  compute_eval_loss: true

  tokenizer:
    dir: ???  # path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe)
    type: ???  # Can be either bpe or wpe

  train_ds:
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    batch_size: 32
    trim_silence: true
    max_duration: 16.7
    labels: []
    shuffle: true
    num_workers: 8
    pin_memory: true
    # tarred datasets
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    # bucketing params
    bucketing_strategy: "synced_randomized"
    bucketing_batch_size: null

  validation_ds:
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    batch_size: 32
    shuffle: false
    labels: []
    num_workers: 8
    pin_memory: true

  test_ds:
    manifest_filepath: null
    sample_rate: ${model.sample_rate}
    batch_size: 32
    shuffle: false
    labels: []
    num_workers: 8
    pin_memory: true

  model_defaults:
    repeat: 5
    dropout: 0.0
    separable: true
    se: true
    se_context_size: -1
    # encoder / decoder / joint values
    enc_hidden: 1024
    pred_hidden: 320
    joint_hidden: 320

  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    normalize: "per_feature"
    window_size: 0.02
    sample_rate: ${model.sample_rate}
    window_stride: 0.01
    window: "hann"
    features: &n_mels 80
    n_fft: 512
    frame_splicing: 1
    dither: 0.00001

  spec_augment:
    _target_: nemo.collections.asr.modules.SpectrogramAugmentation
    rect_freq: 50
    rect_masks: 5
    rect_time: 120

  encoder:
    _target_: nemo.collections.asr.modules.ConvASREncoder
    feat_in: *n_mels
    activation: relu
    conv_mask: true

    jasper:
      - filters: 128
        repeat: 1
        kernel: [11]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}

      - filters: 256
        repeat: ${model.model_defaults.repeat}
        kernel: [13]
        stride: [2]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        stride_last: true
        residual_mode: "stride_add"

      - filters: 256
        repeat: ${model.model_defaults.repeat}
        kernel: [15]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}

      - filters: 256
        repeat: ${model.model_defaults.repeat}
        kernel: [17]
        stride: [2]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        stride_last: true
        residual_mode: "stride_add"

      - filters: 256
        repeat: ${model.model_defaults.repeat}
        kernel: [19]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}

      - filters: 256
        repeat: 1
        kernel: [21]
        stride: [2]
        dilation: [1]
        dropout: 0.0
        residual: false
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        stride_last: true
        residual_mode: "stride_add"

      - filters: ${model.model_defaults.enc_hidden}
        repeat: 1
        kernel: [1]
        stride: [1]
        dilation: [1]
        dropout: 0.0
        residual: false
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}

  decoder:
    _target_: nemo.collections.asr.modules.RNNTDecoder
    normalization_mode: null
    random_state_sampling: false
    blank_as_pad: true

    prednet:
      pred_hidden: ${model.model_defaults.pred_hidden}
      pred_rnn_layers: 1
      t_max: null
      dropout: 0.0

  joint:
    _target_: nemo.collections.asr.modules.RNNTJoint
    log_softmax: null  # sets it according to cpu/gpu device

    # fused mode
    fuse_loss_wer: false
    fused_batch_size: 1

    jointnet:
      joint_hidden: ${model.model_defaults.joint_hidden}
      activation: "relu"
      dropout: 0.0

  decoding:
    strategy: "greedy_batch"

    # greedy strategy config
    greedy:
      max_symbols: 10

    # beam strategy config
    beam:
      beam_size: 2
      score_norm: true
      softmax_temperature: 1.0  # scale the logits by some temperature prior to softmax
      tsd_max_sym_exp: 10  # for Time Synchronous Decoding, int > 0
      alsd_max_target_len: 5.0  # for Alignment-Length Synchronous Decoding, float > 1.0
      maes_num_steps: 2  # for modified Adaptive Expansion Search, int > 0
      maes_prefix_alpha: 1  # for modified Adaptive Expansion Search, int > 0
      maes_expansion_beta: 2  # for modified Adaptive Expansion Search, int >= 0
      maes_expansion_gamma: 2.3  # for modified Adaptive Expansion Search, float >= 0

  loss:
    loss_name: "default"
    warprnnt_numba_kwargs:
      fastemit_lambda: 0.0  # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start.
      clamp: -1.0  # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only.

  optim:
    name: adam
    # _target_: nemo.core.optim.optimizers.Adam
    lr: .1

    # optimizer arguments
    betas: [0.9, 0.999]
    weight_decay: 0.0001

    # scheduler setup
    sched:
      name: CosineAnnealing

      # scheduler config override
      warmup_steps: null
      warmup_ratio: 0.05
      min_lr: 1e-6
      last_epoch: -1

trainer:
  devices: 1 # number of gpus
  max_epochs: 5
  max_steps: -1 # computed at runtime if not set
  num_nodes: 1
  accelerator: gpu
  strategy: ddp
  precision: 32
  accumulate_grad_batches: 1
  enable_checkpointing: False  # Provided by exp_manager
  logger: False  # Provided by exp_manager
  log_every_n_steps: 1  # Interval of logging.
  val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
  benchmark: false # needs to be false for models with variable-length speech input as it slows down training

exp_manager:
  exp_dir: null
  name: *name
  create_tensorboard_logger: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: "val_wer"
    mode: "min"
  create_wandb_logger: False
  wandb_logger_kwargs:
    name: null
    project: null