# This config contains the default values for training a large LSTM-CTC ASR model (~170M parameters for the bidirectional and ~130M for the unidirectional variant) with CTC loss and sub-word encoding.

# Architecture and training config:
# Default learning parameters in this config are set for an effective batch size of 2K. To train it with smaller effective
# batch sizes, you may need to re-tune the learning parameters or use a higher accumulate_grad_batches.
# The architecture follows the one suggested in the following paper:
# 'STREAMING END-TO-END SPEECH RECOGNITION FOR MOBILE DEVICES' by Yanzhang He et al. (https://arxiv.org/pdf/1811.06621.pdf)

# You may find more info about LSTM-CTC here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#lstm-transducer
# Pre-trained models of LSTM-CTC can be found here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/results.html

name: "LSTM-CTC-BPE"

model:
  sample_rate: 16000
  log_prediction: true # enables logging sample predictions in the output during training
  ctc_reduction: 'mean_batch'
  skip_nan_grad: false

  train_ds:
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    batch_size: 16 # you may increase batch_size if your memory allows
    shuffle: true
    num_workers: 4
    pin_memory: true
    use_start_end_token: false
    trim_silence: false
    max_duration: 16.7 # it is set for LibriSpeech, you may need to update it for your dataset
    min_duration: 0.1
    # tarred datasets
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    # bucketing params
    bucketing_strategy: "synced_randomized"
    bucketing_batch_size: null

  validation_ds:
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    batch_size: 16 # you may increase batch_size if your memory allows
    shuffle: false
    num_workers: 4
    pin_memory: true
    use_start_end_token: false

  test_ds:
    manifest_filepath: null
    sample_rate: ${model.sample_rate}
    batch_size: 16 # you may increase batch_size if your memory allows
    shuffle: false
    num_workers: 4
    pin_memory: true
    use_start_end_token: false
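  # A tokenizer must be built before training. The lines below sketch how one might build a BPE tokenizer with the
  # NeMo script referenced in the tokenizer section; the exact flags, vocab size and paths are assumptions and may
  # differ between NeMo versions:
  #   python scripts/tokenizers/process_asr_text_tokenizer.py \
  #     --manifest=<path to train manifest> \
  #     --data_root=<output directory for the tokenizer> \
  #     --vocab_size=1024 \
  #     --tokenizer=spe \
  #     --spe_type=bpe
  # The resulting tokenizer directory is then passed as model.tokenizer.dir below.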
  # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py
  tokenizer:
    dir: ???  # path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe)
    type: bpe  # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer)

  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    sample_rate: ${model.sample_rate}
    normalize: "per_feature"
    window_size: 0.025
    window_stride: 0.01
    window: "hann"
    features: 80
    n_fft: 512
    frame_splicing: 1
    dither: 0.00001
    pad_to: 0

  spec_augment:
    _target_: nemo.collections.asr.modules.SpectrogramAugmentation
    freq_masks: 2 # set to zero to disable it
    time_masks: 10 # set to zero to disable it
    freq_width: 27
    time_width: 0.05

  encoder:
    _target_: nemo.collections.asr.modules.RNNEncoder
    feat_in: ${model.preprocessor.features}
    n_layers: 8
    d_model: 2048
    proj_size: 640 # set it if you need an output size different from the default d_model
    rnn_type: "lstm" # it can be lstm, gru or rnn
    bidirectional: true # set it to false if you want to make the model causal

    # Sub-sampling params
    subsampling: stacking # stacking, vggnet or striding
    subsampling_factor: 4
    subsampling_conv_channels: -1 # set to -1 to make it equal to the d_model

    ### regularization
    dropout: 0.2 # the dropout used in the encoder layers

  decoder:
    _target_: nemo.collections.asr.modules.ConvASRDecoder
    feat_in: null
    num_classes: -1
    vocabulary: []

  optim:
    name: adamw
    lr: 5.0
    # optimizer arguments
    betas: [0.9, 0.98]
    weight_decay: 1e-2

    # scheduler setup
    sched:
      name: NoamAnnealing
      d_model: ${model.encoder.d_model}
      # scheduler config override
      warmup_steps: 10000
      warmup_ratio: null
      min_lr: 1e-6

trainer:
  devices: -1 # number of GPUs, -1 would use all available GPUs
  num_nodes: 1
  max_epochs: 500
  max_steps: -1 # computed at runtime if not set
  val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
  accelerator: gpu
  strategy: ddp
  accumulate_grad_batches: 1
  gradient_clip_val: 0.3
  precision: 32 # Should be set to 16 for O1 and O2 to enable AMP.
  log_every_n_steps: 10 # Interval of logging.
  enable_progress_bar: True
  resume_from_checkpoint: null # The path to a checkpoint file to continue the training from; restores the whole state including the epoch, step, LR schedulers, apex, etc.
  num_sanity_val_steps: 0 # number of validation steps to run before training as a sanity check of the validation process; setting it to 0 disables it
  check_val_every_n_epoch: 1 # run validation every n epochs
  sync_batchnorm: true
  enable_checkpointing: False # Provided by exp_manager
  logger: false # Provided by exp_manager
  benchmark: false # needs to be false for models with variable-length speech input as it slows down training

exp_manager:
  exp_dir: null
  name: ${name}
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    # in case of multiple validation sets, the first one is used
    monitor: "val_wer"
    mode: "min"
    save_top_k: 5
    always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints

  # set these two to true to resume training from an existing checkpoint
  resume_if_exists: false
  resume_ignore_no_checkpoint: false

  # You may use this section to create a W&B logger
  create_wandb_logger: false
  wandb_logger_kwargs:
    name: null
    project: null
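
# As noted at the top of this file, the default learning parameters assume an effective batch size of ~2K
# (effective batch size = batch_size * number of GPUs * accumulate_grad_batches).
# The lines below sketch a typical launch command using Hydra overrides; the training script path and override
# values are assumptions and may differ between NeMo versions and setups:
#   python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \
#     --config-path=<directory containing this config> --config-name=<this file's name without .yaml> \
#     model.train_ds.manifest_filepath=<path to train manifest> \
#     model.validation_ds.manifest_filepath=<path to val manifest> \
#     model.tokenizer.dir=<path to tokenizer dir> \
#     trainer.devices=-1 trainer.accumulate_grad_batches=8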