# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 2024
__set_seed: !apply:torch.manual_seed [!ref <seed>]
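
# This is a HyperPyYAML hyperparameter file. A minimal sketch of how a
# SpeechBrain train/evaluation script typically loads it (the loader call is
# standard; the filename and override values below are assumed examples):
#
#     from hyperpyyaml import load_hyperpyyaml
#     with open("hparams/train_whisper.yaml") as f:   # filename assumed
#         hparams = load_hyperpyyaml(f, overrides={"skip_training": False})
#     whisper_model = hparams["whisper"]  # objects below are instantiated at load time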

skip_training: True

# Hparams NEEDED
HPARAMS_NEEDED: ["log_softmax"]
# Modules Needed
MODULES_NEEDED: ["whisper"]

output_folder: output_folder_whisper
pretrained_path: Macedonian-ASR/whisper-large-v3-macedonian-asr
output_wer_folder: !ref <output_folder>/
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt


# HuggingFace hub ID of the largest multilingual OpenAI Whisper model.
whisper_hub: openai/whisper-large-v3

# Normalize the transcripts with the same text normalization used in the Whisper paper (https://cdn.openai.com/papers/whisper.pdf). Refer to Appendix C for further information.
normalized_transcripts: False
restore_capitalization: False

# Data files
language: "macedonian"
data_folder: "../../data/combined_data/speechbrain_splits"
accented_letters: True

ckpt_interval_minutes: 30 # save checkpoint every N min

####################### Training Parameters ####################################
freeze_whisper: False
freeze_encoder: True
number_of_epochs: 50
weight_decay: 0.01
lr_whisper: 1e-5
warmup_steps: 500
max_grad_norm: 2.0
precision: fp16 # bf16, fp16 or fp32
eval_precision: fp16
sample_rate: 16000

# With data_parallel, batch_size is split into N jobs
batch_size: 6
test_batch_size: 1
grad_accumulation_factor: 2
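# Effective batch size per optimizer step = batch_size * grad_accumulation_factor
# = 6 * 2 = 12 (per data_parallel job).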


# Decoding parameters
min_decode_ratio: 0.0
max_decode_ratio: 1.0
test_beam_size: 8

####################### Model Parameters #######################################
train_dataloader_opts:
    batch_size: !ref <batch_size>

valid_dataloader_opts:
    batch_size: !ref <batch_size>

test_dataloader_opts:
    batch_size: !ref <test_batch_size>

epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>

############################## Augmentations ###################################

# Speed perturbation
speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb
   orig_freq: 16000
   speeds: [95, 100, 105]

# Frequency drop: randomly drops a number of frequency bands to zero.
drop_freq: !new:speechbrain.augment.time_domain.DropFreq
   drop_freq_low: 0
   drop_freq_high: 1
   drop_freq_count_low: 1
   drop_freq_count_high: 3
   drop_freq_width: 0.05

# Time drop: randomly drops a number of temporal chunks.
drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
   drop_length_low: 1000
   drop_length_high: 2000
   drop_count_low: 1
   drop_count_high: 5

# Augmenter: Combines previously defined augmentations to perform data augmentation
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
   concat_original: False
   min_augmentations: 1
   max_augmentations: 3
   augment_prob: 0.5
   augmentations: [
      !ref <speed_perturb>,
      !ref <drop_freq>,
      !ref <drop_chunk>]
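
# A rough sketch of how the augmenter is usually applied inside the Brain
# class's compute_forward, only during training (variable and attribute names
# are assumed, not defined in this file):
#
#     if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"):
#         wavs, wav_lens = self.hparams.wav_augment(wavs, wav_lens)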


############################## Models ##########################################


whisper: !new:speechbrain.lobes.models.huggingface_transformers.whisper.Whisper
    source: !ref <whisper_hub>
    freeze: !ref <freeze_whisper>
    freeze_encoder: !ref <freeze_encoder>
    save_path: !ref <save_folder>/whisper_checkpoint
    language: !ref <language>
    task: "transcribe"

log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

nll_loss: !name:speechbrain.nnet.losses.nll_loss
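
# Sketch of how these pieces typically combine into the training objective in
# compute_forward/compute_objectives (tensor names and the exact forward
# signature are assumed from common SpeechBrain Whisper recipes):
#
#     enc_out, logits, _ = self.modules.whisper(wavs, bos_tokens)
#     log_probs = self.hparams.log_softmax(logits)
#     loss = self.hparams.nll_loss(log_probs, tokens_eos, length=tokens_eos_lens)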

modules:
    whisper: !ref <whisper>

############################## Decoding & optimiser ############################

whisper_opt_class: !name:torch.optim.AdamW
    lr: !ref <lr_whisper>
    weight_decay: !ref <weight_decay>

valid_search: !new:speechbrain.decoders.seq2seq.S2SWhisperGreedySearcher
    model: !ref <whisper>
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>

test_search: !new:speechbrain.decoders.seq2seq.S2SWhisperBeamSearcher
    module: [!ref <whisper>]
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>
    beam_size: !ref <test_beam_size>
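
# During validation/test the searchers are typically run on the detached
# encoder output (sketch; variable names assumed; test_search is called the
# same way, with beam search instead of greedy decoding):
#
#     hyps, _, _, _ = self.hparams.valid_search(enc_out.detach(), wav_lens)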

lr_annealing_whisper: !new:speechbrain.nnet.schedulers.NoamScheduler
    lr_initial: !ref <lr_whisper>
    n_warmup_steps: !ref <warmup_steps>
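
# The Noam schedule is usually advanced once per optimizer step, e.g. in
# fit_batch (call site assumed): self.hparams.lr_annealing_whisper(self.optimizer)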

############################## Logging and Pretrainer ##########################

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        whisper: !ref <whisper>
        scheduler_whisper: !ref <lr_annealing_whisper>
        counter: !ref <epoch_counter>

pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        whisper: !ref <whisper>
    paths:
        whisper: !ref <pretrained_path>/model.ckpt
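
# The pretrainer is typically run before evaluation to fetch and load the
# fine-tuned checkpoint referenced by <pretrained_path> (sketch; call site assumed):
#
#     from speechbrain.utils.distributed import run_on_main
#     run_on_main(hparams["pretrainer"].collect_files)
#     hparams["pretrainer"].load_collected()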


train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats

cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
    split_tokens: True
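
# These !name: entries are factories, not instances: the train script typically
# creates fresh metric trackers at the start of each evaluation stage (sketch):
#
#     self.wer_metric = self.hparams.error_rate_computer()
#     self.cer_metric = self.hparams.cer_computer()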