# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 2024
__set_seed: !apply:torch.manual_seed [!ref <seed>]
skip_training: True

# Hparams needed by the inference interface
HPARAMS_NEEDED: ["log_softmax"]
# Modules needed by the inference interface
MODULES_NEEDED: ["whisper"]

output_folder: !ref output_folder_whisper
pretrained_path: Macedonian-ASR/whisper-large-v3-macedonian-asr
output_wer_folder: !ref <output_folder>/
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# HuggingFace hub name of the multilingual Whisper model to fine-tune.
whisper_hub: openai/whisper-large-v3

# Normalize transcripts with the same normalization used in the Whisper paper
# (https://cdn.openai.com/papers/whisper.pdf). Refer to Appendix C for details.
normalized_transcripts: False
restore_capitalization: False

# Data files
language: "macedonian"
data_folder: "../../data/combined_data/speechbrain_splits"
accented_letters: True
ckpt_interval_minutes: 30 # save checkpoint every N min

####################### Training Parameters ####################################
freeze_whisper: False
freeze_encoder: True
number_of_epochs: 50
weight_decay: 0.01
lr_whisper: 1e-5
warmup_steps: 500
max_grad_norm: 2.0
precision: fp16 # bf16, fp16 or fp32
eval_precision: fp16
sample_rate: 16000

# With data_parallel, batch_size is split into N jobs
batch_size: 6
test_batch_size: 1
grad_accumulation_factor: 2

# Decoding parameters
min_decode_ratio: 0.0
max_decode_ratio: 1.0
test_beam_size: 8

####################### Model Parameters #######################################
train_dataloader_opts:
    batch_size: !ref <batch_size>

valid_dataloader_opts:
    batch_size: !ref <batch_size>

test_dataloader_opts:
    batch_size: !ref <test_batch_size>

epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>

############################## Augmentations ###################################

# Speed perturbation
speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb
    orig_freq: 16000
    speeds: [95, 100, 105]

# Frequency drop: randomly drops a number of frequency bands to zero.
drop_freq: !new:speechbrain.augment.time_domain.DropFreq
    drop_freq_low: 0
    drop_freq_high: 1
    drop_freq_count_low: 1
    drop_freq_count_high: 3
    drop_freq_width: 0.05
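# A minimal Python sketch (not part of this recipe; shapes and values are
# assumptions) showing how the two augmentations above can be exercised
# standalone:
#
#     import torch
#     from speechbrain.augment.time_domain import SpeedPerturb, DropFreq
#
#     wavs = torch.randn(4, 16000)  # hypothetical [batch, time] batch at 16 kHz
#     perturb = SpeedPerturb(orig_freq=16000, speeds=[95, 100, 105])
#     dropper = DropFreq(drop_freq_count_low=1, drop_freq_count_high=3)
#     augmented = dropper(perturb(wavs))  # time dim may change with the sampled speed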
# Time drop: randomly drops a number of temporal chunks.
drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
    drop_length_low: 1000
    drop_length_high: 2000
    drop_count_low: 1
    drop_count_high: 5

# Augmenter: combines previously defined augmentations to perform data augmentation
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
    concat_original: False
    min_augmentations: 1
    max_augmentations: 3
    augment_prob: 0.5
    augmentations: [
        !ref <speed_perturb>,
        !ref <drop_freq>,
        !ref <drop_chunk>]

############################## Models ##########################################

whisper: !new:speechbrain.lobes.models.huggingface_transformers.whisper.Whisper
    source: !ref <whisper_hub>
    freeze: !ref <freeze_whisper>
    freeze_encoder: !ref <freeze_encoder>
    save_path: !ref <save_folder>/whisper_checkpoint
    language: !ref <language>
    task: "transcribe"

log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

nll_loss: !name:speechbrain.nnet.losses.nll_loss

modules:
    whisper: !ref <whisper>

############################## Decoding & optimiser ############################

whisper_opt_class: !name:torch.optim.AdamW
    lr: !ref <lr_whisper>
    weight_decay: !ref <weight_decay>

valid_search: !new:speechbrain.decoders.seq2seq.S2SWhisperGreedySearcher
    model: !ref <whisper>
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>

test_search: !new:speechbrain.decoders.seq2seq.S2SWhisperBeamSearcher
    module: [!ref <whisper>]
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>
    beam_size: !ref <test_beam_size>

lr_annealing_whisper: !new:speechbrain.nnet.schedulers.NoamScheduler
    lr_initial: !ref <lr_whisper>
    n_warmup_steps: !ref <warmup_steps>

############################## Logging and Pretrainer ##########################

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        whisper: !ref <whisper>
        scheduler_whisper: !ref <lr_annealing_whisper>
        counter: !ref <epoch_counter>

pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        whisper: !ref <whisper>
    paths:
        whisper: !ref <pretrained_path>/model.ckpt

train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats

cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
    split_tokens: True
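# A minimal Python sketch (assumed usage, not shipped with this file) of how a
# SpeechBrain script typically consumes these hyperparameters; the file name
# "hparams.yaml" is a placeholder:
#
#     from hyperpyyaml import load_hyperpyyaml
#
#     with open("hparams.yaml") as f:
#         hparams = load_hyperpyyaml(f)
#
#     # Fetch and load the pretrained checkpoint declared in `pretrainer` above.
#     hparams["pretrainer"].collect_files()
#     hparams["pretrainer"].load_collected()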