# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 2024
__set_seed: !apply:torch.manual_seed [!ref <seed>]
skip_training: True
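# If True, training is skipped and the recipe only runs evaluation, using the
# pretrained weights fetched by the pretrainer defined below.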
# Hparams NEEDED
HPARAMS_NEEDED: ["log_softmax"]
# Modules Needed
MODULES_NEEDED: ["whisper"]
output_folder: output_folder_whisper
pretrained_path: Macedonian-ASR/whisper-large-v3-macedonian-asr
output_wer_folder: !ref <output_folder>/
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt
# HuggingFace hub ID of the largest multilingual Whisper model.
whisper_hub: openai/whisper-large-v3
# Normalize transcripts with the same normalization used in the Whisper paper (https://cdn.openai.com/papers/whisper.pdf). Refer to Appendix C for further information.
normalized_transcripts: False
restore_capitalization: False
# Data files
language: "macedonian"
data_folder: "../../data/combined_data/speechbrain_splits"
accented_letters: True
ckpt_interval_minutes: 30 # save checkpoint every N min
####################### Training Parameters ####################################
freeze_whisper: False
freeze_encoder: True
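# With freeze_whisper: False and freeze_encoder: True, only the decoder is fine-tuned.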
number_of_epochs: 50
weight_decay: 0.01
lr_whisper: 1e-5
warmup_steps: 500
max_grad_norm: 2.0
precision: fp16 # bf16, fp16 or fp32
eval_precision: fp16
sample_rate: 16000
# With data_parallel, batch_size is split across N jobs.
batch_size: 6
test_batch_size: 1
grad_accumulation_factor: 2
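# Effective batch size per optimiser step: batch_size * grad_accumulation_factor = 12.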
# Decoding parameters
min_decode_ratio: 0.0
max_decode_ratio: 1.0
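# Output length is constrained to between min_decode_ratio and max_decode_ratio
# times the encoder output length.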
test_beam_size: 8
####################### Model Parameters #######################################
train_dataloader_opts:
    batch_size: !ref <batch_size>

valid_dataloader_opts:
    batch_size: !ref <batch_size>

test_dataloader_opts:
    batch_size: !ref <test_batch_size>

epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>

############################## Augmentations ###################################
# Speed perturbation
speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb
    orig_freq: 16000
    speeds: [95, 100, 105]
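    # Speeds are percentages of the original rate: 95 slows the audio, 105 speeds it up.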
# Frequency drop: randomly drops a number of frequency bands to zero.
drop_freq: !new:speechbrain.augment.time_domain.DropFreq
    drop_freq_low: 0
    drop_freq_high: 1
    drop_freq_count_low: 1
    drop_freq_count_high: 3
    drop_freq_width: 0.05
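    # Band centres are drawn from [drop_freq_low, drop_freq_high], a 0-1 range in
    # which 1 corresponds to the Nyquist frequency (8 kHz at a 16 kHz sample rate).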
# Time drop: randomly drops a number of temporal chunks.
drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
    drop_length_low: 1000
    drop_length_high: 2000
    drop_count_low: 1
    drop_count_high: 5
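    # Chunk lengths are in samples: 1000-2000 samples is roughly 62-125 ms at 16 kHz.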
# Augmenter: Combines previously defined augmentations to perform data augmentation
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
    concat_original: False
    min_augmentations: 1
    max_augmentations: 3
    augment_prob: 0.5
    augmentations: [
        !ref <speed_perturb>,
        !ref <drop_freq>,
        !ref <drop_chunk>]
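# Each training batch is augmented with probability 0.5, applying between one and
# three of the transforms above; originals are not kept (concat_original: False).
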
############################## Models ##########################################
whisper: !new:speechbrain.lobes.models.huggingface_transformers.whisper.Whisper
    source: !ref <whisper_hub>
    freeze: !ref <freeze_whisper>
    freeze_encoder: !ref <freeze_encoder>
    save_path: !ref <save_folder>/whisper_checkpoint
    language: !ref <language>
    task: "transcribe"
log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

nll_loss: !name:speechbrain.nnet.losses.nll_loss

modules:
    whisper: !ref <whisper>

############################## Decoding & optimiser ############################
whisper_opt_class: !name:torch.optim.AdamW
    lr: !ref <lr_whisper>
    weight_decay: !ref <weight_decay>

valid_search: !new:speechbrain.decoders.seq2seq.S2SWhisperGreedySearcher
    model: !ref <whisper>
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>

test_search: !new:speechbrain.decoders.seq2seq.S2SWhisperBeamSearcher
    module: [!ref <whisper>]
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>
    beam_size: !ref <test_beam_size>

lr_annealing_whisper: !new:speechbrain.nnet.schedulers.NoamScheduler
    lr_initial: !ref <lr_whisper>
    n_warmup_steps: !ref <warmup_steps>
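# Noam schedule: the learning rate ramps up linearly over n_warmup_steps and then
# decays with the inverse square root of the step count.
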
############################## Logging and Pretrainer ##########################
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        whisper: !ref <whisper>
        scheduler_whisper: !ref <lr_annealing_whisper>
        counter: !ref <epoch_counter>

pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        whisper: !ref <whisper>
    paths:
        whisper: !ref <pretrained_path>/model.ckpt

train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>
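# The pretrainer fetches model.ckpt from the HuggingFace repo in pretrained_path
# and loads it into the whisper module before training or evaluation starts.
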
error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats

cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
    split_tokens: True
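# split_tokens: True splits words into characters, so this instance reports
# character error rate (CER) instead of WER.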