# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 2024
__set_seed: !apply:torch.manual_seed [!ref <seed>]
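
# This is a HyperPyYAML hyperparameter file. A minimal sketch of how a
# SpeechBrain train/evaluation script typically loads it (the loader call is
# standard; the filename and override values below are assumed examples):
#
#     from hyperpyyaml import load_hyperpyyaml
#     with open("hparams/train_whisper.yaml") as f:   # filename assumed
#         hparams = load_hyperpyyaml(f, overrides={"skip_training": False})
#     whisper_model = hparams["whisper"]  # objects below are instantiated at load time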

skip_training: True

# Hparams NEEDED
HPARAMS_NEEDED: ["log_softmax"]
# Modules Needed
MODULES_NEEDED: ["whisper"]

output_folder: output_folder_whisper
pretrained_path: Macedonian-ASR/whisper-large-v3-macedonian-asr
output_wer_folder: !ref <output_folder>/
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt


# HuggingFace hub ID of the largest multilingual OpenAI Whisper model.
whisper_hub: openai/whisper-large-v3

# Normalize the transcripts with the same text normalization used in the Whisper paper (https://cdn.openai.com/papers/whisper.pdf). Refer to Appendix C for further information.
normalized_transcripts: False
restore_capitalization: False

# Data files
language: "macedonian"
data_folder: "../../data/combined_data/speechbrain_splits"
accented_letters: True

ckpt_interval_minutes: 30 # save checkpoint every N min

####################### Training Parameters ####################################
freeze_whisper: False
freeze_encoder: True
number_of_epochs: 50
weight_decay: 0.01
lr_whisper: 1e-5
warmup_steps: 500
max_grad_norm: 2.0
precision: fp16 # bf16, fp16 or fp32
eval_precision: fp16
sample_rate: 16000

# With data_parallel, batch_size is split into N jobs
batch_size: 6
test_batch_size: 1
grad_accumulation_factor: 2
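# Effective batch size per optimizer step = batch_size * grad_accumulation_factor
# = 6 * 2 = 12 (per data_parallel job).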


# Decoding parameters
min_decode_ratio: 0.0
max_decode_ratio: 1.0
test_beam_size: 8

####################### Model Parameters #######################################
train_dataloader_opts:
    batch_size: !ref <batch_size>

valid_dataloader_opts:
    batch_size: !ref <batch_size>

test_dataloader_opts:
    batch_size: !ref <test_batch_size>

epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>

############################## Augmentations ###################################

# Speed perturbation
speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb
   orig_freq: 16000
   speeds: [95, 100, 105]

# Frequency drop: randomly drops a number of frequency bands to zero.
drop_freq: !new:speechbrain.augment.time_domain.DropFreq
   drop_freq_low: 0
   drop_freq_high: 1
   drop_freq_count_low: 1
   drop_freq_count_high: 3
   drop_freq_width: 0.05

# Time drop: randomly drops a number of temporal chunks.
drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
   drop_length_low: 1000
   drop_length_high: 2000
   drop_count_low: 1
   drop_count_high: 5

# Augmenter: Combines previously defined augmentations to perform data augmentation
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
   concat_original: False
   min_augmentations: 1
   max_augmentations: 3
   augment_prob: 0.5
   augmentations: [
      !ref <speed_perturb>,
      !ref <drop_freq>,
      !ref <drop_chunk>]
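
# A rough sketch of how the augmenter is usually applied inside the Brain
# class's compute_forward, only during training (variable and attribute names
# are assumed, not defined in this file):
#
#     if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"):
#         wavs, wav_lens = self.hparams.wav_augment(wavs, wav_lens)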


############################## Models ##########################################


whisper: !new:speechbrain.lobes.models.huggingface_transformers.whisper.Whisper
    source: !ref <whisper_hub>
    freeze: !ref <freeze_whisper>
    freeze_encoder: !ref <freeze_encoder>
    save_path: !ref <save_folder>/whisper_checkpoint
    language: !ref <language>
    task: "transcribe"

log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

nll_loss: !name:speechbrain.nnet.losses.nll_loss
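
# Sketch of how these pieces typically combine into the training objective in
# compute_forward/compute_objectives (tensor names and the exact forward
# signature are assumed from common SpeechBrain Whisper recipes):
#
#     enc_out, logits, _ = self.modules.whisper(wavs, bos_tokens)
#     log_probs = self.hparams.log_softmax(logits)
#     loss = self.hparams.nll_loss(log_probs, tokens_eos, length=tokens_eos_lens)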

modules:
    whisper: !ref <whisper>

############################## Decoding & optimiser ############################

whisper_opt_class: !name:torch.optim.AdamW
    lr: !ref <lr_whisper>
    weight_decay: !ref <weight_decay>

valid_search: !new:speechbrain.decoders.seq2seq.S2SWhisperGreedySearcher
    model: !ref <whisper>
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>

test_search: !new:speechbrain.decoders.seq2seq.S2SWhisperBeamSearcher
    module: [!ref <whisper>]
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>
    beam_size: !ref <test_beam_size>
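
# During validation/test the searchers are typically run on the detached
# encoder output (sketch; variable names assumed; test_search is called the
# same way, with beam search instead of greedy decoding):
#
#     hyps, _, _, _ = self.hparams.valid_search(enc_out.detach(), wav_lens)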

lr_annealing_whisper: !new:speechbrain.nnet.schedulers.NoamScheduler
    lr_initial: !ref <lr_whisper>
    n_warmup_steps: !ref <warmup_steps>
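
# The Noam schedule is usually advanced once per optimizer step, e.g. in
# fit_batch (call site assumed): self.hparams.lr_annealing_whisper(self.optimizer)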

############################## Logging and Pretrainer ##########################

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        whisper: !ref <whisper>
        scheduler_whisper: !ref <lr_annealing_whisper>
        counter: !ref <epoch_counter>

pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        whisper: !ref <whisper>
    paths:
        whisper: !ref <pretrained_path>/model.ckpt
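
# The pretrainer is typically run before evaluation to fetch and load the
# fine-tuned checkpoint referenced by <pretrained_path> (sketch; call site assumed):
#
#     from speechbrain.utils.distributed import run_on_main
#     run_on_main(hparams["pretrainer"].collect_files)
#     hparams["pretrainer"].load_collected()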


train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats

cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
    split_tokens: True
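
# These !name: entries are factories, not instances: the train script typically
# creates fresh metric trackers at the start of each evaluation stage (sketch):
#
#     self.wer_metric = self.hparams.error_rate_computer()
#     self.cer_metric = self.hparams.cer_computer()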