File size: 4,720 Bytes
e1faefa
 
 
1d790af
23ab61f
e1faefa
4bad4f3
369b729
2835954
 
4bad4f3
 
23ab61f
 
 
 
4ed44b9
23ab61f
 
 
 
 
9c63cd5
23ab61f
03d0e2a
 
23ab61f
 
 
 
 
 
2b61326
23ab61f
 
f00a2c7
2b61326
 
23ab61f
1d790af
23ab61f
 
19dfde0
1d790af
 
 
 
 
 
 
 
 
f56e17a
1d790af
f56e17a
 
1d790af
23ab61f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ce7435b
 
23ab61f
 
 
 
 
 
 
1d790af
23ab61f
 
 
 
 
e305a79
 
 
1d790af
 
 
 
e305a79
 
 
 
 
1d790af
 
 
 
 
 
 
23ab61f
 
 
 
 
 
 
 
 
 
 
 
 
17fcbe5
23ab61f
 
 
 
e1faefa
 
 
1d790af
e1faefa
 
9b56dc9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# Names of the hyperparameter keys that must be defined in this file.
# Presumably checked by the inference interface that loads it - TODO confirm.
HPARAMS_NEEDED:
    - "wav2vec_output_dim"
    - "emb_size"
    - "dec_neurons"
    - "dec_layers"
    - "output_neurons"
    - "log_softmax"
    - "tokenizer"
# Names of the entries expected under `modules`.
# NOTE(review): `decoder` is registered under `modules` but is not listed
# here - confirm that omission is intentional.
MODULES_NEEDED:
    - "encoder_w2v2"
    - "embedding"
    - "ctc_lin"
    - "seq_lin"
    - "lm_model"

# Pretrain folder (HuggingFace)
# `output_folder` is the root from which `save_folder` and `wav2vec2_folder`
# are derived through !ref interpolation.
# NOTE(review): the value below contains no <...> reference, so the !ref tag
# looks redundant here - confirm before removing it.
output_folder: !ref output_folder_seq2seq_cv_podcast_arhiv_augmentation
# Base location of the checkpoints listed in the `pretrainer` block
# (model.ckpt and lm.ckpt are resolved relative to it).
pretrained_path: Macedonian-ASR/wav2vec2-aed-macedonian-asr
# Identifier handed to the wav2vec2 encoder as its `source`. The commented-out
# line keeps an alternative identifier for reference.
# wav2vec2_hub: facebook/wav2vec2-large-xlsr-53
wav2vec2_hub: jonatasgrosman/wav2vec2-large-xlsr-53-russian
save_folder: !ref <output_folder>/save
# Handed to the wav2vec2 encoder as its `save_path`.
wav2vec2_folder: !ref <save_folder>/wav2vec2_checkpoint

####################### Training Parameters ####################################
# (No training parameters are defined in this file; the section banner is
# kept as-is.)

####################### Model Parameters #######################################
# Dropout applied inside the attentional decoder.
dropout: 0.15
# Encoder output width: used as the decoder's `enc_dim` and as the input size
# of the CTC projection.
wav2vec_output_dim: 1024
# Token embedding width: shared by the ASR embedding, the decoder input and
# the language model's embedding.
emb_size: 128
# Decoder hidden size; also the input size of the seq2seq output projection.
dec_neurons: 1024
dec_layers: 1

# Vocabulary size: shared by the embedding, both output projections, the
# language model and the coverage scorer.
output_neurons: 1000
# NOTE(review): `blank_index` and `unk_index` share the value 0, and neither
# is referenced elsewhere in this file - confirm against the tokenizer.
blank_index: 0
# `bos_index` / `eos_index` are passed to the beam searcher.
bos_index: 1
eos_index: 2
unk_index: 0

# Decoding parameters
# Unless noted otherwise, the values below are consumed by `test_search`.
min_decode_ratio: 0.0
max_decode_ratio: 1.0
# NOTE(review): `valid_beam_size` is not referenced anywhere in this file.
valid_beam_size: 10
# Passed to `test_search` as `beam_size`.
test_beam_size: 20
using_eos_threshold: True
eos_threshold: 1.5
using_max_attn_shift: False
max_attn_shift: 700
length_normalization: True
# Temperature passed to the beam searcher.
temperature: 1.0
# Temperature passed to the RNN language-model scorer.
temperature_lm: 1.4
# Scoring parameters
# Weight given to the `coverage` scorer in both scorer builders.
coverage_penalty: 1.5
# Weight given to the `rnnlm` scorer in `scorer_lm`.
lm_weight: 0.2


# Recurrent language model, as described in the HuggingFace repository.
# NB: the architecture below must match the pre-trained RNNLM checkpoint
# (the `lm` entry of the pretrainer), otherwise its weights cannot be loaded.
lm_model: !new:speechbrain.lobes.models.RNNLM.RNNLM
    output_neurons: !ref <output_neurons>
    embedding_dim: !ref <emb_size>
    activation: !name:torch.nn.LeakyReLU
    dropout: 0.0
    rnn_layers: 3
    rnn_neurons: 2048
    dnn_blocks: 2
    dnn_neurons: 1024
    return_hidden: True  # Needed at inference time

# Wav2vec2 encoder
# Built from the identifier in `wav2vec2_hub`; `save_path` points at
# `wav2vec2_folder`.
# NOTE(review): `freeze: False` with `freeze_feature_extractor: True` reads as
# a fine-tuning setup carried over from training - confirm it is still wanted
# for an inference-only configuration.
encoder_w2v2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2
    source: !ref <wav2vec2_hub>
    output_norm: True
    freeze: False
    freeze_feature_extractor: True
    save_path: !ref <wav2vec2_folder>
    output_all_hiddens: False

# Token embedding table feeding the attentional decoder: one row per
# vocabulary entry, `emb_size` columns.
embedding: !new:speechbrain.nnet.embedding.Embedding
    num_embeddings: !ref <output_neurons>
    embedding_dim: !ref <emb_size>

# Attention-based RNN decoder.
# A GRU decoder with location attention. It reads encoder states of width
# `wav2vec_output_dim` and token embeddings of width `emb_size`.
decoder: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
    enc_dim: !ref <wav2vec_output_dim>
    input_size: !ref <emb_size>
    rnn_type: gru
    attn_type: location
    hidden_size: !ref <dec_neurons>
    attn_dim: 512
    num_layers: !ref <dec_layers>
    scaling: 1.0
    # `channels` / `kernel_size` presumably configure the location-aware
    # attention - see the SpeechBrain AttentionalRNNDecoder docs to confirm.
    channels: 10
    kernel_size: 100
    re_init: True
    dropout: !ref <dropout>

# CTC head: projects encoder outputs onto the vocabulary.
ctc_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <wav2vec_output_dim>
    n_neurons: !ref <output_neurons>

# Seq2seq head: projects decoder states onto the vocabulary.
seq_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <dec_neurons>
    n_neurons: !ref <output_neurons>

# Softmax configured with `apply_log: True`.
log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

# SentencePiece tokenizer built directly from a model file.
# NOTE(review): `model_file` is a relative path, so it presumably resolves
# against the process working directory - confirm the file is present there.
tokenizer: !new:sentencepiece.SentencePieceProcessor
    model_file: 1000_unigram.model

# Name -> module mapping (the language model is included here).
modules:
    encoder_w2v2: !ref <encoder_w2v2>
    embedding: !ref <embedding>
    decoder: !ref <decoder>
    ctc_lin: !ref <ctc_lin>
    seq_lin: !ref <seq_lin>
    lm_model: !ref <lm_model>

# ASR modules grouped into one ModuleList, which the pretrainer loads as a
# single `model` checkpoint. The language model is deliberately not part of
# this list; it has its own `lm` checkpoint.
model: !new:torch.nn.ModuleList
    - [!ref <encoder_w2v2>,
       !ref <embedding>,
       !ref <decoder>,
       !ref <ctc_lin>,
       !ref <seq_lin>]

############################## Decoding scorers ################################
# Coverage scorer over the full vocabulary.
coverage_scorer: !new:speechbrain.decoders.scorer.CoverageScorer
    vocab_size: !ref <output_neurons>

# Scorer that wraps the RNN language model, with its own temperature.
rnnlm_scorer: !new:speechbrain.decoders.scorer.RNNLMScorer
    language_model: !ref <lm_model>
    temperature: !ref <temperature_lm>

# Coverage-only scoring.
scorer: !new:speechbrain.decoders.scorer.ScorerBuilder
    full_scorers: [!ref <coverage_scorer>]
    weights:
        coverage: !ref <coverage_penalty>

# Language-model + coverage scoring.
scorer_lm: !new:speechbrain.decoders.scorer.ScorerBuilder
    full_scorers: [!ref <rnnlm_scorer>,
                   !ref <coverage_scorer>]
    weights:
        rnnlm: !ref <lm_weight>
        coverage: !ref <coverage_penalty>

# Beam search over the attentional decoder, used for test/inference decoding.
#
# Every decoding hyperparameter declared in the "Decoding parameters" section
# that the searcher accepts is wired in explicitly. `using_eos_threshold` and
# `length_normalization` were previously declared but never passed, so editing
# them had no effect. Both are currently True, which per the SpeechBrain
# S2SBeamSearcher documentation equals the library default, so decoding output
# is unchanged by wiring them in.
test_search: !new:speechbrain.decoders.S2SRNNBeamSearcher
    embedding: !ref <embedding>
    decoder: !ref <decoder>
    linear: !ref <seq_lin>
    bos_index: !ref <bos_index>
    eos_index: !ref <eos_index>
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>
    beam_size: !ref <test_beam_size>
    using_eos_threshold: !ref <using_eos_threshold>
    eos_threshold: !ref <eos_threshold>
    length_normalization: !ref <length_normalization>
    using_max_attn_shift: !ref <using_max_attn_shift>
    max_attn_shift: !ref <max_attn_shift>
    temperature: !ref <temperature>
    # Coverage-only scorer; swap in <scorer_lm> to add language-model fusion.
    scorer: !ref <scorer>


############################## Logging and Pretrainer ##########################

# Maps each loadable object to the checkpoint file that provides its weights.
# `model` is the ModuleList of ASR modules; `lm` is the RNN language model.
# Both checkpoint paths are resolved relative to `pretrained_path`.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        model: !ref <model>
        lm: !ref <lm_model>
    paths:
        model: !ref <pretrained_path>/model.ckpt
        lm: !ref <pretrained_path>/lm.ckpt