jfreiwa committed on
Commit
ca4cef6
·
1 Parent(s): 06c56c5

Upload hyperparams.yaml

Browse files
Files changed (1) hide show
  1. hyperparams.yaml +170 -0
hyperparams.yaml ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ############################################################################
2
+ # model: Seq2Seq
3
+ # encoder: CRDNN model
4
+ # decoder: GRU + beamsearch
5
+ # tokens: BPE (unigram)
6
+ # losses: CTC+NLL
7
+ # training: Mozilla Common Voice 6.1, Spoken Wikipedia Corpus, M-AILABS Corpus
8
+ # authors: Ruhr-University Bochum 2021
9
+ # adapted from
10
+ # Ju-Chieh Chou,
11
+ # Mirco Ravanelli,
12
+ # Abdel Heba,
13
+ # Peter Plantinga,
14
+ # Samuele Cornell,
15
+ # Sung-Lin Yeh,
16
+ # Titouan Parcollet 2021
17
+ # ############################################################################
18
+
19
+ # set exp name
20
+ name: german-asr
21
+
22
+ # Feature parameters
23
+ sample_rate: 16000
24
+ n_fft: 400
25
+ n_mels: 40
26
+
27
+ # Model parameters
28
+ activation: !name:torch.nn.LeakyReLU
29
+ dropout: 0.15
30
+ cnn_blocks: 2
31
+ cnn_channels: (64, 128)
32
+ inter_layer_pooling_size: (2, 2)
33
+ cnn_kernelsize: (3, 3)
34
+ time_pooling_size: 4
35
+ rnn_class: !name:speechbrain.nnet.RNN.LSTM
36
+ rnn_layers: 4
37
+ rnn_neurons: 1024
38
+ rnn_bidirectional: True
39
+ dnn_blocks: 1
40
+ dnn_neurons: 1024
41
+ emb_size: 1024
42
+ dec_neurons: 1024
43
+ output_neurons: 5000 # Number of tokens (same as LM and tokenizer)
44
+
45
+
46
+ # Decoding parameters
47
+ blank_index: 0
48
+ pad_index: -1
49
+ bos_index: 1
50
+ eos_index: 2
51
+ unk_index: 0
52
+
53
+ min_decode_ratio: 0.0
54
+ max_decode_ratio: 1.0
55
+ beam_size: 30
56
+ eos_threshold: 1.5
57
+ using_max_attn_shift: True
58
+ max_attn_shift: 300
59
+ ctc_weight_decode: 0.3
60
+ ctc_window_size: 300
61
+ coverage_penalty: 1.5
62
+ temperature: 1.0
63
+
64
+
65
+ # Feature Extraction
66
+ normalizer: !new:speechbrain.processing.features.InputNormalization
67
+     norm_type: global
68
+
69
+ compute_features: !new:speechbrain.lobes.features.Fbank
70
+     sample_rate: !ref <sample_rate>
71
+     n_fft: !ref <n_fft>
72
+     n_mels: !ref <n_mels>
73
+
74
+ # Tokenizer
75
+ tokenizer: !new:sentencepiece.SentencePieceProcessor
76
+
77
+ # Encoder
78
+ enc: !new:speechbrain.lobes.models.CRDNN.CRDNN
79
+     input_shape: [null, null, !ref <n_mels>]
80
+     activation: !ref <activation>
81
+     dropout: !ref <dropout>
82
+     cnn_blocks: !ref <cnn_blocks>
83
+     cnn_channels: !ref <cnn_channels>
84
+     cnn_kernelsize: !ref <cnn_kernelsize>
85
+     inter_layer_pooling_size: !ref <inter_layer_pooling_size>
86
+     time_pooling: True
87
+     using_2d_pooling: False
88
+     time_pooling_size: !ref <time_pooling_size>
89
+     rnn_class: !ref <rnn_class>
90
+     rnn_layers: !ref <rnn_layers>
91
+     rnn_neurons: !ref <rnn_neurons>
92
+     rnn_bidirectional: !ref <rnn_bidirectional>
93
+     rnn_re_init: True
94
+     dnn_blocks: !ref <dnn_blocks>
95
+     dnn_neurons: !ref <dnn_neurons>
96
+     use_rnnp: True
97
+
98
+ # Decoder
99
+ emb: !new:speechbrain.nnet.embedding.Embedding
100
+     num_embeddings: !ref <output_neurons>
101
+     embedding_dim: !ref <emb_size>
102
+
103
+ dec: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
104
+     enc_dim: !ref <dnn_neurons>
105
+     input_size: !ref <emb_size>
106
+     rnn_type: gru
107
+     attn_type: location
108
+     hidden_size: !ref <dec_neurons>
109
+     attn_dim: 1024
110
+     num_layers: 1
111
+     scaling: 1.0
112
+     channels: 10
113
+     kernel_size: 100
114
+     re_init: True
115
+     dropout: !ref <dropout>
116
+
117
+
118
+ # Losses
119
+ log_softmax: !new:speechbrain.nnet.activations.Softmax
120
+     apply_log: True
121
+
122
+ ctc_lin: !new:speechbrain.nnet.linear.Linear
123
+     input_size: !ref <dnn_neurons>
124
+     n_neurons: !ref <output_neurons>
125
+
126
+ seq_lin: !new:speechbrain.nnet.linear.Linear
127
+     input_size: !ref <dec_neurons>
128
+     n_neurons: !ref <output_neurons>
129
+
130
+
131
+ # Compile model
132
+ asr_model: !new:torch.nn.ModuleList
133
+     - [!ref <enc>, !ref <emb>, !ref <dec>, !ref <ctc_lin>, !ref <seq_lin>]
134
+
135
+ # We compose the inference (encoder) pipeline.
136
+ encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
137
+     input_shape: [null, null, !ref <n_mels>]
138
+     compute_features: !ref <compute_features>
139
+     normalize: !ref <normalizer>
140
+     model: !ref <enc>
141
+
142
+ # Beam searcher
143
+ decoder: !new:speechbrain.decoders.S2SRNNBeamSearcher
144
+     embedding: !ref <emb>
145
+     decoder: !ref <dec>
146
+     linear: !ref <seq_lin>
147
+     bos_index: !ref <bos_index>
148
+     eos_index: !ref <eos_index>
149
+     min_decode_ratio: !ref <min_decode_ratio>
150
+     max_decode_ratio: !ref <max_decode_ratio>
151
+     beam_size: !ref <beam_size>
152
+     eos_threshold: !ref <eos_threshold>
153
+     using_max_attn_shift: !ref <using_max_attn_shift>
154
+     max_attn_shift: !ref <max_attn_shift>
155
+     temperature: !ref <temperature>
156
+
157
+ modules:
158
+     normalizer: !ref <normalizer>
159
+     encoder: !ref <encoder>
160
+     decoder: !ref <decoder>
161
+
162
+ # Load pretrained models
163
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
164
+     loadables:
165
+         asr: !ref <asr_model>
166
+         tokenizer: !ref <tokenizer>
167
+         normalizer: !ref <normalizer>
168
+
169
+
170
+