sangeet2020 committed on
Commit
11eb99c
1 Parent(s): 030551b

Create hyperparams.yaml

Browse files
Files changed (1) hide show
  1. hyperparams.yaml +157 -0
hyperparams.yaml ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ############################################################################
2
+ # Model: E2E ASR with attention-based ASR
3
+ # Encoder: CRDNN model
4
+ # Decoder: GRU + beamsearch + RNNLM
5
+ # Tokens: BPE with unigram
6
+ # Authors: Sangeet Sagar 2022
7
+ # ############################################################################
8
+
9
+
10
+ # Feature parameters
11
+ sample_rate: 16000
12
+ n_fft: 400
13
+ n_mels: 40
14
+
15
+ # Model parameters
16
+ activation: !name:torch.nn.LeakyReLU
17
+ dropout: 0.15
18
+ cnn_blocks: 2
19
+ cnn_channels: (128, 256)
20
+ inter_layer_pooling_size: (2, 2)
21
+ cnn_kernelsize: (3, 3)
22
+ time_pooling_size: 4
23
+ rnn_class: !name:speechbrain.nnet.RNN.LSTM
24
+ rnn_layers: 4
25
+ rnn_neurons: 1024
26
+ rnn_bidirectional: True
27
+ dnn_blocks: 2
28
+ dnn_neurons: 512
29
+ emb_size: 128
30
+ dec_neurons: 1024
31
+ output_neurons: 1000 # index(blank/eos/bos) = 0
32
+ blank_index: 0
33
+
34
+ # Decoding parameters
35
+ bos_index: 0
36
+ eos_index: 0
37
+ min_decode_ratio: 0.0
38
+ max_decode_ratio: 1.0
39
+ beam_size: 80
40
+ eos_threshold: 1.5
41
+ using_max_attn_shift: True
42
+ max_attn_shift: 240
43
+ lm_weight: 0.50
44
+ coverage_penalty: 1.5
45
+ temperature: 1.25
46
+ temperature_lm: 1.25
47
+
48
+ normalizer: !new:speechbrain.processing.features.InputNormalization
49
+ norm_type: global
50
+
51
+ compute_features: !new:speechbrain.lobes.features.Fbank
52
+ sample_rate: !ref <sample_rate>
53
+ n_fft: !ref <n_fft>
54
+ n_mels: !ref <n_mels>
55
+
56
+ enc: !new:speechbrain.lobes.models.CRDNN.CRDNN
57
+ input_shape: [null, null, !ref <n_mels>]
58
+ activation: !ref <activation>
59
+ dropout: !ref <dropout>
60
+ cnn_blocks: !ref <cnn_blocks>
61
+ cnn_channels: !ref <cnn_channels>
62
+ cnn_kernelsize: !ref <cnn_kernelsize>
63
+ inter_layer_pooling_size: !ref <inter_layer_pooling_size>
64
+ time_pooling: True
65
+ using_2d_pooling: False
66
+ time_pooling_size: !ref <time_pooling_size>
67
+ rnn_class: !ref <rnn_class>
68
+ rnn_layers: !ref <rnn_layers>
69
+ rnn_neurons: !ref <rnn_neurons>
70
+ rnn_bidirectional: !ref <rnn_bidirectional>
71
+ rnn_re_init: True
72
+ dnn_blocks: !ref <dnn_blocks>
73
+ dnn_neurons: !ref <dnn_neurons>
74
+
75
+ emb: !new:speechbrain.nnet.embedding.Embedding
76
+ num_embeddings: !ref <output_neurons>
77
+ embedding_dim: !ref <emb_size>
78
+
79
+ dec: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
80
+ enc_dim: !ref <dnn_neurons>
81
+ input_size: !ref <emb_size>
82
+ rnn_type: gru
83
+ attn_type: location
84
+ hidden_size: !ref <dec_neurons>
85
+ attn_dim: 1024
86
+ num_layers: 1
87
+ scaling: 1.0
88
+ channels: 10
89
+ kernel_size: 100
90
+ re_init: True
91
+ dropout: !ref <dropout>
92
+
93
+ ctc_lin: !new:speechbrain.nnet.linear.Linear
94
+ input_size: !ref <dnn_neurons>
95
+ n_neurons: !ref <output_neurons>
96
+
97
+ seq_lin: !new:speechbrain.nnet.linear.Linear
98
+ input_size: !ref <dec_neurons>
99
+ n_neurons: !ref <output_neurons>
100
+
101
+ log_softmax: !new:speechbrain.nnet.activations.Softmax
102
+ apply_log: True
103
+
104
+ lm_model: !new:speechbrain.lobes.models.RNNLM.RNNLM
105
+ output_neurons: !ref <output_neurons>
106
+ embedding_dim: !ref <emb_size>
107
+ activation: !name:torch.nn.LeakyReLU
108
+ dropout: 0.0
109
+ rnn_layers: 2
110
+ rnn_neurons: 2048
111
+ dnn_blocks: 1
112
+ dnn_neurons: 512
113
+ return_hidden: True # For inference
114
+
115
+ tokenizer: !new:sentencepiece.SentencePieceProcessor
116
+
117
+ asr_model: !new:torch.nn.ModuleList
118
+ - [!ref <enc>, !ref <emb>, !ref <dec>, !ref <ctc_lin>, !ref <seq_lin>]
119
+
120
+ # We compose the inference (encoder) pipeline.
121
+ encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
122
+ input_shape: [null, null, !ref <n_mels>]
123
+ compute_features: !ref <compute_features>
124
+ normalize: !ref <normalizer>
125
+ model: !ref <enc>
126
+
127
+ decoder: !new:speechbrain.decoders.S2SRNNBeamSearchLM
128
+ embedding: !ref <emb>
129
+ decoder: !ref <dec>
130
+ linear: !ref <seq_lin>
131
+ language_model: !ref <lm_model>
132
+ bos_index: !ref <bos_index>
133
+ eos_index: !ref <eos_index>
134
+ min_decode_ratio: !ref <min_decode_ratio>
135
+ max_decode_ratio: !ref <max_decode_ratio>
136
+ beam_size: !ref <beam_size>
137
+ eos_threshold: !ref <eos_threshold>
138
+ using_max_attn_shift: !ref <using_max_attn_shift>
139
+ max_attn_shift: !ref <max_attn_shift>
140
+ coverage_penalty: !ref <coverage_penalty>
141
+ lm_weight: !ref <lm_weight>
142
+ temperature: !ref <temperature>
143
+ temperature_lm: !ref <temperature_lm>
144
+
145
+
146
+ modules:
147
+ normalizer: !ref <normalizer>
148
+ encoder: !ref <encoder>
149
+ decoder: !ref <decoder>
150
+ lm_model: !ref <lm_model>
151
+
152
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
153
+ loadables:
154
+ normalizer: !ref <normalizer>
155
+ asr: !ref <asr_model>
156
+ lm: !ref <lm_model>
157
+ tokenizer: !ref <tokenizer>