Andreas Nautsch committed on
Commit
e25584b
1 Parent(s): 30255f3

Upload hyperparams.yaml

Browse files
Files changed (1) hide show
  1. hyperparams.yaml +120 -0
hyperparams.yaml ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ################################
2
+ # Model: VGG2 + LSTM + time pooling
3
+ # Augmentation: SpecAugment
4
+ # Authors: Titouan Parcollet, Mirco Ravanelli, Peter Plantinga, Ju-Chieh Chou,
5
+ # and Abdel HEBA 2020
6
+ # ################################
7
+ # Feature parameters (FBANKS etc)
8
+ sample_rate: 16000
9
+ n_fft: 400
10
+ n_mels: 80
11
+ # Model parameters
12
+ activation: !name:torch.nn.LeakyReLU
13
+ dropout: 0.15
14
+ cnn_blocks: 3
15
+ cnn_channels: (128, 200, 256)
16
+ inter_layer_pooling_size: (2, 2, 2)
17
+ cnn_kernelsize: (3, 3)
18
+ time_pooling_size: 4
19
+ rnn_class: !name:speechbrain.nnet.RNN.LSTM
20
+ rnn_layers: 5
21
+ rnn_neurons: 1024
22
+ rnn_bidirectional: True
23
+ dnn_blocks: 2
24
+ dnn_neurons: 1024
25
+ emb_size: 128
26
+ dec_neurons: 1024
27
+ # Outputs
28
+ output_neurons: 500 # BPE size, index(blank/eos/bos) = 0
29
+ # Decoding parameters
30
+ # Make sure the bos and eos indices match those used by the BPE tokenizer
31
+ blank_index: 0
32
+ bos_index: 0
33
+ eos_index: 0
34
+ min_decode_ratio: 0.0
35
+ max_decode_ratio: 1.0
36
+ beam_size: 80
37
+ eos_threshold: 1.5
38
+ using_max_attn_shift: True
39
+ max_attn_shift: 140
40
+ ctc_weight_decode: 0.0
41
+ temperature: 1.50
42
+ normalizer: !new:speechbrain.processing.features.InputNormalization
43
+ norm_type: global
44
+ compute_features: !new:speechbrain.lobes.features.Fbank
45
+ sample_rate: !ref <sample_rate>
46
+ n_fft: !ref <n_fft>
47
+ n_mels: !ref <n_mels>
48
+ enc: !new:speechbrain.lobes.models.CRDNN.CRDNN
49
+ input_shape: [null, null, !ref <n_mels>]
50
+ activation: !ref <activation>
51
+ dropout: !ref <dropout>
52
+ cnn_blocks: !ref <cnn_blocks>
53
+ cnn_channels: !ref <cnn_channels>
54
+ cnn_kernelsize: !ref <cnn_kernelsize>
55
+ inter_layer_pooling_size: !ref <inter_layer_pooling_size>
56
+ time_pooling: True
57
+ using_2d_pooling: False
58
+ time_pooling_size: !ref <time_pooling_size>
59
+ rnn_class: !ref <rnn_class>
60
+ rnn_layers: !ref <rnn_layers>
61
+ rnn_neurons: !ref <rnn_neurons>
62
+ rnn_bidirectional: !ref <rnn_bidirectional>
63
+ rnn_re_init: True
64
+ dnn_blocks: !ref <dnn_blocks>
65
+ dnn_neurons: !ref <dnn_neurons>
66
+ emb: !new:speechbrain.nnet.embedding.Embedding
67
+ num_embeddings: !ref <output_neurons>
68
+ embedding_dim: !ref <emb_size>
69
+ dec: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
70
+ enc_dim: !ref <dnn_neurons>
71
+ input_size: !ref <emb_size>
72
+ rnn_type: gru
73
+ attn_type: location
74
+ hidden_size: 1024
75
+ attn_dim: 1024
76
+ num_layers: 1
77
+ scaling: 1.0
78
+ channels: 10
79
+ kernel_size: 100
80
+ re_init: True
81
+ dropout: !ref <dropout>
82
+ ctc_lin: !new:speechbrain.nnet.linear.Linear
83
+ input_size: !ref <dnn_neurons>
84
+ n_neurons: !ref <output_neurons>
85
+ seq_lin: !new:speechbrain.nnet.linear.Linear
86
+ input_size: !ref <dec_neurons>
87
+ n_neurons: !ref <output_neurons>
88
+ log_softmax: !new:speechbrain.nnet.activations.Softmax
89
+ apply_log: True
90
+ asr_model: !new:torch.nn.ModuleList
91
+ - [!ref <enc>, !ref <emb>, !ref <dec>, !ref <ctc_lin>, !ref <seq_lin>]
92
+ tokenizer: !new:sentencepiece.SentencePieceProcessor
93
+ # We compose the inference (encoder) pipeline.
94
+ encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
95
+ input_shape: [null, null, !ref <n_mels>]
96
+ compute_features: !ref <compute_features>
97
+ normalize: !ref <normalizer>
98
+ model: !ref <enc>
99
+ decoder: !new:speechbrain.decoders.S2SRNNBeamSearcher
100
+ embedding: !ref <emb>
101
+ decoder: !ref <dec>
102
+ linear: !ref <seq_lin>
103
+ bos_index: !ref <bos_index>
104
+ eos_index: !ref <eos_index>
105
+ min_decode_ratio: !ref <min_decode_ratio>
106
+ max_decode_ratio: !ref <max_decode_ratio>
107
+ beam_size: !ref <beam_size>
108
+ eos_threshold: !ref <eos_threshold>
109
+ using_max_attn_shift: !ref <using_max_attn_shift>
110
+ max_attn_shift: !ref <max_attn_shift>
111
+ temperature: !ref <temperature>
112
+ modules:
113
+ normalizer: !ref <normalizer>
114
+ encoder: !ref <encoder>
115
+ decoder: !ref <decoder>
116
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
117
+ loadables:
118
+ normalizer: !ref <normalizer>
119
+ asr: !ref <asr_model>
120
+ tokenizer: !ref <tokenizer>