Gastron committed on
Commit
4c24ddc
1 Parent(s): e3f6bdc

Initial Upload

Browse files
Files changed (6) hide show
  1. README.md +24 -0
  2. config.json +4 -0
  3. hyperparams.yaml +166 -0
  4. model.ckpt +3 -0
  5. normalizer.ckpt +3 -0
  6. tokenizer.ckpt +3 -0
README.md ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: "fi"
3
+ thumbnail:
4
+ tags:
5
+ - automatic-speech-recognition
6
+ - Attention
7
+ - pytorch
8
+ - speechbrain
9
+
10
+ metrics:
11
+ - wer
12
+ - cer
13
+ ---
14
+
15
+ # Description
16
+ Attention-based Encoder-Decoder model trained on Puhelahjat (1500h of colloquial Finnish donated by a huge number of volunteers) and the Finnish Parliament ASR Corpus (3000h of speech from the sessions of the Finnish Parliament).
17
+ The encoder is a CRDNN (Conv+LSTM+DNN) and the decoder is a GRU.
18
+
19
+ # Performance expectations
20
+ This is a relatively fast and compact model (~40M parameters); its performance is not state-of-the-art.
21
+ This does not include a language model; the model is fully end-to-end.
22
+
23
+ This model should generalize to many types of speech. However, the model will also try to match colloquial speech (unlike some models which have learned to follow the written forms of Finnish).
24
+
config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "speechbrain_interface": "EncoderDecoderASR"
3
+ }
4
+
hyperparams.yaml ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ############################################################################
2
+ # Model: End-to-end (E2E) attention-based ASR
3
+ # Training data: All Finnish
4
+ # Encoder: CRDNN
5
+ # Decoder: GRU + beamsearch
6
+ # Authors: Aku Rouhe 2022
7
+ # ############################################################################
8
+
9
+ num_units: 5000
10
+ tokenizer: !new:sentencepiece.SentencePieceProcessor
11
+
12
+ # Feature parameters
13
+ sample_rate: 16000
14
+ n_fft: 400
15
+ n_mels: 40
16
+
17
+ # Model parameters
18
+ activation: !name:torch.nn.LeakyReLU
19
+ dropout: 0.15
20
+ cnn_blocks: 2
21
+ cnn_channels: (64, 128)
22
+ inter_layer_pooling_size: (2, 2)
23
+ cnn_kernelsize: (3, 3)
24
+ time_pooling_size: 4
25
+ rnn_class: !name:speechbrain.nnet.RNN.LSTM
26
+ rnn_layers: 3
27
+ rnn_neurons: 512
28
+ rnn_bidirectional: True
29
+ dnn_blocks: 1
30
+ dnn_neurons: 512
31
+ emb_size: 128
32
+ dec_neurons: 1024
33
+ dec_layers: 1
34
+ output_neurons: !ref <num_units>
35
+
36
+ unk_index: 1
37
+ blank_index: 0
38
+ pad_index: 0
39
+ bos_index: 1
40
+ eos_index: 2
41
+
42
+ min_decode_ratio: 0.0
43
+ max_decode_ratio: 1.0
44
+ valid_beam_size: 4
45
+ test_beam_size: 8
46
+ eos_threshold: 1.2
47
+ using_max_attn_shift: False
48
+ max_attn_shift: 240
49
+ ctc_weight_decode: 0.0
50
+ coverage_penalty: 3.0
51
+ temperature: 1.5
52
+
53
+ # Feature extraction
54
+ compute_features: !new:speechbrain.lobes.features.Fbank
55
+ sample_rate: !ref <sample_rate>
56
+ n_fft: !ref <n_fft>
57
+ n_mels: !ref <n_mels>
58
+
59
+ # Feature normalization (mean and std)
60
+ normalize: !new:speechbrain.processing.features.InputNormalization
61
+ norm_type: global
62
+ update_until_epoch: -1
63
+
64
+ # The CRDNN model is an encoder that combines CNNs, RNNs, and DNNs.
65
+ encoder: !new:speechbrain.lobes.models.CRDNN.CRDNN
66
+ input_shape: [null, null, !ref <n_mels>]
67
+ activation: !ref <activation>
68
+ dropout: !ref <dropout>
69
+ cnn_blocks: !ref <cnn_blocks>
70
+ cnn_channels: !ref <cnn_channels>
71
+ cnn_kernelsize: !ref <cnn_kernelsize>
72
+ inter_layer_pooling_size: !ref <inter_layer_pooling_size>
73
+ time_pooling: True
74
+ using_2d_pooling: False
75
+ time_pooling_size: !ref <time_pooling_size>
76
+ rnn_class: !ref <rnn_class>
77
+ rnn_layers: !ref <rnn_layers>
78
+ rnn_neurons: !ref <rnn_neurons>
79
+ rnn_bidirectional: !ref <rnn_bidirectional>
80
+ rnn_re_init: True
81
+ dnn_blocks: !ref <dnn_blocks>
82
+ dnn_neurons: !ref <dnn_neurons>
83
+ use_rnnp: False
84
+
85
+ # Embedding (from indexes to an embedding space of dimension emb_size).
86
+ embedding: !new:speechbrain.nnet.embedding.Embedding
87
+ num_embeddings: !ref <output_neurons>
88
+ embedding_dim: !ref <emb_size>
89
+
90
+ # Attention-based RNN decoder.
91
+ decoder: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
92
+ enc_dim: !ref <dnn_neurons>
93
+ input_size: !ref <emb_size>
94
+ rnn_type: gru
95
+ attn_type: location
96
+ hidden_size: !ref <dec_neurons>
97
+ attn_dim: 2048
98
+ num_layers: !ref <dec_layers>
99
+ scaling: 1.0
100
+ channels: 10
101
+ kernel_size: 100
102
+ re_init: True
103
+ dropout: !ref <dropout>
104
+
105
+ # Linear transformation on the top of the encoder.
106
+ ctc_lin: !new:speechbrain.nnet.linear.Linear
107
+ input_size: !ref <dnn_neurons>
108
+ n_neurons: !ref <output_neurons>
109
+
110
+ # Linear transformation on the top of the decoder.
111
+ seq_lin: !new:speechbrain.nnet.linear.Linear
112
+ input_size: !ref <dec_neurons>
113
+ n_neurons: !ref <output_neurons>
114
+
115
+ # Final softmax (for log posteriors computation).
116
+ log_softmax: !new:speechbrain.nnet.activations.Softmax
117
+ apply_log: True
118
+
119
+ # Cost definition for the CTC part.
120
+ ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
121
+ blank_index: !ref <blank_index>
122
+
123
+ full_encode_step: !new:speechbrain.nnet.containers.LengthsCapableSequential
124
+ input_shape: [null, null, !ref <n_mels>]
125
+ compute_features: !ref <compute_features>
126
+ normalize: !ref <normalize>
127
+ model: !ref <encoder>
128
+
129
+ # Gathering all the submodels in a single model object.
130
+ model: !new:torch.nn.ModuleList
131
+ - - !ref <encoder>
132
+ - !ref <embedding>
133
+ - !ref <decoder>
134
+ - !ref <ctc_lin>
135
+ - !ref <seq_lin>
136
+
137
+ test_search: !new:speechbrain.decoders.S2SRNNBeamSearcher
138
+ embedding: !ref <embedding>
139
+ decoder: !ref <decoder>
140
+ linear: !ref <seq_lin>
141
+ ctc_linear: !ref <ctc_lin>
142
+ bos_index: !ref <bos_index>
143
+ eos_index: !ref <eos_index>
144
+ blank_index: !ref <blank_index>
145
+ min_decode_ratio: !ref <min_decode_ratio>
146
+ max_decode_ratio: !ref <max_decode_ratio>
147
+ beam_size: !ref <test_beam_size>
148
+ eos_threshold: !ref <eos_threshold>
149
+ using_max_attn_shift: !ref <using_max_attn_shift>
150
+ max_attn_shift: !ref <max_attn_shift>
151
+ coverage_penalty: !ref <coverage_penalty>
152
+ ctc_weight: !ref <ctc_weight_decode>
153
+ temperature: !ref <temperature>
154
+
155
+ # Objects in "modules" dict will have their parameters moved to the correct
156
+ # device, as well as having train()/eval() called on them by the Brain class
157
+ modules:
158
+ encoder: !ref <full_encode_step>
159
+ decoder: !ref <test_search>
160
+
161
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
162
+ loadables:
163
+ model: !ref <model>
164
+ normalizer: !ref <normalize>
165
+ tokenizer: !ref <tokenizer>
166
+
model.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d364b7506f4e74899a620caa2dbff8152d52b7ffcd1c20f99a29cde95b61637
3
+ size 185195486
normalizer.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db3da32bfaf668885250821f17c09091c05593ad87087ddfb88d63dc74abd735
3
+ size 1383
tokenizer.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd3d0616da87d358b9e2b3c17e4e4067ceaa8f11c1d29d499ba69f1f517fc06d
3
+ size 319229