# ############################################################################
# Model: E2E ASR with an attention-based decoder
# Training data: All Finnish
# Encoder: CRDNN
# Decoder: GRU + beamsearch
# Authors: Aku Rouhe, 2022
# ############################################################################
num_units: 5000
tokenizer: !new:sentencepiece.SentencePieceProcessor
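# The SentencePiece model itself is filled into this object by the pretrainer
# at the bottom of this file (loadables: tokenizer).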
# Feature parameters
sample_rate: 16000
n_fft: 400
n_mels: 40
# Model parameters
activation: !name:torch.nn.LeakyReLU
dropout: 0.15
cnn_blocks: 2
cnn_channels: (64, 128)
inter_layer_pooling_size: (2, 2)
cnn_kernelsize: (3, 3)
time_pooling_size: 4
rnn_class: !name:speechbrain.nnet.RNN.LSTM
rnn_layers: 3
rnn_neurons: 512
rnn_bidirectional: True
dnn_blocks: 1
dnn_neurons: 512
emb_size: 128
dec_neurons: 1024
dec_layers: 1
output_neurons: !ref <num_units>
unk_index: 1
blank_index: 0
pad_index: 0
bos_index: 1
eos_index: 2
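# Decoding parameters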
min_decode_ratio: 0.0
max_decode_ratio: 1.0
valid_beam_size: 4
test_beam_size: 8
eos_threshold: 1.2
using_max_attn_shift: False
max_attn_shift: 240
ctc_weight_decode: 0.0
coverage_penalty: 3.0
temperature: 1.5
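# Notes on the search knobs above (as implemented in speechbrain.decoders;
# worth re-checking against the installed SpeechBrain version):
# - eos_threshold: <eos> is only hypothesized when its score is within this
#   factor of the best candidate score.
# - using_max_attn_shift / max_attn_shift: would bound how far the attention
#   peak may jump between steps; disabled here.
# - ctc_weight_decode: 0.0 disables joint CTC/attention decoding.
# - coverage_penalty: penalizes hypotheses that re-attend to the same
#   encoder frames.
# - temperature: softens the decoder distribution during search (>1 flattens).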
# Feature extraction
compute_features: !new:speechbrain.lobes.features.Fbank
    sample_rate: !ref <sample_rate>
    n_fft: !ref <n_fft>
    n_mels: !ref <n_mels>
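# Produces [batch, time, <n_mels>] Mel filterbank features (log-Mel by
# default in SpeechBrain) from an <n_fft>-point STFT at <sample_rate> Hz.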
# Feature normalization (mean and std)
normalize: !new:speechbrain.processing.features.InputNormalization
    norm_type: global
    update_until_epoch: -1
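# With norm_type: global and update_until_epoch: -1, the statistics loaded by
# the pretrainer are kept fixed, as needed for inference.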
# The CRDNN model is an encoder that combines CNNs, RNNs, and DNNs.
encoder: !new:speechbrain.lobes.models.CRDNN.CRDNN
    input_shape: [null, null, !ref <n_mels>]
    activation: !ref <activation>
    dropout: !ref <dropout>
    cnn_blocks: !ref <cnn_blocks>
    cnn_channels: !ref <cnn_channels>
    cnn_kernelsize: !ref <cnn_kernelsize>
    inter_layer_pooling_size: !ref <inter_layer_pooling_size>
    time_pooling: True
    using_2d_pooling: False
    time_pooling_size: !ref <time_pooling_size>
    rnn_class: !ref <rnn_class>
    rnn_layers: !ref <rnn_layers>
    rnn_neurons: !ref <rnn_neurons>
    rnn_bidirectional: !ref <rnn_bidirectional>
    rnn_re_init: True
    dnn_blocks: !ref <dnn_blocks>
    dnn_neurons: !ref <dnn_neurons>
    use_rnnp: False
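# Encoder output size is <dnn_neurons> (512); it must match enc_dim of the
# decoder and input_size of ctc_lin below. Time resolution is reduced by the
# CNN pooling and by time_pooling_size.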
# Embedding (from indexes to an embedding space of dimension emb_size).
embedding: !new:speechbrain.nnet.embedding.Embedding
    num_embeddings: !ref <output_neurons>
    embedding_dim: !ref <emb_size>
# Attention-based RNN decoder.
decoder: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
    enc_dim: !ref <dnn_neurons>
    input_size: !ref <emb_size>
    rnn_type: gru
    attn_type: location
    hidden_size: !ref <dec_neurons>
    attn_dim: 2048
    num_layers: !ref <dec_layers>
    scaling: 1.0
    channels: 10
    kernel_size: 100
    re_init: True
    dropout: !ref <dropout>
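# attn_type: location selects location-aware attention; channels and
# kernel_size parameterize the convolution applied to the previous attention
# weights.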
# Linear transformation on the top of the encoder.
ctc_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <dnn_neurons>
    n_neurons: !ref <output_neurons>
# Linear transformation on the top of the decoder.
seq_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <dec_neurons>
    n_neurons: !ref <output_neurons>
# Final softmax (for log posteriors computation).
log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True
# Cost definition for the CTC part.
ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
    blank_index: !ref <blank_index>
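# ctc_cost is only needed for training; during decoding the CTC branch would
# contribute through ctc_lin, but only when ctc_weight_decode > 0 (0.0 here).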
full_encode_step: !new:speechbrain.nnet.containers.LengthsCapableSequential
    input_shape: [null, null, !ref <n_mels>]
    compute_features: !ref <compute_features>
    normalize: !ref <normalize>
    model: !ref <encoder>
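# This is the "encoder" entry of the modules dict below: waveform -> features
# -> normalization -> CRDNN in a single call.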
# Gathering all the submodels in a single model object.
model: !new:torch.nn.ModuleList
    - - !ref <encoder>
      - !ref <embedding>
      - !ref <decoder>
      - !ref <ctc_lin>
      - !ref <seq_lin>
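# Bundling the trainable submodules in one ModuleList lets the pretrainer
# restore them from a single "model" checkpoint entry (see loadables below).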
test_search: !new:speechbrain.decoders.S2SRNNBeamSearcher
    embedding: !ref <embedding>
    decoder: !ref <decoder>
    linear: !ref <seq_lin>
    ctc_linear: !ref <ctc_lin>
    bos_index: !ref <bos_index>
    eos_index: !ref <eos_index>
    blank_index: !ref <blank_index>
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>
    beam_size: !ref <test_beam_size>
    eos_threshold: !ref <eos_threshold>
    using_max_attn_shift: !ref <using_max_attn_shift>
    max_attn_shift: !ref <max_attn_shift>
    coverage_penalty: !ref <coverage_penalty>
    ctc_weight: !ref <ctc_weight_decode>
    temperature: !ref <temperature>
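# valid_beam_size is defined above but not used in this file; a validation-time
# searcher could be declared analogously. A minimal sketch, assuming the same
# S2SRNNBeamSearcher arguments (commented out so it is not constructed):
# valid_search: !new:speechbrain.decoders.S2SRNNBeamSearcher
#     embedding: !ref <embedding>
#     decoder: !ref <decoder>
#     linear: !ref <seq_lin>
#     bos_index: !ref <bos_index>
#     eos_index: !ref <eos_index>
#     min_decode_ratio: !ref <min_decode_ratio>
#     max_decode_ratio: !ref <max_decode_ratio>
#     beam_size: !ref <valid_beam_size>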
# Objects in "modules" dict will have their parameters moved to the correct
# device, as well as having train()/eval() called on them by the Brain class
modules:
    encoder: !ref <full_encode_step>
    decoder: !ref <test_search>
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        model: !ref <model>
        normalizer: !ref <normalize>
        tokenizer: !ref <tokenizer>
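# The pretrainer fetches a checkpoint for each loadable above. Source paths
# are usually supplied by the loading code (e.g. EncoderDecoderASR.from_hparams,
# which points the pretrainer at the model directory); they can also be pinned
# here via the Pretrainer's paths argument. A sketch with hypothetical file
# names (commented out):
#     paths:
#         model: model.ckpt
#         normalizer: normalizer.ckpt
#         tokenizer: tokenizer.ckpt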