# ############################################################################
# Model: E2E ASR with Transformer
# Encoder: Transformer Encoder
# Decoder: Transformer Decoder + (CTC/ATT joint) beamsearch
# Tokens: unigram
# losses: CTC + KLdiv (Label Smoothing loss)
# Training: Tedlium2
# Authors: Adel Moumen 2023
# ############################################################################
# Feature parameters
sample_rate: 16000
n_fft: 400
n_mels: 80
win_length: 25
n_time_mask: 7
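# Note: at 16 kHz the 25 ms window spans exactly 400 samples, matching n_fft,
# so no zero-padding is applied before the FFT. n_time_mask is presumably
# consumed by a SpecAugment-style augmentation in the training recipe; no
# module declared in this file references it.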
####################### Model parameters ###########################
# Transformer
d_model: 512
nhead: 8
num_encoder_layers: 18
num_decoder_layers: 6
csgu_linear_units: 3072
csgu_kernel_size: 31
transformer_dropout: 0.1
activation: !name:torch.nn.GELU
output_neurons: 500
# Outputs
blank_index: 0
label_smoothing: 0.1
pad_index: 0
bos_index: 1
eos_index: 2
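# The CTC blank and the padding token share index 0; indices 1 and 2 are
# reserved for BOS/EOS. output_neurons (500) must match the vocabulary size
# of the unigram SentencePiece model loaded into <tokenizer> below.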
# Decoding parameters
min_decode_ratio: 0.0
max_decode_ratio: 1.0
beam_size: 20
ctc_weight_decode: 0.3
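# In joint CTC/attention beam search, ctc_weight_decode interpolates the two
# scores, here 0.3 * CTC + 0.7 * attention; min/max_decode_ratio bound the
# hypothesis length to between 0% and 100% of the encoder output length.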
############################## models ################################
CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
    input_shape: (8, 10, 80)
    num_blocks: 2
    num_layers_per_block: 1
    out_channels: (64, 32)
    kernel_sizes: (3, 3)
    strides: (2, 2)
    residuals: (False, False)
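# Shape sketch: the two stride-2 blocks downsample frequency by 4, turning
# the 80 mel bins into 80 / 4 = 20; flattened with the 32 output channels
# this gives 20 * 32 = 640 features per frame, the input_size declared for
# the Transformer below.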
Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
    input_size: 640
    tgt_vocab: !ref <output_neurons>
    d_model: !ref <d_model>
    nhead: !ref <nhead>
    num_encoder_layers: !ref <num_encoder_layers>
    num_decoder_layers: !ref <num_decoder_layers>
    dropout: !ref <transformer_dropout>
    activation: !ref <activation>
    branchformer_activation: !ref <activation>
    encoder_module: branchformer
    csgu_linear_units: !ref <csgu_linear_units>
    kernel_size: !ref <csgu_kernel_size>
    attention_type: RelPosMHAXL
    normalize_before: True
    causal: False
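# With encoder_module: branchformer, each encoder layer runs two parallel
# branches, self-attention (here with relative positional encoding,
# RelPosMHAXL) and a cgMLP whose Convolutional Spatial Gating Unit uses a
# depthwise kernel of 31 and 3072 hidden units, and merges their outputs.
# The 6-layer decoder remains a vanilla Transformer decoder.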
ctc_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: !ref <output_neurons>
seq_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: !ref <output_neurons>
decoder: !new:speechbrain.decoders.S2STransformerBeamSearch
    modules: [!ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
    bos_index: !ref <bos_index>
    eos_index: !ref <eos_index>
    blank_index: !ref <blank_index>
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>
    beam_size: !ref <beam_size>
    ctc_weight: !ref <ctc_weight_decode>
    temperature: 1.15
    temperature_lm: 1.15
    using_eos_threshold: False
    length_normalization: True
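# Temperatures above 1 flatten the token posteriors during the search;
# temperature_lm only takes effect when a language model is passed to the
# searcher, and none is declared in this file.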
log_softmax: !new:torch.nn.LogSoftmax
    dim: -1
normalizer: !new:speechbrain.processing.features.InputNormalization
    norm_type: global
    update_until_epoch: 4
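# Global mean/variance statistics are updated only until epoch 4 of training
# and then frozen; at inference the saved statistics are restored via the
# pretrainer's "normalizer" entry below.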
compute_features: !new:speechbrain.lobes.features.Fbank
    sample_rate: !ref <sample_rate>
    n_fft: !ref <n_fft>
    win_length: !ref <win_length>
    n_mels: !ref <n_mels>
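# The hop length is left at SpeechBrain's default of 10 ms, i.e. 100 feature
# frames per second; after the CNN's 4x time downsampling the Transformer
# sees roughly 25 frames per second.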
tokenizer: !new:sentencepiece.SentencePieceProcessor
Tencoder: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
    transformer: !ref <Transformer>
encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
    input_shape: [null, null, !ref <n_mels>]
    compute_features: !ref <compute_features>
    normalize: !ref <normalizer>
    cnn: !ref <CNN>
    transformer_encoder: !ref <Tencoder>
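# Full inference encoding pipeline: waveform -> Fbank -> normalization ->
# CNN front-end -> Branchformer encoder. EncoderWrapper exposes only the
# encoding pass of the Transformer so it can sit in this Sequential.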
model: !new:torch.nn.ModuleList
    - [!ref <CNN>, !ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
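# Grouping every trainable block in one ModuleList gives the checkpointer
# and the pretrainer a single "model" object covering all learned parameters.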
modules:
    pre_transformer: !ref <CNN>
    transformer: !ref <Transformer>
    seq_lin: !ref <seq_lin>
    ctc_lin: !ref <ctc_lin>
    normalizer: !ref <normalizer>
    encoder: !ref <encoder>
    compute_features: !ref <compute_features>
    model: !ref <model>
    decoder: !ref <decoder>
# The pretrainer allows a mapping between pretrained files and instances that
# are declared in the yaml.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        normalizer: !ref <normalizer>
        model: !ref <model>
        tokenizer: !ref <tokenizer>
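# Usage sketch (assuming this file is shipped as "hyperparams.yaml" next to
# the pretrained files, e.g. in a HuggingFace model repo; the source and
# audio file names below are illustrative):
#
#     from speechbrain.inference.ASR import EncoderDecoderASR
#     # (speechbrain.pretrained.EncoderDecoderASR in older SpeechBrain)
#     asr = EncoderDecoderASR.from_hparams(source="<model-dir-or-repo>")
#     print(asr.transcribe_file("example.wav"))
#
# from_hparams loads this YAML and runs the pretrainer, which fetches and
# loads the normalizer statistics, the model weights, and the SentencePiece
# tokenizer declared in loadables.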