File size: 4,322 Bytes
b67e798
 
 
 
 
 
 
 
 
 
 
 
 
 
8f98994
 
b67e798
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4a3b316
 
 
 
 
 
 
 
 
 
cfaa032
4a3b316
b67e798
 
 
 
a22900d
b67e798
 
 
4a3b316
b67e798
 
 
 
8f98994
b67e798
 
 
 
 
 
 
 
 
 
 
8f98994
 
 
 
 
 
 
 
 
b67e798
 
 
 
8f98994
 
 
 
 
 
 
 
 
 
 
b67e798
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# ############################################################################
# Model: E2E ASR with Transformer
# Encoder: Transformer Encoder
# Decoder: Transformer Decoder + (CTC/ATT joint) beamsearch
# Tokens: unigram
# losses: CTC + KLdiv (Label Smoothing loss)
# Training: Tedlium2
# Authors:  Adel Moumen 2023
# ############################################################################

# Feature parameters
sample_rate: 16000
n_fft: 400
n_mels: 80
win_length: 25
n_time_mask: 7

####################### Model parameters ###########################
# Transformer
d_model: 512
nhead: 8
num_encoder_layers: 18
num_decoder_layers: 6
csgu_linear_units: 3072
csgu_kernel_size: 31
transformer_dropout: 0.1
activation: !name:torch.nn.GELU
output_neurons: 500

# Outputs
blank_index: 0
label_smoothing: 0.1
pad_index: 0
bos_index: 1
eos_index: 2

# Decoding parameters
min_decode_ratio: 0.0
max_decode_ratio: 1.0
beam_size: 20
ctc_weight_decode: 0.3

############################## models ################################

CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
    input_shape: (8, 10, 80)
    num_blocks: 2
    num_layers_per_block: 1
    out_channels: (64, 32)
    kernel_sizes: (3, 3)
    strides: (2, 2)
    residuals: (False, False)

Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
    input_size: 640
    tgt_vocab: !ref <output_neurons>
    d_model: !ref <d_model>
    nhead: !ref <nhead>
    num_encoder_layers: !ref <num_encoder_layers>
    num_decoder_layers: !ref <num_decoder_layers>
    dropout: !ref <transformer_dropout>
    activation: !ref <activation>
    branchformer_activation: !ref <activation>
    encoder_module: branchformer
    csgu_linear_units: !ref <csgu_linear_units>
    kernel_size: !ref <csgu_kernel_size>
    attention_type: RelPosMHAXL
    normalize_before: True
    causal: False

ctc_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: !ref <output_neurons>

seq_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: !ref <output_neurons>

ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer
    eos_index: !ref <eos_index>
    blank_index: !ref <blank_index>
    ctc_fc: !ref <ctc_lin>
    
scorer: !new:speechbrain.decoders.scorer.ScorerBuilder
    full_scorers: [!ref <ctc_scorer>]
    weights:
        ctc: !ref <ctc_weight_decode>
        
decoder: !new:speechbrain.decoders.S2STransformerBeamSearcher
    modules: [!ref <Transformer>, !ref <seq_lin>]
    bos_index: !ref <bos_index>
    eos_index: !ref <eos_index>
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>
    beam_size: !ref <beam_size>
    temperature: 1.15
    using_eos_threshold: False
    length_normalization: True
    scorer: !ref <scorer>

log_softmax: !new:torch.nn.LogSoftmax
    dim: -1

normalizer: !new:speechbrain.processing.features.InputNormalization
    norm_type: global
    update_until_epoch: 4

compute_features: !new:speechbrain.lobes.features.Fbank
    sample_rate: !ref <sample_rate>
    n_fft: !ref <n_fft>
    win_length: !ref <win_length>
    n_mels: !ref <n_mels>

tokenizer: !new:sentencepiece.SentencePieceProcessor

Tencoder: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
    transformer: !ref <Transformer>

encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
    input_shape: [null, null, !ref <n_mels>]
    compute_features: !ref <compute_features>
    normalize: !ref <normalizer>
    cnn: !ref <CNN>
    transformer_encoder: !ref <Tencoder>

model: !new:torch.nn.ModuleList
    - [!ref <CNN>, !ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]

modules:
    pre_transformer: !ref <CNN>
    transformer: !ref <Transformer>
    seq_lin: !ref <seq_lin>
    ctc_lin: !ref <ctc_lin>
    normalizer: !ref <normalizer>
    encoder: !ref <encoder>
    compute_features: !ref <compute_features>
    model: !ref <model>
    decoder: !ref <decoder>

# The pretrainer allows a mapping between pretrained files and instances that
# are declared in the yaml.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
   loadables:
      normalizer: !ref <normalizer>
      model: !ref <model>
      tokenizer: !ref <tokenizer>