ddwkim committed
Commit d3065b1
1 Parent(s): 001b62f

Add normalizer tokenizer

Files changed (3)
  1. hyperparams.yaml +164 -0
  2. normalizer.ckpt +3 -0
  3. tokenizer.ckpt +3 -0
hyperparams.yaml ADDED
@@ -0,0 +1,164 @@
+ # ############################################################################
+ # Model: E2E ASR with Transformer
+ # Encoder: Conformer Encoder (small)
+ # Decoder: Transformer Decoder + (CTC/ATT joint) beamsearch + TransformerLM
+ # Tokens: unigram
+ # Losses: CTC + KLdiv (label smoothing loss)
+ # Training: KsponSpeech 965.2h
+ # Based on the work of: Jianyuan Zhong, Titouan Parcollet 2021
+ # Authors: Dong Won Kim, Dongwoo Kim 2021, 2024
+ # ############################################################################
+
+ # Feature parameters
+ sample_rate: 16000
+ n_fft: 512
+ n_mels: 80
+
+ ####################### Model parameters ###########################
+ # Transformer
+ d_model: 144
+ nhead: 4
+ num_encoder_layers: 12
+ num_decoder_layers: 4
+ d_ffn: 1024
+ transformer_dropout: 0.1
+ activation: !name:torch.nn.GELU
+ output_neurons: 5000
+
+ # Outputs
+ blank_index: 0
+ label_smoothing: 0.1
+ pad_index: 0
+ bos_index: 1
+ eos_index: 2
+
+ # Decoding parameters
+ min_decode_ratio: 0.0
+ max_decode_ratio: 1.0
+ test_beam_size: 66
+ lm_weight: 0.60
+ ctc_weight_decode: 0.40
+
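+ # During beam search, a partial hypothesis is scored roughly as
+ #     log p_attn + lm_weight * log p_lm + ctc_weight_decode * log p_ctc,
+ # with the LM and CTC terms supplied by the scorers declared below.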
+ ############################## models ################################
+
+ CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
+     input_shape: (8, 10, 80)
+     num_blocks: 2
+     num_layers_per_block: 1
+     out_channels: (64, 32)
+     kernel_sizes: (3, 3)
+     strides: (2, 2)
+     residuals: (False, False)
+
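+ # The two stride-2 blocks above downsample the 80 mel channels by a factor
+ # of 4 (80 -> 20); flattened with the 32 output channels, this gives
+ # 20 * 32 = 640, the Transformer input_size below.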
+ Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
+     input_size: 640
+     tgt_vocab: !ref <output_neurons>
+     d_model: !ref <d_model>
+     nhead: !ref <nhead>
+     num_encoder_layers: !ref <num_encoder_layers>
+     num_decoder_layers: !ref <num_decoder_layers>
+     d_ffn: !ref <d_ffn>
+     activation: !ref <activation>
+     encoder_module: conformer
+     attention_type: RelPosMHAXL
+     normalize_before: True
+     causal: False
+
+ ctc_lin: !new:speechbrain.nnet.linear.Linear
+     input_size: !ref <d_model>
+     n_neurons: !ref <output_neurons>
+
+ seq_lin: !new:speechbrain.nnet.linear.Linear
+     input_size: !ref <d_model>
+     n_neurons: !ref <output_neurons>
+
+ transformerlm_scorer: !new:speechbrain.decoders.scorer.TransformerLMScorer
+     language_model: !ref <lm_model>
+     temperature: 1.15
+
+ ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer
+     eos_index: !ref <eos_index>
+     blank_index: !ref <blank_index>
+     ctc_fc: !ref <ctc_lin>
+
+ scorer: !new:speechbrain.decoders.scorer.ScorerBuilder
+     full_scorers: [!ref <transformerlm_scorer>, !ref <ctc_scorer>]
+     weights:
+         transformerlm: !ref <lm_weight>
+         ctc: !ref <ctc_weight_decode>
+
+ decoder: !new:speechbrain.decoders.S2STransformerBeamSearcher
+     modules: [!ref <Transformer>, !ref <seq_lin>]
+     bos_index: !ref <bos_index>
+     eos_index: !ref <eos_index>
+     min_decode_ratio: !ref <min_decode_ratio>
+     max_decode_ratio: !ref <max_decode_ratio>
+     beam_size: !ref <test_beam_size>
+     temperature: 1.15
+     using_eos_threshold: False
+     length_normalization: True
+     scorer: !ref <scorer>
+
+ log_softmax: !new:torch.nn.LogSoftmax
+     dim: -1
+
+ normalizer: !new:speechbrain.processing.features.InputNormalization
+     norm_type: global
+
+ compute_features: !new:speechbrain.lobes.features.Fbank
+     sample_rate: !ref <sample_rate>
+     n_fft: !ref <n_fft>
+     n_mels: !ref <n_mels>
+
+ # Transformer LM used during decoding. Visit the HuggingFace model
+ # corresponding to the pretrained_lm_tokenizer_path for more details.
+ # NB: it has to match the pre-trained TransformerLM!
+ lm_model: !new:speechbrain.lobes.models.transformer.TransformerLM.TransformerLM
+     vocab: 5000
+     d_model: 768
+     nhead: 12
+     num_encoder_layers: 12
+     num_decoder_layers: 0
+     d_ffn: 3072
+     dropout: 0.0
+     activation: !name:torch.nn.GELU
+     normalize_before: False
+
+ tokenizer: !new:sentencepiece.SentencePieceProcessor
+
+ Tencoder: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
+     transformer: !ref <Transformer>
+
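+ # Inference pipeline: waveform -> Fbank features -> global input
+ # normalization -> CNN front end (4x downsampling) -> Conformer encoder.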
+ encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
+     input_shape: [null, null, !ref <n_mels>]
+     compute_features: !ref <compute_features>
+     normalize: !ref <normalizer>
+     cnn: !ref <CNN>
+     transformer_encoder: !ref <Tencoder>
+
+ # Models
+ asr_model: !new:torch.nn.ModuleList
+     - [!ref <CNN>, !ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
+
+ modules:
+     compute_features: !ref <compute_features>
+     normalizer: !ref <normalizer>
+     pre_transformer: !ref <CNN>
+     transformer: !ref <Transformer>
+     asr_model: !ref <asr_model>
+     lm_model: !ref <lm_model>
+     encoder: !ref <encoder>
+     decoder: !ref <decoder>
+
+ # The pretrainer allows a mapping between pretrained files and instances
+ # that are declared in the yaml.
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+     loadables:
+         normalizer: !ref <normalizer>
+         asr: !ref <asr_model>
+         lm: !ref <lm_model>
+         tokenizer: !ref <tokenizer>
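
For context, a minimal sketch of how a hyperparams file like this is consumed at inference time; it assumes SpeechBrain 1.0 (older releases import EncoderDecoderASR from speechbrain.pretrained) and uses a placeholder path, not a real repository id:

    # Sketch, not part of this commit: run the full pipeline end to end.
    from speechbrain.inference.ASR import EncoderDecoderASR

    # "path/to/this/repo" is a placeholder for a local clone or an HF repo id.
    # from_hparams reads hyperparams.yaml and lets the pretrainer pull
    # normalizer.ckpt, tokenizer.ckpt, and the asr/lm checkpoints it maps.
    asr = EncoderDecoderASR.from_hparams(source="path/to/this/repo")

    # Fbank -> normalize -> CNN -> Conformer encoder, then joint CTC/attention
    # beam search rescored by the TransformerLM.
    print(asr.transcribe_file("example.wav"))
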
normalizer.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7bc96f5ca3926f21a055ccc21815446da200e871d3613ba20dad1a1811feb1c0
+ size 2218
tokenizer.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d419e55734c26df6c5690671be2b887a7db389c1a7f63286111ce737508c6569
+ size 313900
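
Both .ckpt files above are Git LFS pointers; the binaries are materialized when the repository is cloned or downloaded. tokenizer.ckpt holds the unigram SentencePiece model that the pretrainer loads into the SentencePieceProcessor declared in hyperparams.yaml. A quick sanity check, assuming the real file has been fetched:

    # Sketch: inspect the fetched SentencePiece model (not the pointer text).
    import sentencepiece as spm

    sp = spm.SentencePieceProcessor()
    sp.load("tokenizer.ckpt")
    print(sp.get_piece_size())  # expected to match output_neurons (5000)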