Text-to-Speech
ESPnet
English
audio
GunnarThor committed on
Commit
b4da82b
1 Parent(s): 48a6d04

Update model

Browse files
Files changed (30) hide show
  1. README.md +378 -1
  2. dump/xvector/dev_e/spk_xvector.ark +0 -0
  3. dump/xvector/dev_e/spk_xvector.scp +1 -0
  4. dump/xvector/eval1_e/spk_xvector.ark +0 -0
  5. dump/xvector/eval1_e/spk_xvector.scp +1 -0
  6. dump/xvector/train_e/spk_xvector.ark +0 -0
  7. dump/xvector/train_e/spk_xvector.scp +1 -0
  8. exp/tts_finetune_e_loudnorm_xvector_fastspeech2/config.yaml +297 -0
  9. exp/tts_finetune_e_loudnorm_xvector_fastspeech2/images/backward_time.png +0 -0
  10. exp/tts_finetune_e_loudnorm_xvector_fastspeech2/images/clip.png +0 -0
  11. exp/tts_finetune_e_loudnorm_xvector_fastspeech2/images/decoder_alpha.png +0 -0
  12. exp/tts_finetune_e_loudnorm_xvector_fastspeech2/images/duration_loss.png +0 -0
  13. exp/tts_finetune_e_loudnorm_xvector_fastspeech2/images/encoder_alpha.png +0 -0
  14. exp/tts_finetune_e_loudnorm_xvector_fastspeech2/images/energy_loss.png +0 -0
  15. exp/tts_finetune_e_loudnorm_xvector_fastspeech2/images/forward_time.png +0 -0
  16. exp/tts_finetune_e_loudnorm_xvector_fastspeech2/images/gpu_max_cached_mem_GB.png +0 -0
  17. exp/tts_finetune_e_loudnorm_xvector_fastspeech2/images/grad_norm.png +0 -0
  18. exp/tts_finetune_e_loudnorm_xvector_fastspeech2/images/iter_time.png +0 -0
  19. exp/tts_finetune_e_loudnorm_xvector_fastspeech2/images/l1_loss.png +0 -0
  20. exp/tts_finetune_e_loudnorm_xvector_fastspeech2/images/loss.png +0 -0
  21. exp/tts_finetune_e_loudnorm_xvector_fastspeech2/images/loss_scale.png +0 -0
  22. exp/tts_finetune_e_loudnorm_xvector_fastspeech2/images/optim0_lr0.png +0 -0
  23. exp/tts_finetune_e_loudnorm_xvector_fastspeech2/images/optim_step_time.png +0 -0
  24. exp/tts_finetune_e_loudnorm_xvector_fastspeech2/images/pitch_loss.png +0 -0
  25. exp/tts_finetune_e_loudnorm_xvector_fastspeech2/images/train_time.png +0 -0
  26. exp/tts_finetune_e_loudnorm_xvector_fastspeech2/valid.loss.ave_5best.pth +3 -0
  27. exp/tts_stats_e/train/energy_stats.npz +3 -0
  28. exp/tts_stats_e/train/feats_stats.npz +3 -0
  29. exp/tts_stats_e/train/pitch_stats.npz +3 -0
  30. meta.yaml +8 -0
README.md CHANGED
@@ -1,3 +1,380 @@
1
  ---
2
- license: apache-2.0
 
 
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - text-to-speech
6
+ language: en
7
+ datasets:
8
+ - talromur
9
+ license: cc-by-4.0
10
  ---
11
+
12
+ ## ESPnet2 TTS model
13
+
14
+ ### `language-and-voice-lab/talromur_e_loudnorm_xvector_finetune_fastspeech2`
15
+
16
+ This model was trained by G-Thor using the talromur recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+ git checkout d0047402e830a3c53e8b590064af4bf70415fb3b
26
+ pip install -e .
27
+ cd egs2/talromur/tts1
28
+ ./run.sh --skip_data_prep false --skip_train true --download_model language-and-voice-lab/talromur_e_loudnorm_xvector_finetune_fastspeech2
29
+ ```
30
+
31
+
32
+
33
+ ## TTS config
34
+
35
+ <details><summary>expand</summary>
36
+
37
+ ```
38
+ config: ./conf/tuning/finetune_xvector_fastspeech2.yaml
39
+ print_config: false
40
+ log_level: INFO
41
+ drop_last_iter: false
42
+ dry_run: false
43
+ iterator_type: sequence
44
+ valid_iterator_type: null
45
+ output_dir: exp/tts_finetune_e_loudnorm_xvector_fastspeech2
46
+ ngpu: 1
47
+ seed: 0
48
+ num_workers: 1
49
+ num_att_plot: 3
50
+ dist_backend: nccl
51
+ dist_init_method: env://
52
+ dist_world_size: null
53
+ dist_rank: null
54
+ local_rank: 0
55
+ dist_master_addr: null
56
+ dist_master_port: null
57
+ dist_launcher: null
58
+ multiprocessing_distributed: false
59
+ unused_parameters: false
60
+ sharded_ddp: false
61
+ cudnn_enabled: true
62
+ cudnn_benchmark: false
63
+ cudnn_deterministic: true
64
+ collect_stats: false
65
+ write_collected_feats: false
66
+ max_epoch: 50
67
+ patience: null
68
+ val_scheduler_criterion:
69
+ - valid
70
+ - loss
71
+ early_stopping_criterion:
72
+ - valid
73
+ - loss
74
+ - min
75
+ best_model_criterion:
76
+ - - valid
77
+ - loss
78
+ - min
79
+ - - train
80
+ - loss
81
+ - min
82
+ keep_nbest_models: 5
83
+ nbest_averaging_interval: 0
84
+ grad_clip: 1.0
85
+ grad_clip_type: 2.0
86
+ grad_noise: false
87
+ accum_grad: 8
88
+ no_forward_run: false
89
+ resume: true
90
+ train_dtype: float32
91
+ use_amp: false
92
+ log_interval: null
93
+ use_matplotlib: true
94
+ use_tensorboard: true
95
+ create_graph_in_tensorboard: false
96
+ use_wandb: false
97
+ wandb_project: null
98
+ wandb_id: null
99
+ wandb_entity: null
100
+ wandb_name: null
101
+ wandb_model_log_interval: -1
102
+ detect_anomaly: false
103
+ use_adapter: false
104
+ adapter: lora
105
+ save_strategy: all
106
+ adapter_conf: {}
107
+ pretrain_path: null
108
+ init_param:
109
+ - /users/home/gunnaro/talromur_1and2_spk_avg_xvector_fastspeech2/exp/tts_xvector_fastspeech2_spk_avg_combined/valid.loss.ave_5best.pth:tts:tts
110
+ ignore_init_mismatch: false
111
+ freeze_param: []
112
+ num_iters_per_epoch: 800
113
+ batch_size: 20
114
+ valid_batch_size: null
115
+ batch_bins: 4500000
116
+ valid_batch_bins: null
117
+ train_shape_file:
118
+ - exp/tts_stats_e/train/text_shape.phn
119
+ - exp/tts_stats_e/train/speech_shape
120
+ valid_shape_file:
121
+ - exp/tts_stats_e/valid/text_shape.phn
122
+ - exp/tts_stats_e/valid/speech_shape
123
+ batch_type: numel
124
+ valid_batch_type: null
125
+ fold_length:
126
+ - 150
127
+ - 204800
128
+ sort_in_batch: descending
129
+ shuffle_within_batch: false
130
+ sort_batch: descending
131
+ multiple_iterator: false
132
+ chunk_length: 500
133
+ chunk_shift_ratio: 0.5
134
+ num_cache_chunks: 1024
135
+ chunk_excluded_key_prefixes: []
136
+ chunk_default_fs: null
137
+ train_data_path_and_name_and_type:
138
+ - - dump/raw/train_e/text
139
+ - text
140
+ - text
141
+ - - data/train_e/durations
142
+ - durations
143
+ - text_int
144
+ - - dump/raw/train_e/wav.scp
145
+ - speech
146
+ - sound
147
+ - - dump/xvector/train_e/xvector.scp
148
+ - spembs
149
+ - kaldi_ark
150
+ valid_data_path_and_name_and_type:
151
+ - - dump/raw/dev_e/text
152
+ - text
153
+ - text
154
+ - - data/dev_e/durations
155
+ - durations
156
+ - text_int
157
+ - - dump/raw/dev_e/wav.scp
158
+ - speech
159
+ - sound
160
+ - - dump/xvector/dev_e/xvector.scp
161
+ - spembs
162
+ - kaldi_ark
163
+ allow_variable_data_keys: false
164
+ max_cache_size: 0.0
165
+ max_cache_fd: 32
166
+ allow_multi_rates: false
167
+ valid_max_cache_size: null
168
+ exclude_weight_decay: false
169
+ exclude_weight_decay_conf: {}
170
+ optim: adam
171
+ optim_conf:
172
+ lr: 0.1
173
+ scheduler: noamlr
174
+ scheduler_conf:
175
+ model_size: 384
176
+ warmup_steps: 4000
177
+ token_list:
178
+ - <blank>
179
+ - <unk>
180
+ - a
181
+ - r
182
+ - sil
183
+ - I
184
+ - t
185
+ - n
186
+ - s
187
+ - D
188
+ - Y
189
+ - E
190
+ - l
191
+ - v
192
+ - m
193
+ - h
194
+ - k
195
+ - j
196
+ - G
197
+ - T
198
+ - f
199
+ - p
200
+ - 'E:'
201
+ - c
202
+ - i
203
+ - 'au:'
204
+ - 'O:'
205
+ - 'a:'
206
+ - ei
207
+ - 'i:'
208
+ - r_0
209
+ - t_h
210
+ - O
211
+ - k_h
212
+ - ou
213
+ - ai
214
+ - '9'
215
+ - au
216
+ - 'I:'
217
+ - 'ou:'
218
+ - u
219
+ - 'ei:'
220
+ - N
221
+ - l_0
222
+ - 'u:'
223
+ - n_0
224
+ - '9:'
225
+ - 'ai:'
226
+ - 9i
227
+ - c_h
228
+ - p_h
229
+ - x
230
+ - C
231
+ - '9i:'
232
+ - 'Y:'
233
+ - J
234
+ - N_0
235
+ - m_0
236
+ - Oi
237
+ - Yi
238
+ - J_0
239
+ - spn
240
+ - '1'
241
+ - '7'
242
+ - <sos/eos>
243
+ odim: null
244
+ model_conf: {}
245
+ use_preprocessor: true
246
+ token_type: phn
247
+ bpemodel: null
248
+ non_linguistic_symbols: null
249
+ cleaner: null
250
+ g2p: null
251
+ feats_extract: fbank
252
+ feats_extract_conf:
253
+ n_fft: 1024
254
+ hop_length: 256
255
+ win_length: null
256
+ fs: 22050
257
+ fmin: 80
258
+ fmax: 7600
259
+ n_mels: 80
260
+ normalize: global_mvn
261
+ normalize_conf:
262
+ stats_file: exp/tts_stats_e/train/feats_stats.npz
263
+ tts: fastspeech2
264
+ tts_conf:
265
+ adim: 384
266
+ aheads: 2
267
+ elayers: 4
268
+ eunits: 1536
269
+ dlayers: 4
270
+ dunits: 1536
271
+ positionwise_layer_type: conv1d
272
+ positionwise_conv_kernel_size: 3
273
+ duration_predictor_layers: 2
274
+ duration_predictor_chans: 256
275
+ duration_predictor_kernel_size: 3
276
+ postnet_layers: 5
277
+ postnet_filts: 5
278
+ postnet_chans: 256
279
+ use_masking: true
280
+ use_scaled_pos_enc: true
281
+ encoder_normalize_before: true
282
+ decoder_normalize_before: true
283
+ reduction_factor: 1
284
+ init_type: xavier_uniform
285
+ init_enc_alpha: 1.0
286
+ init_dec_alpha: 1.0
287
+ transformer_enc_dropout_rate: 0.2
288
+ transformer_enc_positional_dropout_rate: 0.2
289
+ transformer_enc_attn_dropout_rate: 0.2
290
+ transformer_dec_dropout_rate: 0.2
291
+ transformer_dec_positional_dropout_rate: 0.2
292
+ transformer_dec_attn_dropout_rate: 0.2
293
+ pitch_predictor_layers: 5
294
+ pitch_predictor_chans: 256
295
+ pitch_predictor_kernel_size: 5
296
+ pitch_predictor_dropout: 0.5
297
+ pitch_embed_kernel_size: 1
298
+ pitch_embed_dropout: 0.0
299
+ stop_gradient_from_pitch_predictor: true
300
+ energy_predictor_layers: 2
301
+ energy_predictor_chans: 256
302
+ energy_predictor_kernel_size: 3
303
+ energy_predictor_dropout: 0.5
304
+ energy_embed_kernel_size: 1
305
+ energy_embed_dropout: 0.0
306
+ stop_gradient_from_energy_predictor: false
307
+ spk_embed_dim: 512
308
+ spk_embed_integration_type: add
309
+ pitch_extract: dio
310
+ pitch_extract_conf:
311
+ fs: 22050
312
+ n_fft: 1024
313
+ hop_length: 256
314
+ f0max: 400
315
+ f0min: 80
316
+ reduction_factor: 1
317
+ pitch_normalize: global_mvn
318
+ pitch_normalize_conf:
319
+ stats_file: exp/tts_stats_e/train/pitch_stats.npz
320
+ energy_extract: energy
321
+ energy_extract_conf:
322
+ fs: 22050
323
+ n_fft: 1024
324
+ hop_length: 256
325
+ win_length: null
326
+ reduction_factor: 1
327
+ energy_normalize: global_mvn
328
+ energy_normalize_conf:
329
+ stats_file: exp/tts_stats_e/train/energy_stats.npz
330
+ required:
331
+ - output_dir
332
+ - token_list
333
+ version: '202402'
334
+ distributed: false
335
+ ```
336
+
337
+ </details>
338
+
339
+
340
+
341
+ ### Citing ESPnet
342
+
343
+ ```bibtex
344
+ @inproceedings{watanabe2018espnet,
345
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
346
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
347
+ year={2018},
348
+ booktitle={Proceedings of Interspeech},
349
+ pages={2207--2211},
350
+ doi={10.21437/Interspeech.2018-1456},
351
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
352
+ }
353
+
354
+
355
+
356
+
357
+ @inproceedings{hayashi2020espnet,
358
+ title={{ESPnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
359
+ author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
360
+ booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
361
+ pages={7654--7658},
362
+ year={2020},
363
+ organization={IEEE}
364
+ }
365
+
366
+
367
+ ```
368
+
369
+ or arXiv:
370
+
371
+ ```bibtex
372
+ @misc{watanabe2018espnet,
373
+ title={ESPnet: End-to-End Speech Processing Toolkit},
374
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
375
+ year={2018},
376
+ eprint={1804.00015},
377
+ archivePrefix={arXiv},
378
+ primaryClass={cs.CL}
379
+ }
380
+ ```
dump/xvector/dev_e/spk_xvector.ark ADDED
Binary file (2.06 kB). View file
 
dump/xvector/dev_e/spk_xvector.scp ADDED
@@ -0,0 +1 @@
 
 
1
+ e dump/xvector/dev_e/spk_xvector.ark:2
dump/xvector/eval1_e/spk_xvector.ark ADDED
Binary file (2.06 kB). View file
 
dump/xvector/eval1_e/spk_xvector.scp ADDED
@@ -0,0 +1 @@
 
 
1
+ e dump/xvector/eval1_e/spk_xvector.ark:2
dump/xvector/train_e/spk_xvector.ark ADDED
Binary file (2.06 kB). View file
 
dump/xvector/train_e/spk_xvector.scp ADDED
@@ -0,0 +1 @@
 
 
1
+ e dump/xvector/train_e/spk_xvector.ark:2
exp/tts_finetune_e_loudnorm_xvector_fastspeech2/config.yaml ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: ./conf/tuning/finetune_xvector_fastspeech2.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ drop_last_iter: false
5
+ dry_run: false
6
+ iterator_type: sequence
7
+ valid_iterator_type: null
8
+ output_dir: exp/tts_finetune_e_loudnorm_xvector_fastspeech2
9
+ ngpu: 1
10
+ seed: 0
11
+ num_workers: 1
12
+ num_att_plot: 3
13
+ dist_backend: nccl
14
+ dist_init_method: env://
15
+ dist_world_size: null
16
+ dist_rank: null
17
+ local_rank: 0
18
+ dist_master_addr: null
19
+ dist_master_port: null
20
+ dist_launcher: null
21
+ multiprocessing_distributed: false
22
+ unused_parameters: false
23
+ sharded_ddp: false
24
+ cudnn_enabled: true
25
+ cudnn_benchmark: false
26
+ cudnn_deterministic: true
27
+ collect_stats: false
28
+ write_collected_feats: false
29
+ max_epoch: 50
30
+ patience: null
31
+ val_scheduler_criterion:
32
+ - valid
33
+ - loss
34
+ early_stopping_criterion:
35
+ - valid
36
+ - loss
37
+ - min
38
+ best_model_criterion:
39
+ - - valid
40
+ - loss
41
+ - min
42
+ - - train
43
+ - loss
44
+ - min
45
+ keep_nbest_models: 5
46
+ nbest_averaging_interval: 0
47
+ grad_clip: 1.0
48
+ grad_clip_type: 2.0
49
+ grad_noise: false
50
+ accum_grad: 8
51
+ no_forward_run: false
52
+ resume: true
53
+ train_dtype: float32
54
+ use_amp: false
55
+ log_interval: null
56
+ use_matplotlib: true
57
+ use_tensorboard: true
58
+ create_graph_in_tensorboard: false
59
+ use_wandb: false
60
+ wandb_project: null
61
+ wandb_id: null
62
+ wandb_entity: null
63
+ wandb_name: null
64
+ wandb_model_log_interval: -1
65
+ detect_anomaly: false
66
+ use_adapter: false
67
+ adapter: lora
68
+ save_strategy: all
69
+ adapter_conf: {}
70
+ pretrain_path: null
71
+ init_param:
72
+ - /users/home/gunnaro/talromur_1and2_spk_avg_xvector_fastspeech2/exp/tts_xvector_fastspeech2_spk_avg_combined/valid.loss.ave_5best.pth:tts:tts
73
+ ignore_init_mismatch: false
74
+ freeze_param: []
75
+ num_iters_per_epoch: 800
76
+ batch_size: 20
77
+ valid_batch_size: null
78
+ batch_bins: 4500000
79
+ valid_batch_bins: null
80
+ train_shape_file:
81
+ - exp/tts_stats_e/train/text_shape.phn
82
+ - exp/tts_stats_e/train/speech_shape
83
+ valid_shape_file:
84
+ - exp/tts_stats_e/valid/text_shape.phn
85
+ - exp/tts_stats_e/valid/speech_shape
86
+ batch_type: numel
87
+ valid_batch_type: null
88
+ fold_length:
89
+ - 150
90
+ - 204800
91
+ sort_in_batch: descending
92
+ shuffle_within_batch: false
93
+ sort_batch: descending
94
+ multiple_iterator: false
95
+ chunk_length: 500
96
+ chunk_shift_ratio: 0.5
97
+ num_cache_chunks: 1024
98
+ chunk_excluded_key_prefixes: []
99
+ chunk_default_fs: null
100
+ train_data_path_and_name_and_type:
101
+ - - dump/raw/train_e/text
102
+ - text
103
+ - text
104
+ - - data/train_e/durations
105
+ - durations
106
+ - text_int
107
+ - - dump/raw/train_e/wav.scp
108
+ - speech
109
+ - sound
110
+ - - dump/xvector/train_e/xvector.scp
111
+ - spembs
112
+ - kaldi_ark
113
+ valid_data_path_and_name_and_type:
114
+ - - dump/raw/dev_e/text
115
+ - text
116
+ - text
117
+ - - data/dev_e/durations
118
+ - durations
119
+ - text_int
120
+ - - dump/raw/dev_e/wav.scp
121
+ - speech
122
+ - sound
123
+ - - dump/xvector/dev_e/xvector.scp
124
+ - spembs
125
+ - kaldi_ark
126
+ allow_variable_data_keys: false
127
+ max_cache_size: 0.0
128
+ max_cache_fd: 32
129
+ allow_multi_rates: false
130
+ valid_max_cache_size: null
131
+ exclude_weight_decay: false
132
+ exclude_weight_decay_conf: {}
133
+ optim: adam
134
+ optim_conf:
135
+ lr: 0.1
136
+ scheduler: noamlr
137
+ scheduler_conf:
138
+ model_size: 384
139
+ warmup_steps: 4000
140
+ token_list:
141
+ - <blank>
142
+ - <unk>
143
+ - a
144
+ - r
145
+ - sil
146
+ - I
147
+ - t
148
+ - n
149
+ - s
150
+ - D
151
+ - Y
152
+ - E
153
+ - l
154
+ - v
155
+ - m
156
+ - h
157
+ - k
158
+ - j
159
+ - G
160
+ - T
161
+ - f
162
+ - p
163
+ - 'E:'
164
+ - c
165
+ - i
166
+ - 'au:'
167
+ - 'O:'
168
+ - 'a:'
169
+ - ei
170
+ - 'i:'
171
+ - r_0
172
+ - t_h
173
+ - O
174
+ - k_h
175
+ - ou
176
+ - ai
177
+ - '9'
178
+ - au
179
+ - 'I:'
180
+ - 'ou:'
181
+ - u
182
+ - 'ei:'
183
+ - N
184
+ - l_0
185
+ - 'u:'
186
+ - n_0
187
+ - '9:'
188
+ - 'ai:'
189
+ - 9i
190
+ - c_h
191
+ - p_h
192
+ - x
193
+ - C
194
+ - '9i:'
195
+ - 'Y:'
196
+ - J
197
+ - N_0
198
+ - m_0
199
+ - Oi
200
+ - Yi
201
+ - J_0
202
+ - spn
203
+ - '1'
204
+ - '7'
205
+ - <sos/eos>
206
+ odim: null
207
+ model_conf: {}
208
+ use_preprocessor: true
209
+ token_type: phn
210
+ bpemodel: null
211
+ non_linguistic_symbols: null
212
+ cleaner: null
213
+ g2p: null
214
+ feats_extract: fbank
215
+ feats_extract_conf:
216
+ n_fft: 1024
217
+ hop_length: 256
218
+ win_length: null
219
+ fs: 22050
220
+ fmin: 80
221
+ fmax: 7600
222
+ n_mels: 80
223
+ normalize: global_mvn
224
+ normalize_conf:
225
+ stats_file: exp/tts_stats_e/train/feats_stats.npz
226
+ tts: fastspeech2
227
+ tts_conf:
228
+ adim: 384
229
+ aheads: 2
230
+ elayers: 4
231
+ eunits: 1536
232
+ dlayers: 4
233
+ dunits: 1536
234
+ positionwise_layer_type: conv1d
235
+ positionwise_conv_kernel_size: 3
236
+ duration_predictor_layers: 2
237
+ duration_predictor_chans: 256
238
+ duration_predictor_kernel_size: 3
239
+ postnet_layers: 5
240
+ postnet_filts: 5
241
+ postnet_chans: 256
242
+ use_masking: true
243
+ use_scaled_pos_enc: true
244
+ encoder_normalize_before: true
245
+ decoder_normalize_before: true
246
+ reduction_factor: 1
247
+ init_type: xavier_uniform
248
+ init_enc_alpha: 1.0
249
+ init_dec_alpha: 1.0
250
+ transformer_enc_dropout_rate: 0.2
251
+ transformer_enc_positional_dropout_rate: 0.2
252
+ transformer_enc_attn_dropout_rate: 0.2
253
+ transformer_dec_dropout_rate: 0.2
254
+ transformer_dec_positional_dropout_rate: 0.2
255
+ transformer_dec_attn_dropout_rate: 0.2
256
+ pitch_predictor_layers: 5
257
+ pitch_predictor_chans: 256
258
+ pitch_predictor_kernel_size: 5
259
+ pitch_predictor_dropout: 0.5
260
+ pitch_embed_kernel_size: 1
261
+ pitch_embed_dropout: 0.0
262
+ stop_gradient_from_pitch_predictor: true
263
+ energy_predictor_layers: 2
264
+ energy_predictor_chans: 256
265
+ energy_predictor_kernel_size: 3
266
+ energy_predictor_dropout: 0.5
267
+ energy_embed_kernel_size: 1
268
+ energy_embed_dropout: 0.0
269
+ stop_gradient_from_energy_predictor: false
270
+ spk_embed_dim: 512
271
+ spk_embed_integration_type: add
272
+ pitch_extract: dio
273
+ pitch_extract_conf:
274
+ fs: 22050
275
+ n_fft: 1024
276
+ hop_length: 256
277
+ f0max: 400
278
+ f0min: 80
279
+ reduction_factor: 1
280
+ pitch_normalize: global_mvn
281
+ pitch_normalize_conf:
282
+ stats_file: exp/tts_stats_e/train/pitch_stats.npz
283
+ energy_extract: energy
284
+ energy_extract_conf:
285
+ fs: 22050
286
+ n_fft: 1024
287
+ hop_length: 256
288
+ win_length: null
289
+ reduction_factor: 1
290
+ energy_normalize: global_mvn
291
+ energy_normalize_conf:
292
+ stats_file: exp/tts_stats_e/train/energy_stats.npz
293
+ required:
294
+ - output_dir
295
+ - token_list
296
+ version: '202402'
297
+ distributed: false
exp/tts_finetune_e_loudnorm_xvector_fastspeech2/images/backward_time.png ADDED
exp/tts_finetune_e_loudnorm_xvector_fastspeech2/images/clip.png ADDED
exp/tts_finetune_e_loudnorm_xvector_fastspeech2/images/decoder_alpha.png ADDED
exp/tts_finetune_e_loudnorm_xvector_fastspeech2/images/duration_loss.png ADDED
exp/tts_finetune_e_loudnorm_xvector_fastspeech2/images/encoder_alpha.png ADDED
exp/tts_finetune_e_loudnorm_xvector_fastspeech2/images/energy_loss.png ADDED
exp/tts_finetune_e_loudnorm_xvector_fastspeech2/images/forward_time.png ADDED
exp/tts_finetune_e_loudnorm_xvector_fastspeech2/images/gpu_max_cached_mem_GB.png ADDED
exp/tts_finetune_e_loudnorm_xvector_fastspeech2/images/grad_norm.png ADDED
exp/tts_finetune_e_loudnorm_xvector_fastspeech2/images/iter_time.png ADDED
exp/tts_finetune_e_loudnorm_xvector_fastspeech2/images/l1_loss.png ADDED
exp/tts_finetune_e_loudnorm_xvector_fastspeech2/images/loss.png ADDED
exp/tts_finetune_e_loudnorm_xvector_fastspeech2/images/loss_scale.png ADDED
exp/tts_finetune_e_loudnorm_xvector_fastspeech2/images/optim0_lr0.png ADDED
exp/tts_finetune_e_loudnorm_xvector_fastspeech2/images/optim_step_time.png ADDED
exp/tts_finetune_e_loudnorm_xvector_fastspeech2/images/pitch_loss.png ADDED
exp/tts_finetune_e_loudnorm_xvector_fastspeech2/images/train_time.png ADDED
exp/tts_finetune_e_loudnorm_xvector_fastspeech2/valid.loss.ave_5best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46a25e6e7093652b40a5f27da44fea982b5de1dfe1f8119d4590cb3260cc6b36
3
+ size 149494448
exp/tts_stats_e/train/energy_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6b85da3e318641f685856de520e847c9b8d0bbdca6ccf96e359fc1a784e66ff
3
+ size 770
exp/tts_stats_e/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ef97f24a1738c9454a9f2aa2a5656d30e1eb78347bcd236d971d7c3d16c3f6f
3
+ size 1402
exp/tts_stats_e/train/pitch_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:502cd856dad51363acad657b79cc6a3f701b56ca7db108656d3b1245c4f108e6
3
+ size 770
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202402'
2
+ files:
3
+ model_file: exp/tts_finetune_e_loudnorm_xvector_fastspeech2/valid.loss.ave_5best.pth
4
+ python: "3.9.18 (main, Sep 11 2023, 13:41:44) \n[GCC 11.2.0]"
5
+ timestamp: 1711125478.837992
6
+ torch: 2.1.0
7
+ yaml_files:
8
+ train_config: exp/tts_finetune_e_loudnorm_xvector_fastspeech2/config.yaml