Jiyang Tang committed on
Commit
9fee4c6
·
1 Parent(s): faa597d

Update model

Browse files
README.md ADDED
@@ -0,0 +1,492 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - text-to-speech
6
+ language: en
7
+ datasets:
8
+ - jtang1
9
+ license: cc-by-4.0
10
+ ---
11
+
12
+ ## ESPnet2 TTS model
13
+
14
+ ### `tjysdsg/11692_cyclic_asr_tts_gumbel_softmax_init`
15
+
16
+ This model was trained by Jiyang Tang using the jtang1 recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ ```bash
21
+ cd espnet
22
+ git checkout 255e590f1449bf8c7e2297e6bbda6063ab703caa
23
+ pip install -e .
24
+ cd cis210027p/jtang1/vits_cyclic
25
+ ./run.sh --skip_data_prep false --skip_train true --download_model tjysdsg/11692_cyclic_asr_tts_gumbel_softmax_init
26
+ ```
27
+
28
+
29
+
30
+ ## TTS config
31
+
32
+ <details><summary>expand</summary>
33
+
34
+ ```
35
+ config: conf/tuning/train_vits_unpaired_gumbel.yaml
36
+ print_config: false
37
+ log_level: INFO
38
+ dry_run: false
39
+ iterator_type: sequence
40
+ output_dir: exp/tts_tmp
41
+ ngpu: 1
42
+ seed: 0
43
+ num_workers: 1
44
+ num_att_plot: 3
45
+ dist_backend: nccl
46
+ dist_init_method: env://
47
+ dist_world_size: null
48
+ dist_rank: null
49
+ local_rank: 0
50
+ dist_master_addr: null
51
+ dist_master_port: null
52
+ dist_launcher: null
53
+ multiprocessing_distributed: false
54
+ unused_parameters: true
55
+ sharded_ddp: false
56
+ cudnn_enabled: true
57
+ cudnn_benchmark: false
58
+ cudnn_deterministic: false
59
+ collect_stats: false
60
+ write_collected_feats: false
61
+ max_epoch: 1
62
+ patience: null
63
+ val_scheduler_criterion:
64
+ - valid
65
+ - loss
66
+ early_stopping_criterion:
67
+ - valid
68
+ - loss
69
+ - min
70
+ best_model_criterion:
71
+ - - valid
72
+ - loss
73
+ - min
74
+ - - valid
75
+ - acc_asr
76
+ - max
77
+ - - train
78
+ - loss
79
+ - min
80
+ keep_nbest_models: 5
81
+ nbest_averaging_interval: 0
82
+ grad_clip: 1.0
83
+ grad_clip_type: 2.0
84
+ grad_noise: false
85
+ accum_grad: 1
86
+ no_forward_run: false
87
+ resume: true
88
+ train_dtype: float32
89
+ use_amp: false
90
+ log_interval: 50
91
+ use_matplotlib: true
92
+ use_tensorboard: true
93
+ use_wandb: false
94
+ wandb_project: null
95
+ wandb_id: null
96
+ wandb_entity: null
97
+ wandb_name: null
98
+ wandb_model_log_interval: -1
99
+ detect_anomaly: false
100
+ pretrain_path: null
101
+ init_param:
102
+ - /ocean/projects/cis210027p/jtang1/espnet/egs2/librispeech_100/asr1/exp/asr_conformer_lr2e-3_warmup15k_amp_nondeterministic/valid.acc.ave.pth:encoder:asr_encoder
103
+ - /ocean/projects/cis210027p/jtang1/espnet/egs2/librispeech_100/asr1/exp/asr_conformer_lr2e-3_warmup15k_amp_nondeterministic/valid.acc.ave.pth:decoder:asr_decoder
104
+ - /ocean/projects/cis210027p/jtang1/espnet/egs2/librispeech_100/asr1/exp/asr_conformer_lr2e-3_warmup15k_amp_nondeterministic/valid.acc.ave.pth:ctc:ctc
105
+ - /ocean/projects/cis210027p/jtang1/espnet/egs2/librispeech_100/asr1/exp/asr_conformer_lr2e-3_warmup15k_amp_nondeterministic/valid.acc.ave.pth:frontend:frontend
106
+ - /ocean/projects/cis210027p/jtang1/espnet/egs2/librispeech_100/asr1/exp/asr_conformer_lr2e-3_warmup15k_amp_nondeterministic/valid.acc.ave.pth:normalize:normalize
107
+ - /ocean/projects/cis210027p/zzhou5/espnet/egs2/librispeech_100/tts_vits/exp/1024_mel_vits_char/tts_mel_1024_char_lib100_vits_tts_16k_xvector/train.total_count.best.pth:tts.generator.text_encoder:tts.generator.text_encoder
108
+ - /ocean/projects/cis210027p/zzhou5/espnet/egs2/librispeech_100/tts_vits/exp/1024_mel_vits_char/tts_mel_1024_char_lib100_vits_tts_16k_xvector/train.total_count.best.pth:tts.generator.decoder:tts.generator.decoder
109
+ - /ocean/projects/cis210027p/zzhou5/espnet/egs2/librispeech_100/tts_vits/exp/1024_mel_vits_char/tts_mel_1024_char_lib100_vits_tts_16k_xvector/train.total_count.best.pth:tts.generator.posterior_encoder:tts.generator.posterior_encoder
110
+ - /ocean/projects/cis210027p/zzhou5/espnet/egs2/librispeech_100/tts_vits/exp/1024_mel_vits_char/tts_mel_1024_char_lib100_vits_tts_16k_xvector/train.total_count.best.pth:tts.generator.flow:tts.generator.flow
111
+ - /ocean/projects/cis210027p/zzhou5/espnet/egs2/librispeech_100/tts_vits/exp/1024_mel_vits_char/tts_mel_1024_char_lib100_vits_tts_16k_xvector/train.total_count.best.pth:tts.generator.duration_predictor:tts.generator.duration_predictor
112
+ - /ocean/projects/cis210027p/zzhou5/espnet/egs2/librispeech_100/tts_vits/exp/1024_mel_vits_char/tts_mel_1024_char_lib100_vits_tts_16k_xvector/train.total_count.best.pth:tts.discriminator.msd:tts.discriminator.msd
113
+ - /ocean/projects/cis210027p/zzhou5/espnet/egs2/librispeech_100/tts_vits/exp/1024_mel_vits_char/tts_mel_1024_char_lib100_vits_tts_16k_xvector/train.total_count.best.pth:tts.discriminator.mpd:tts.discriminator.mpd
114
+ - /ocean/projects/cis210027p/zzhou5/espnet/egs2/librispeech_100/tts_vits/exp/1024_mel_vits_char/tts_mel_1024_char_lib100_vits_tts_16k_xvector/train.total_count.best.pth:tts.mel_loss.wav_to_mel:tts.mel_loss.wav_to_mel
115
+ ignore_init_mismatch: false
116
+ freeze_param:
117
+ - tts
118
+ num_iters_per_epoch: 1
119
+ batch_size: 20
120
+ valid_batch_size: null
121
+ batch_bins: 1000000
122
+ valid_batch_bins: null
123
+ train_shape_file:
124
+ - exp/tts_stats_raw_linear_spectrogram_char/train/text_shape.char
125
+ - exp/tts_stats_raw_linear_spectrogram_char/train/speech_shape
126
+ - exp/tts_stats_raw_linear_spectrogram_char/train/sudo_text_shape.char
127
+ valid_shape_file:
128
+ - exp/tts_stats_raw_linear_spectrogram_char/valid/text_shape.char
129
+ - exp/tts_stats_raw_linear_spectrogram_char/valid/speech_shape
130
+ batch_type: numel
131
+ valid_batch_type: null
132
+ fold_length:
133
+ - 150
134
+ - 204800
135
+ sort_in_batch: descending
136
+ sort_batch: descending
137
+ multiple_iterator: false
138
+ chunk_length: 500
139
+ chunk_shift_ratio: 0.5
140
+ num_cache_chunks: 1024
141
+ train_data_path_and_name_and_type:
142
+ - - dump/raw/train_clean_360/text
143
+ - text
144
+ - text
145
+ - - dump/raw/train_clean_360/wav.scp
146
+ - speech
147
+ - sound
148
+ - - exp/tts_stats_raw_linear_spectrogram_char/train/sudo_text
149
+ - sudo_text
150
+ - text
151
+ - - dump/xvector/train_clean_360/xvector.scp
152
+ - spembs
153
+ - kaldi_ark
154
+ valid_data_path_and_name_and_type:
155
+ - - dump/raw/dev_clean/text
156
+ - text
157
+ - text
158
+ - - dump/raw/dev_clean/wav.scp
159
+ - speech
160
+ - sound
161
+ - - dump/xvector/dev_clean/xvector.scp
162
+ - spembs
163
+ - kaldi_ark
164
+ allow_variable_data_keys: false
165
+ max_cache_size: 0.0
166
+ max_cache_fd: 32
167
+ valid_max_cache_size: null
168
+ optim: adam
169
+ optim_conf:
170
+ lr: 0.0001
171
+ scheduler: warmuplr
172
+ scheduler_conf:
173
+ warmup_steps: 30000
174
+ optim2: adam
175
+ optim2_conf:
176
+ lr: 0.0001
177
+ scheduler2: warmuplr
178
+ scheduler2_conf:
179
+ warmup_steps: 30000
180
+ generator_first: false
181
+ no_discriminator_backprop: true
182
+ token_list:
183
+ - <blank>
184
+ - <unk>
185
+ - <space>
186
+ - E
187
+ - T
188
+ - A
189
+ - O
190
+ - N
191
+ - I
192
+ - H
193
+ - S
194
+ - R
195
+ - D
196
+ - L
197
+ - U
198
+ - M
199
+ - C
200
+ - W
201
+ - F
202
+ - G
203
+ - Y
204
+ - P
205
+ - B
206
+ - V
207
+ - K
208
+ - ''''
209
+ - X
210
+ - J
211
+ - Q
212
+ - Z
213
+ - <sos/eos>
214
+ odim: null
215
+ model_conf:
216
+ mtlalpha: 1.0
217
+ mt_weight: 0.0
218
+ asr_weight: 0.5
219
+ lsm_weight: 0.1
220
+ length_normalized_loss: true
221
+ use_unpaired: true
222
+ asr_normalize: true
223
+ gumbel_softmax: true
224
+ use_preprocessor: true
225
+ token_type: char
226
+ bpemodel: null
227
+ non_linguistic_symbols: null
228
+ cleaner: null
229
+ g2p: null
230
+ use_multidecoder: true
231
+ speech_attn: false
232
+ ctc_conf:
233
+ dropout_rate: 0.0
234
+ ctc_type: builtin
235
+ reduce: true
236
+ ignore_nan_grad: true
237
+ zero_infinity: true
238
+ feats_extract: linear_spectrogram
239
+ feats_extract_conf:
240
+ n_fft: 1024
241
+ hop_length: 256
242
+ win_length: null
243
+ normalize: global_mvn
244
+ normalize_conf:
245
+ stats_file: exp/tts_stats_raw_linear_spectrogram_char/train/feats_stats.npz
246
+ tts: vits
247
+ tts_conf:
248
+ generator_type: vits_generator
249
+ generator_params:
250
+ hidden_channels: 256
251
+ spks: -1
252
+ spk_embed_dim: 512
253
+ global_channels: 256
254
+ segment_size: 32
255
+ text_encoder_attention_heads: 2
256
+ text_encoder_ffn_expand: 4
257
+ text_encoder_blocks: 6
258
+ text_encoder_positionwise_layer_type: conv1d
259
+ text_encoder_positionwise_conv_kernel_size: 3
260
+ text_encoder_positional_encoding_layer_type: rel_pos
261
+ text_encoder_self_attention_layer_type: rel_selfattn
262
+ text_encoder_activation_type: swish
263
+ text_encoder_normalize_before: true
264
+ text_encoder_dropout_rate: 0.1
265
+ text_encoder_positional_dropout_rate: 0.0
266
+ text_encoder_attention_dropout_rate: 0.1
267
+ use_macaron_style_in_text_encoder: true
268
+ use_conformer_conv_in_text_encoder: false
269
+ text_encoder_conformer_kernel_size: -1
270
+ decoder_kernel_size: 7
271
+ decoder_channels: 512
272
+ decoder_upsample_scales:
273
+ - 8
274
+ - 8
275
+ - 2
276
+ - 2
277
+ decoder_upsample_kernel_sizes:
278
+ - 16
279
+ - 16
280
+ - 4
281
+ - 4
282
+ decoder_resblock_kernel_sizes:
283
+ - 3
284
+ - 7
285
+ - 11
286
+ decoder_resblock_dilations:
287
+ - - 1
288
+ - 3
289
+ - 5
290
+ - - 1
291
+ - 3
292
+ - 5
293
+ - - 1
294
+ - 3
295
+ - 5
296
+ use_weight_norm_in_decoder: true
297
+ posterior_encoder_kernel_size: 5
298
+ posterior_encoder_layers: 16
299
+ posterior_encoder_stacks: 1
300
+ posterior_encoder_base_dilation: 1
301
+ posterior_encoder_dropout_rate: 0.0
302
+ use_weight_norm_in_posterior_encoder: true
303
+ flow_flows: 4
304
+ flow_kernel_size: 5
305
+ flow_base_dilation: 1
306
+ flow_layers: 4
307
+ flow_dropout_rate: 0.0
308
+ use_weight_norm_in_flow: true
309
+ use_only_mean_in_flow: true
310
+ stochastic_duration_predictor_kernel_size: 3
311
+ stochastic_duration_predictor_dropout_rate: 0.5
312
+ stochastic_duration_predictor_flows: 4
313
+ stochastic_duration_predictor_dds_conv_layers: 3
314
+ vocabs: 31
315
+ aux_channels: 513
316
+ discriminator_type: hifigan_multi_scale_multi_period_discriminator
317
+ discriminator_params:
318
+ scales: 1
319
+ scale_downsample_pooling: AvgPool1d
320
+ scale_downsample_pooling_params:
321
+ kernel_size: 4
322
+ stride: 2
323
+ padding: 2
324
+ scale_discriminator_params:
325
+ in_channels: 1
326
+ out_channels: 1
327
+ kernel_sizes:
328
+ - 15
329
+ - 41
330
+ - 5
331
+ - 3
332
+ channels: 128
333
+ max_downsample_channels: 1024
334
+ max_groups: 16
335
+ bias: true
336
+ downsample_scales:
337
+ - 2
338
+ - 2
339
+ - 4
340
+ - 4
341
+ - 1
342
+ nonlinear_activation: LeakyReLU
343
+ nonlinear_activation_params:
344
+ negative_slope: 0.1
345
+ use_weight_norm: true
346
+ use_spectral_norm: false
347
+ follow_official_norm: false
348
+ periods:
349
+ - 2
350
+ - 3
351
+ - 5
352
+ - 7
353
+ - 11
354
+ period_discriminator_params:
355
+ in_channels: 1
356
+ out_channels: 1
357
+ kernel_sizes:
358
+ - 5
359
+ - 3
360
+ channels: 32
361
+ downsample_scales:
362
+ - 3
363
+ - 3
364
+ - 3
365
+ - 3
366
+ - 1
367
+ max_downsample_channels: 1024
368
+ bias: true
369
+ nonlinear_activation: LeakyReLU
370
+ nonlinear_activation_params:
371
+ negative_slope: 0.1
372
+ use_weight_norm: true
373
+ use_spectral_norm: false
374
+ generator_adv_loss_params:
375
+ average_by_discriminators: false
376
+ loss_type: mse
377
+ discriminator_adv_loss_params:
378
+ average_by_discriminators: false
379
+ loss_type: mse
380
+ feat_match_loss_params:
381
+ average_by_discriminators: false
382
+ average_by_layers: false
383
+ include_final_outputs: true
384
+ mel_loss_params:
385
+ fs: 16000
386
+ n_fft: 1024
387
+ hop_length: 256
388
+ win_length: null
389
+ window: hann
390
+ n_mels: 80
391
+ fmin: 0
392
+ fmax: null
393
+ log_base: null
394
+ lambda_adv: 1.0
395
+ lambda_mel: 45.0
396
+ lambda_feat_match: 2.0
397
+ lambda_dur: 1.0
398
+ lambda_kl: 1.0
399
+ sampling_rate: 16000
400
+ cache_generator_outputs: true
401
+ use_md: true
402
+ skip_text_encoder: false
403
+ gumbel_softmax_input: true
404
+ pitch_extract: null
405
+ pitch_extract_conf: {}
406
+ pitch_normalize: null
407
+ pitch_normalize_conf: {}
408
+ energy_extract: null
409
+ energy_extract_conf: {}
410
+ energy_normalize: null
411
+ energy_normalize_conf: {}
412
+ asr_decoder: transformer
413
+ asr_decoder_conf:
414
+ attention_heads: 4
415
+ linear_units: 2048
416
+ num_blocks: 6
417
+ dropout_rate: 0.1
418
+ positional_dropout_rate: 0.1
419
+ self_attention_dropout_rate: 0.1
420
+ src_attention_dropout_rate: 0.1
421
+ asr_encoder: conformer
422
+ asr_encoder_conf:
423
+ output_size: 256
424
+ attention_heads: 4
425
+ linear_units: 1024
426
+ num_blocks: 12
427
+ dropout_rate: 0.1
428
+ positional_dropout_rate: 0.1
429
+ attention_dropout_rate: 0.1
430
+ input_layer: conv2d
431
+ normalize_before: true
432
+ macaron_style: true
433
+ rel_pos_type: latest
434
+ pos_enc_layer_type: rel_pos
435
+ selfattention_layer_type: rel_selfattn
436
+ activation_type: swish
437
+ use_cnn_module: true
438
+ cnn_module_kernel: 31
439
+ frontend: default
440
+ frontend_conf:
441
+ n_fft: 512
442
+ win_length: 400
443
+ hop_length: 160
444
+ required:
445
+ - output_dir
446
+ - token_list
447
+ version: '202205'
448
+ distributed: false
449
+ ```
450
+
451
+ </details>
452
+
453
+
454
+
455
+ ### Citing ESPnet
456
+
457
+ ```bibtex
458
+ @inproceedings{watanabe2018espnet,
459
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
460
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
461
+ year={2018},
462
+ booktitle={Proceedings of Interspeech},
463
+ pages={2207--2211},
464
+ doi={10.21437/Interspeech.2018-1456},
465
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
466
+ }
467
+
468
+
469
+
470
+
471
+ @inproceedings{hayashi2020espnet,
472
+ title={{ESPnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
473
+ author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
474
+ booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
475
+ pages={7654--7658},
476
+ year={2020},
477
+ organization={IEEE}
478
+ }
479
+ ```
480
+
481
+ or arXiv:
482
+
483
+ ```bibtex
484
+ @misc{watanabe2018espnet,
485
+ title={ESPnet: End-to-End Speech Processing Toolkit},
486
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
487
+ year={2018},
488
+ eprint={1804.00015},
489
+ archivePrefix={arXiv},
490
+ primaryClass={cs.CL}
491
+ }
492
+ ```
exp/tts_stats_raw_linear_spectrogram_char/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:847a61e267ad6b228d410f8e750bd7e5b120e836b11c163f499b3c9df5beac1f
3
+ size 1402
exp/tts_tmp/1epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d25741d595a305449fe854a4d46da2de393f7b2e076760a35290ace7e1193f47
3
+ size 599448930
exp/tts_tmp/config.yaml ADDED
@@ -0,0 +1,414 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_vits_unpaired_gumbel.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_tmp
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: 0
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ unused_parameters: true
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: false
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 1
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - loss
39
+ - min
40
+ - - valid
41
+ - acc_asr
42
+ - max
43
+ - - train
44
+ - loss
45
+ - min
46
+ keep_nbest_models: 5
47
+ nbest_averaging_interval: 0
48
+ grad_clip: 1.0
49
+ grad_clip_type: 2.0
50
+ grad_noise: false
51
+ accum_grad: 1
52
+ no_forward_run: false
53
+ resume: true
54
+ train_dtype: float32
55
+ use_amp: false
56
+ log_interval: 50
57
+ use_matplotlib: true
58
+ use_tensorboard: true
59
+ use_wandb: false
60
+ wandb_project: null
61
+ wandb_id: null
62
+ wandb_entity: null
63
+ wandb_name: null
64
+ wandb_model_log_interval: -1
65
+ detect_anomaly: false
66
+ pretrain_path: null
67
+ init_param:
68
+ - /ocean/projects/cis210027p/jtang1/espnet/egs2/librispeech_100/asr1/exp/asr_conformer_lr2e-3_warmup15k_amp_nondeterministic/valid.acc.ave.pth:encoder:asr_encoder
69
+ - /ocean/projects/cis210027p/jtang1/espnet/egs2/librispeech_100/asr1/exp/asr_conformer_lr2e-3_warmup15k_amp_nondeterministic/valid.acc.ave.pth:decoder:asr_decoder
70
+ - /ocean/projects/cis210027p/jtang1/espnet/egs2/librispeech_100/asr1/exp/asr_conformer_lr2e-3_warmup15k_amp_nondeterministic/valid.acc.ave.pth:ctc:ctc
71
+ - /ocean/projects/cis210027p/jtang1/espnet/egs2/librispeech_100/asr1/exp/asr_conformer_lr2e-3_warmup15k_amp_nondeterministic/valid.acc.ave.pth:frontend:frontend
72
+ - /ocean/projects/cis210027p/jtang1/espnet/egs2/librispeech_100/asr1/exp/asr_conformer_lr2e-3_warmup15k_amp_nondeterministic/valid.acc.ave.pth:normalize:normalize
73
+ - /ocean/projects/cis210027p/zzhou5/espnet/egs2/librispeech_100/tts_vits/exp/1024_mel_vits_char/tts_mel_1024_char_lib100_vits_tts_16k_xvector/train.total_count.best.pth:tts.generator.text_encoder:tts.generator.text_encoder
74
+ - /ocean/projects/cis210027p/zzhou5/espnet/egs2/librispeech_100/tts_vits/exp/1024_mel_vits_char/tts_mel_1024_char_lib100_vits_tts_16k_xvector/train.total_count.best.pth:tts.generator.decoder:tts.generator.decoder
75
+ - /ocean/projects/cis210027p/zzhou5/espnet/egs2/librispeech_100/tts_vits/exp/1024_mel_vits_char/tts_mel_1024_char_lib100_vits_tts_16k_xvector/train.total_count.best.pth:tts.generator.posterior_encoder:tts.generator.posterior_encoder
76
+ - /ocean/projects/cis210027p/zzhou5/espnet/egs2/librispeech_100/tts_vits/exp/1024_mel_vits_char/tts_mel_1024_char_lib100_vits_tts_16k_xvector/train.total_count.best.pth:tts.generator.flow:tts.generator.flow
77
+ - /ocean/projects/cis210027p/zzhou5/espnet/egs2/librispeech_100/tts_vits/exp/1024_mel_vits_char/tts_mel_1024_char_lib100_vits_tts_16k_xvector/train.total_count.best.pth:tts.generator.duration_predictor:tts.generator.duration_predictor
78
+ - /ocean/projects/cis210027p/zzhou5/espnet/egs2/librispeech_100/tts_vits/exp/1024_mel_vits_char/tts_mel_1024_char_lib100_vits_tts_16k_xvector/train.total_count.best.pth:tts.discriminator.msd:tts.discriminator.msd
79
+ - /ocean/projects/cis210027p/zzhou5/espnet/egs2/librispeech_100/tts_vits/exp/1024_mel_vits_char/tts_mel_1024_char_lib100_vits_tts_16k_xvector/train.total_count.best.pth:tts.discriminator.mpd:tts.discriminator.mpd
80
+ - /ocean/projects/cis210027p/zzhou5/espnet/egs2/librispeech_100/tts_vits/exp/1024_mel_vits_char/tts_mel_1024_char_lib100_vits_tts_16k_xvector/train.total_count.best.pth:tts.mel_loss.wav_to_mel:tts.mel_loss.wav_to_mel
81
+ ignore_init_mismatch: false
82
+ freeze_param:
83
+ - tts
84
+ num_iters_per_epoch: 1
85
+ batch_size: 20
86
+ valid_batch_size: null
87
+ batch_bins: 1000000
88
+ valid_batch_bins: null
89
+ train_shape_file:
90
+ - exp/tts_stats_raw_linear_spectrogram_char/train/text_shape.char
91
+ - exp/tts_stats_raw_linear_spectrogram_char/train/speech_shape
92
+ - exp/tts_stats_raw_linear_spectrogram_char/train/sudo_text_shape.char
93
+ valid_shape_file:
94
+ - exp/tts_stats_raw_linear_spectrogram_char/valid/text_shape.char
95
+ - exp/tts_stats_raw_linear_spectrogram_char/valid/speech_shape
96
+ batch_type: numel
97
+ valid_batch_type: null
98
+ fold_length:
99
+ - 150
100
+ - 204800
101
+ sort_in_batch: descending
102
+ sort_batch: descending
103
+ multiple_iterator: false
104
+ chunk_length: 500
105
+ chunk_shift_ratio: 0.5
106
+ num_cache_chunks: 1024
107
+ train_data_path_and_name_and_type:
108
+ - - dump/raw/train_clean_360/text
109
+ - text
110
+ - text
111
+ - - dump/raw/train_clean_360/wav.scp
112
+ - speech
113
+ - sound
114
+ - - exp/tts_stats_raw_linear_spectrogram_char/train/sudo_text
115
+ - sudo_text
116
+ - text
117
+ - - dump/xvector/train_clean_360/xvector.scp
118
+ - spembs
119
+ - kaldi_ark
120
+ valid_data_path_and_name_and_type:
121
+ - - dump/raw/dev_clean/text
122
+ - text
123
+ - text
124
+ - - dump/raw/dev_clean/wav.scp
125
+ - speech
126
+ - sound
127
+ - - dump/xvector/dev_clean/xvector.scp
128
+ - spembs
129
+ - kaldi_ark
130
+ allow_variable_data_keys: false
131
+ max_cache_size: 0.0
132
+ max_cache_fd: 32
133
+ valid_max_cache_size: null
134
+ optim: adam
135
+ optim_conf:
136
+ lr: 0.0001
137
+ scheduler: warmuplr
138
+ scheduler_conf:
139
+ warmup_steps: 30000
140
+ optim2: adam
141
+ optim2_conf:
142
+ lr: 0.0001
143
+ scheduler2: warmuplr
144
+ scheduler2_conf:
145
+ warmup_steps: 30000
146
+ generator_first: false
147
+ no_discriminator_backprop: true
148
+ token_list:
149
+ - <blank>
150
+ - <unk>
151
+ - <space>
152
+ - E
153
+ - T
154
+ - A
155
+ - O
156
+ - N
157
+ - I
158
+ - H
159
+ - S
160
+ - R
161
+ - D
162
+ - L
163
+ - U
164
+ - M
165
+ - C
166
+ - W
167
+ - F
168
+ - G
169
+ - Y
170
+ - P
171
+ - B
172
+ - V
173
+ - K
174
+ - ''''
175
+ - X
176
+ - J
177
+ - Q
178
+ - Z
179
+ - <sos/eos>
180
+ odim: null
181
+ model_conf:
182
+ mtlalpha: 1.0
183
+ mt_weight: 0.0
184
+ asr_weight: 0.5
185
+ lsm_weight: 0.1
186
+ length_normalized_loss: true
187
+ use_unpaired: true
188
+ asr_normalize: true
189
+ gumbel_softmax: true
190
+ use_preprocessor: true
191
+ token_type: char
192
+ bpemodel: null
193
+ non_linguistic_symbols: null
194
+ cleaner: null
195
+ g2p: null
196
+ use_multidecoder: true
197
+ speech_attn: false
198
+ ctc_conf:
199
+ dropout_rate: 0.0
200
+ ctc_type: builtin
201
+ reduce: true
202
+ ignore_nan_grad: true
203
+ zero_infinity: true
204
+ feats_extract: linear_spectrogram
205
+ feats_extract_conf:
206
+ n_fft: 1024
207
+ hop_length: 256
208
+ win_length: null
209
+ normalize: global_mvn
210
+ normalize_conf:
211
+ stats_file: exp/tts_stats_raw_linear_spectrogram_char/train/feats_stats.npz
212
+ tts: vits
213
+ tts_conf:
214
+ generator_type: vits_generator
215
+ generator_params:
216
+ hidden_channels: 256
217
+ spks: -1
218
+ spk_embed_dim: 512
219
+ global_channels: 256
220
+ segment_size: 32
221
+ text_encoder_attention_heads: 2
222
+ text_encoder_ffn_expand: 4
223
+ text_encoder_blocks: 6
224
+ text_encoder_positionwise_layer_type: conv1d
225
+ text_encoder_positionwise_conv_kernel_size: 3
226
+ text_encoder_positional_encoding_layer_type: rel_pos
227
+ text_encoder_self_attention_layer_type: rel_selfattn
228
+ text_encoder_activation_type: swish
229
+ text_encoder_normalize_before: true
230
+ text_encoder_dropout_rate: 0.1
231
+ text_encoder_positional_dropout_rate: 0.0
232
+ text_encoder_attention_dropout_rate: 0.1
233
+ use_macaron_style_in_text_encoder: true
234
+ use_conformer_conv_in_text_encoder: false
235
+ text_encoder_conformer_kernel_size: -1
236
+ decoder_kernel_size: 7
237
+ decoder_channels: 512
238
+ decoder_upsample_scales:
239
+ - 8
240
+ - 8
241
+ - 2
242
+ - 2
243
+ decoder_upsample_kernel_sizes:
244
+ - 16
245
+ - 16
246
+ - 4
247
+ - 4
248
+ decoder_resblock_kernel_sizes:
249
+ - 3
250
+ - 7
251
+ - 11
252
+ decoder_resblock_dilations:
253
+ - - 1
254
+ - 3
255
+ - 5
256
+ - - 1
257
+ - 3
258
+ - 5
259
+ - - 1
260
+ - 3
261
+ - 5
262
+ use_weight_norm_in_decoder: true
263
+ posterior_encoder_kernel_size: 5
264
+ posterior_encoder_layers: 16
265
+ posterior_encoder_stacks: 1
266
+ posterior_encoder_base_dilation: 1
267
+ posterior_encoder_dropout_rate: 0.0
268
+ use_weight_norm_in_posterior_encoder: true
269
+ flow_flows: 4
270
+ flow_kernel_size: 5
271
+ flow_base_dilation: 1
272
+ flow_layers: 4
273
+ flow_dropout_rate: 0.0
274
+ use_weight_norm_in_flow: true
275
+ use_only_mean_in_flow: true
276
+ stochastic_duration_predictor_kernel_size: 3
277
+ stochastic_duration_predictor_dropout_rate: 0.5
278
+ stochastic_duration_predictor_flows: 4
279
+ stochastic_duration_predictor_dds_conv_layers: 3
280
+ vocabs: 31
281
+ aux_channels: 513
282
+ discriminator_type: hifigan_multi_scale_multi_period_discriminator
283
+ discriminator_params:
284
+ scales: 1
285
+ scale_downsample_pooling: AvgPool1d
286
+ scale_downsample_pooling_params:
287
+ kernel_size: 4
288
+ stride: 2
289
+ padding: 2
290
+ scale_discriminator_params:
291
+ in_channels: 1
292
+ out_channels: 1
293
+ kernel_sizes:
294
+ - 15
295
+ - 41
296
+ - 5
297
+ - 3
298
+ channels: 128
299
+ max_downsample_channels: 1024
300
+ max_groups: 16
301
+ bias: true
302
+ downsample_scales:
303
+ - 2
304
+ - 2
305
+ - 4
306
+ - 4
307
+ - 1
308
+ nonlinear_activation: LeakyReLU
309
+ nonlinear_activation_params:
310
+ negative_slope: 0.1
311
+ use_weight_norm: true
312
+ use_spectral_norm: false
313
+ follow_official_norm: false
314
+ periods:
315
+ - 2
316
+ - 3
317
+ - 5
318
+ - 7
319
+ - 11
320
+ period_discriminator_params:
321
+ in_channels: 1
322
+ out_channels: 1
323
+ kernel_sizes:
324
+ - 5
325
+ - 3
326
+ channels: 32
327
+ downsample_scales:
328
+ - 3
329
+ - 3
330
+ - 3
331
+ - 3
332
+ - 1
333
+ max_downsample_channels: 1024
334
+ bias: true
335
+ nonlinear_activation: LeakyReLU
336
+ nonlinear_activation_params:
337
+ negative_slope: 0.1
338
+ use_weight_norm: true
339
+ use_spectral_norm: false
340
+ generator_adv_loss_params:
341
+ average_by_discriminators: false
342
+ loss_type: mse
343
+ discriminator_adv_loss_params:
344
+ average_by_discriminators: false
345
+ loss_type: mse
346
+ feat_match_loss_params:
347
+ average_by_discriminators: false
348
+ average_by_layers: false
349
+ include_final_outputs: true
350
+ mel_loss_params:
351
+ fs: 16000
352
+ n_fft: 1024
353
+ hop_length: 256
354
+ win_length: null
355
+ window: hann
356
+ n_mels: 80
357
+ fmin: 0
358
+ fmax: null
359
+ log_base: null
360
+ lambda_adv: 1.0
361
+ lambda_mel: 45.0
362
+ lambda_feat_match: 2.0
363
+ lambda_dur: 1.0
364
+ lambda_kl: 1.0
365
+ sampling_rate: 16000
366
+ cache_generator_outputs: true
367
+ use_md: true
368
+ skip_text_encoder: false
369
+ gumbel_softmax_input: true
370
+ pitch_extract: null
371
+ pitch_extract_conf: {}
372
+ pitch_normalize: null
373
+ pitch_normalize_conf: {}
374
+ energy_extract: null
375
+ energy_extract_conf: {}
376
+ energy_normalize: null
377
+ energy_normalize_conf: {}
378
+ asr_decoder: transformer
379
+ asr_decoder_conf:
380
+ attention_heads: 4
381
+ linear_units: 2048
382
+ num_blocks: 6
383
+ dropout_rate: 0.1
384
+ positional_dropout_rate: 0.1
385
+ self_attention_dropout_rate: 0.1
386
+ src_attention_dropout_rate: 0.1
387
+ asr_encoder: conformer
388
+ asr_encoder_conf:
389
+ output_size: 256
390
+ attention_heads: 4
391
+ linear_units: 1024
392
+ num_blocks: 12
393
+ dropout_rate: 0.1
394
+ positional_dropout_rate: 0.1
395
+ attention_dropout_rate: 0.1
396
+ input_layer: conv2d
397
+ normalize_before: true
398
+ macaron_style: true
399
+ rel_pos_type: latest
400
+ pos_enc_layer_type: rel_pos
401
+ selfattention_layer_type: rel_selfattn
402
+ activation_type: swish
403
+ use_cnn_module: true
404
+ cnn_module_kernel: 31
405
+ frontend: default
406
+ frontend_conf:
407
+ n_fft: 512
408
+ win_length: 400
409
+ hop_length: 160
410
+ required:
411
+ - output_dir
412
+ - token_list
413
+ version: '202205'
414
+ distributed: false
exp/tts_tmp/images/acc_asr.png ADDED
exp/tts_tmp/images/cer.png ADDED
exp/tts_tmp/images/cer_ctc.png ADDED
exp/tts_tmp/images/generator_backward_time.png ADDED
exp/tts_tmp/images/generator_forward_time.png ADDED
exp/tts_tmp/images/generator_optim_step_time.png ADDED
exp/tts_tmp/images/generator_train_time.png ADDED
exp/tts_tmp/images/gpu_max_cached_mem_GB.png ADDED
exp/tts_tmp/images/iter_time.png ADDED
exp/tts_tmp/images/loss.png ADDED
exp/tts_tmp/images/loss_asr.png ADDED
exp/tts_tmp/images/optim0_lr0.png ADDED
exp/tts_tmp/images/text_embed_loss.png ADDED
exp/tts_tmp/images/train_time.png ADDED
exp/tts_tmp/images/tts_generator_mel_loss.png ADDED
exp/tts_tmp/images/wer.png ADDED
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202205'
2
+ files:
3
+ model_file: exp/tts_tmp/1epoch.pth
4
+ python: "3.9.16 (main, Jan 11 2023, 16:05:54) \n[GCC 11.2.0]"
5
+ timestamp: 1682307730.510395
6
+ torch: 1.11.0
7
+ yaml_files:
8
+ train_config: exp/tts_tmp/config.yaml