Tacotron2-DDC / tts-config.json
Thorsten-Voice's picture
TTS model trained on Coqui TTS (Tacotron2 DDC).
0bc5779
{
"output_path": "/ssd/___tts/thorsten-ddc/output",
"logger_uri": null,
"run_name": "Thorsten-Dec2021-22k-DDC",
"project_name": null,
"run_description": "\ud83d\udc38Coqui trainer run.",
"print_step": 25,
"plot_step": 100,
"model_param_stats": false,
"wandb_entity": null,
"dashboard_logger": "tensorboard",
"log_model_step": null,
"save_step": 2000,
"save_n_checkpoints": 5,
"save_checkpoints": true,
"save_all_best": true,
"save_best_after": 10000,
"target_loss": null,
"print_eval": true,
"test_delay_epochs": -1,
"run_eval": true,
"run_eval_steps": null,
"distributed_backend": "nccl",
"distributed_url": "tcp://localhost:54321",
"mixed_precision": false,
"epochs": 1000,
"batch_size": 64,
"eval_batch_size": 16,
"grad_clip": 5.0,
"scheduler_after_epoch": false,
"lr": 0.0001,
"optimizer": "RAdam",
"optimizer_params": {
"betas": [
0.9,
0.998
],
"weight_decay": 1e-06
},
"lr_scheduler": "NoamLR",
"lr_scheduler_params": {
"warmup_steps": 4000
},
"use_grad_scaler": false,
"cudnn_enable": true,
"cudnn_deterministic": false,
"cudnn_benchmark": false,
"training_seed": 54321,
"model": "tacotron2",
"num_loader_workers": 4,
"num_eval_loader_workers": 4,
"use_noise_augment": false,
"audio": {
"fft_size": 1024,
"win_length": 1024,
"hop_length": 256,
"frame_shift_ms": null,
"frame_length_ms": null,
"stft_pad_mode": "reflect",
"sample_rate": 22050,
"resample": false,
"preemphasis": 0.0,
"ref_level_db": 20,
"do_sound_norm": false,
"log_func": "np.log",
"do_trim_silence": true,
"trim_db": 60,
"do_rms_norm": false,
"db_level": null,
"power": 1.5,
"griffin_lim_iters": 60,
"num_mels": 80,
"mel_fmin": 50.0,
"mel_fmax": null,
"spec_gain": 1.0,
"do_amp_to_db_linear": true,
"do_amp_to_db_mel": true,
"pitch_fmax": 640.0,
"pitch_fmin": 0.0,
"signal_norm": false,
"min_level_db": -100,
"symmetric_norm": true,
"max_norm": 4.0,
"clip_norm": true,
"stats_path": null
},
"use_phonemes": true,
"phonemizer": "espeak",
"phoneme_language": "de",
"compute_input_seq_cache": false,
"text_cleaner": "basic_german_cleaners",
"enable_eos_bos_chars": false,
"test_sentences_file": "",
"phoneme_cache_path": "/ssd/___tts/thorsten-ddc/output/phoneme_cache",
"characters": {
"characters_class": "TTS.tts.utils.text.characters.IPAPhonemes",
"vocab_dict": null,
"pad": "<PAD>",
"eos": "<EOS>",
"bos": "<BOS>",
"blank": "<BLNK>",
"characters": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u02b2\u025a\u02de\u026b",
"punctuations": "!'(),-.:;? ",
"phonemes": null,
"is_unique": false,
"is_sorted": true
},
"add_blank": false,
"batch_group_size": 0,
"loss_masking": true,
"sort_by_audio_len": false,
"min_audio_len": 1,
"max_audio_len": Infinity,
"min_text_len": 1,
"max_text_len": Infinity,
"compute_f0": false,
"compute_linear_spec": false,
"precompute_num_workers": 8,
"start_by_longest": false,
"datasets": [
{
"name": "ljspeech",
"path": "/ssd/___tts/datasets/Thorsten-Neutral-Dec2021-22k/",
"meta_file_train": "metadata_train.csv",
"ignored_speakers": null,
"language": "",
"meta_file_val": "metadata_test.csv",
"meta_file_attn_mask": ""
}
],
"test_sentences": [
"und \u00fcberzeugen dank feingef\u00fchl f\u00fcr den ganz gro\u00dfen leinwand-stoff.",
"zur schadensh\u00f6he gab es keine angaben.",
"au\u00dferdem k\u00f6nnen glasscheiben, w\u00e4nde und andere hindernisse das ergebnis beeinflussen.",
"ihre lippen m\u00fcssen dennoch nicht zwingend auf farbe verzichten.",
"es dauert lange, eine eigene Stimme zu entwickeln, aber jetzt wo ich sie habe, bin ich nie wieder still.",
"heute scheint die Sonne, Regen ist nicht zu erwarten.",
"die aktuelle Au\u00dfentemperatur betr\u00e4gt zw\u00f6lf Grad Celsius bei einer Luftfeuchtigkeit von achtunddrei\u00dfig Prozent."
],
"eval_split_max_size": null,
"eval_split_size": 0.01,
"use_speaker_weighted_sampler": false,
"speaker_weighted_sampler_alpha": 1.0,
"use_language_weighted_sampler": false,
"language_weighted_sampler_alpha": 1.0,
"use_length_weighted_sampler": false,
"length_weighted_sampler_alpha": 1.0,
"use_gst": false,
"gst": null,
"gst_style_input": null,
"use_capacitron_vae": false,
"capacitron_vae": null,
"num_speakers": 1,
"num_chars": 131,
"r": 6,
"gradual_training": [
[
0,
6,
64
],
[
10000,
4,
32
],
[
50000,
3,
32
],
[
100000,
2,
32
]
],
"memory_size": -1,
"prenet_type": "original",
"prenet_dropout": true,
"prenet_dropout_at_inference": false,
"stopnet": true,
"separate_stopnet": true,
"stopnet_pos_weight": 10.0,
"max_decoder_steps": 500,
"encoder_in_features": 512,
"decoder_in_features": 512,
"decoder_output_dim": 80,
"out_channels": 80,
"attention_type": "original",
"attention_heads": null,
"attention_norm": "sigmoid",
"attention_win": false,
"windowing": false,
"use_forward_attn": false,
"forward_attn_mask": false,
"transition_agent": false,
"location_attn": true,
"bidirectional_decoder": false,
"double_decoder_consistency": true,
"ddc_r": 6,
"speakers_file": null,
"use_speaker_embedding": false,
"speaker_embedding_dim": 512,
"use_d_vector_file": false,
"d_vector_file": false,
"d_vector_dim": null,
"seq_len_norm": false,
"decoder_loss_alpha": 0.25,
"postnet_loss_alpha": 0.25,
"postnet_diff_spec_alpha": 0.25,
"decoder_diff_spec_alpha": 0.25,
"decoder_ssim_alpha": 0.25,
"postnet_ssim_alpha": 0.25,
"ga_alpha": 5.0,
"github_branch": "* main"
}