{
"base_config": "config/ns2.json",
"model_type": "NaturalSpeech2",
"dataset": [
"libritts"
],
"preprocess": {
"use_mel": false,
"use_code": true,
"use_spkid": true,
"use_pitch": true,
"use_duration": true,
"use_phone": true,
"use_len": true,
"use_cross_reference": true,
"train_file": "train.json",
"valid_file": "test.json",
"melspec_dir": "mel",
"code_dir": "code",
"pitch_dir": "pitch",
"duration_dir": "duration",
"metadata_dir": "metadata",
"read_metadata": true,
"clip_mode": "start"
},
"model": {
"latent_dim": 128,
"prior_encoder": {
"vocab_size": 100,
"pitch_min": 50,
"pitch_max": 1100,
"pitch_bins_num": 512,
"encoder": {
"encoder_layer": 6,
"encoder_hidden": 512,
"encoder_head": 8,
"conv_filter_size": 2048,
"conv_kernel_size": 9,
"encoder_dropout": 0.2,
"use_cln": true
},
"duration_predictor": {
"input_size": 512,
"filter_size": 512,
"kernel_size": 3,
"conv_layers": 30,
"cross_attn_per_layer": 3,
"attn_head": 8,
"drop_out": 0.5
},
"pitch_predictor": {
"input_size": 512,
"filter_size": 512,
"kernel_size": 5,
"conv_layers": 30,
"cross_attn_per_layer": 3,
"attn_head": 8,
"drop_out": 0.5
}
},
"diffusion": {
"wavenet": {
"input_size": 128,
"hidden_size": 512,
"out_size": 128,
"num_layers": 40,
"cross_attn_per_layer": 3,
"dilation_cycle": 2,
"attn_head": 8,
"drop_out": 0.2
},
"beta_min": 0.05,
"beta_max": 20,
"sigma": 1.0,
"noise_factor": 1.0,
"ode_solver": "euler",
"diffusion_type": "diffusion"
},
"prompt_encoder": {
"encoder_layer": 6,
"encoder_hidden": 512,
"encoder_head": 8,
"conv_filter_size": 2048,
"conv_kernel_size": 9,
"encoder_dropout": 0.2,
"use_cln": false
},
"query_emb": {
"query_token_num": 32,
"hidden_size": 512,
"head_num": 8
},
"inference_step": 500
},
"train": {
"use_dynamic_batchsize": true,
"max_tokens": 7500,
"max_sentences": 32,
"lr_warmup_steps": 5000,
"lr_scheduler": "cosine",
"num_train_steps": 800000,
"adam": {
"lr": 7.5e-5
},
"diff_ce_loss_lambda": 0.5,
"diff_noise_loss_lambda": 1.0,
"ddp": false,
"random_seed": 114,
"batch_size": 32,
"epochs": 5000,
"max_steps": 1000000,
"total_training_steps": 800000,
"save_summary_steps": 500,
"save_checkpoints_steps": 2000,
"valid_interval": 2000,
"keep_checkpoint_max": 100
}
}