log_dir: "" save_freq: 1 log_interval: 10 save_interval: 1000 device: "cuda" epochs: 1000 # number of epochs for first stage training (pre-training) batch_size: 2 batch_length: 100 # maximum duration of audio in a batch (in seconds) max_len: 80 # maximum number of frames pretrained_model: "" pretrained_encoder: "" load_only_params: False # set to true if do not want to load epoch numbers and optimizer parameters preprocess_params: sr: 22050 spect_params: n_fft: 1024 win_length: 1024 hop_length: 256 n_mels: 80 model_params: dit_type: "DiT" # uDiT or DiT reg_loss_type: "l1" # l1 or l2 speech_tokenizer: type: 'facodec' path: "speech_tokenizer_v1.onnx" cosyvoice: path: "../CosyVoice/pretrained_models/CosyVoice-300M" style_encoder: dim: 192 campplus_path: "campplus_cn_common.bin" DAC: encoder_dim: 64 encoder_rates: [2, 5, 5, 6] decoder_dim: 1536 decoder_rates: [ 6, 5, 5, 2 ] sr: 24000 length_regulator: channels: 512 is_discrete: true content_codebook_size: 1024 in_frame_rate: 80 out_frame_rate: 80 sampling_ratios: [1, 1, 1, 1] token_dropout_prob: 0.3 # probability of performing token dropout token_dropout_range: 1.0 # maximum percentage of tokens to drop out n_codebooks: 3 quantizer_dropout: 0.5 f0_condition: true n_f0_bins: 512 DiT: hidden_dim: 512 num_heads: 8 depth: 13 class_dropout_prob: 0.1 block_size: 8192 in_channels: 80 style_condition: true final_layer_type: 'wavenet' target: 'mel' # mel or codec content_dim: 512 content_codebook_size: 1024 content_type: 'discrete' f0_condition: true n_f0_bins: 512 content_codebooks: 1 is_causal: false long_skip_connection: true zero_prompt_speech_token: false # for prompt component, do not input corresponding speech token time_as_token: false style_as_token: false uvit_skip_connection: true add_resblock_in_transformer: false wavenet: hidden_dim: 512 num_layers: 8 kernel_size: 5 dilation_rate: 1 p_dropout: 0.2 style_condition: true loss_params: base_lr: 0.0001 lambda_mel: 45 lambda_kl: 1.0