---
# Audio tokenizer configuration.
#
# NOTE(review): this file arrived with its indentation stripped and a stray
# " |" appended to every line. The nesting below is reconstructed: each run of
# settings is placed under the valueless section header that precedes it, and
# every section is placed under `audio_tokenizer`. Keys such as
# `input_channels` and `vocos_dim` repeat across sections, so a flat layout
# would produce duplicate keys. Confirm the layout against the code that loads
# this file.
audio_tokenizer:
  mel_params:
    sample_rate: 16000
    n_fft: 1024
    win_length: 640
    hop_length: 320
    mel_fmin: 10
    # Explicit null (not an omitted key).
    mel_fmax: null
    num_mels: 128

  encoder:
    input_channels: 1024
    vocos_dim: 384
    vocos_intermediate_dim: 2048
    vocos_num_layers: 12
    out_channels: 1024
    sample_ratios: [1, 1]

  decoder:
    # NOTE(review): singular `input_channel` here, while the other sections
    # use `input_channels`. Kept as written; verify which name the consumer
    # reads before changing it.
    input_channel: 1024
    channels: 1536
    rates: [8, 5, 4, 2]
    kernel_sizes: [16, 11, 8, 4]

  quantizer:
    input_dim: 1024
    codebook_size: 8192
    codebook_dim: 8
    commitment: 0.25
    codebook_loss_weight: 2.0
    # NOTE(review): key is spelled "normlize". Kept byte-identical because
    # renaming it would change the name the consumer looks up.
    use_l2_normlize: true
    threshold_ema_dead_code: 0.2

  speaker_encoder:
    input_dim: 128
    out_dim: 1024
    latent_dim: 128
    token_num: 32
    fsq_levels: [4, 4, 4, 4, 4, 4]
    fsq_num_quantizers: 1

  prenet:
    input_channels: 1024
    vocos_dim: 384
    vocos_intermediate_dim: 2048
    vocos_num_layers: 12
    out_channels: 1024
    condition_dim: 1024
    sample_ratios: [1, 1]
    use_tanh_at_final: false

  postnet:
    input_channels: 1024
    vocos_dim: 384
    vocos_intermediate_dim: 2048
    vocos_num_layers: 6
    out_channels: 1024
    use_tanh_at_final: false