# Hyperparameters for the audio tokenizer. Each child mapping below configures
# one sub-component (mel front-end, encoder, decoder, quantizer, speaker
# encoder, prenet, postnet).
audio_tokenizer:
  # Mel-spectrogram extraction settings.
  # NOTE(review): units are not stated in this file; sample_rate is presumably
  # in Hz and the window/hop sizes in samples - confirm against the consumer.
  mel_params:
    sample_rate: 16000
    # n_fft (1024) is larger than win_length (640), and win_length is exactly
    # 2 x hop_length (640 = 2 * 320).
    n_fft: 1024
    win_length: 640
    # If sample_rate is Hz and hop_length is samples, this gives
    # 16000 / 320 = 50 frames per second. The same value, 320, is also the
    # product of decoder.rates (8 * 5 * 4 * 2).
    hop_length: 320
    mel_fmin: 10
    # null means no explicit upper frequency is set here; the consumer
    # presumably applies its own default - TODO confirm which.
    mel_fmax: null
    # Same value as speaker_encoder.input_dim (128).
    num_mels: 128

  # Encoder settings. The vocos_* keys share their values with the prenet
  # section (dim 384, intermediate dim 2048, 12 layers).
  encoder:
    # input_channels and out_channels are both 1024, the same width as
    # quantizer.input_dim.
    input_channels: 1024
    vocos_dim: 384
    vocos_intermediate_dim: 2048
    vocos_num_layers: 12
    out_channels: 1024
    # NOTE(review): presumably per-stage resampling factors, where [1, 1]
    # would mean no change in frame rate - confirm against the consumer.
    sample_ratios: [1, 1]

  # Decoder settings.
  decoder:
    # NOTE(review): this key is singular ("input_channel") while the sibling
    # sections use "input_channels". It is kept as-is because the consumer
    # presumably looks it up by this exact name - verify before renaming.
    input_channel: 1024
    channels: 1536
    # The product of the rates is 8 * 5 * 4 * 2 = 320, which equals
    # mel_params.hop_length. Presumably these are per-stage upsampling factors
    # that restore the original sample rate - TODO confirm.
    rates: [8, 5, 4, 2]
    # Four entries, one per entry in rates (presumably paired by position).
    kernel_sizes: [16, 11, 8, 4]

  # Vector-quantizer settings.
  quantizer:
    # Same width as encoder.out_channels (1024).
    input_dim: 1024
    # 8192 = 2^13 codebook entries, each of dimension codebook_dim (8).
    codebook_size: 8192
    codebook_dim: 8
    # NOTE(review): presumably the weights of the commitment and codebook
    # loss terms - confirm against the training code.
    commitment: 0.25
    codebook_loss_weight: 2.0
    # NOTE(review): the key is spelled "normlize" (sic). It is kept unchanged
    # because the consumer presumably reads this exact name - verify before
    # renaming.
    use_l2_normlize: true
    # NOTE(review): presumably the EMA usage threshold below which a code is
    # treated as dead - confirm.
    threshold_ema_dead_code: 0.2
  
  # Speaker-encoder settings.
  speaker_encoder:
    # Same value as mel_params.num_mels (128); presumably the encoder consumes
    # mel frames - TODO confirm.
    input_dim: 128
    # Same value as prenet.condition_dim (1024).
    out_dim: 1024
    latent_dim: 128
    token_num: 32
    # Six levels of 4 each. NOTE(review): if these are standard finite scalar
    # quantization levels, the implied codebook is 4^6 = 4096 entries - confirm.
    fsq_levels: [4, 4, 4, 4, 4, 4]
    fsq_num_quantizers: 1

  # Prenet settings. The vocos_* values are identical to the encoder section
  # (dim 384, intermediate dim 2048, 12 layers).
  prenet:
    input_channels: 1024
    vocos_dim: 384
    vocos_intermediate_dim: 2048
    vocos_num_layers: 12
    out_channels: 1024
    # Same value as speaker_encoder.out_dim (1024); presumably the speaker
    # embedding is fed in as the conditioning signal - TODO confirm.
    condition_dim: 1024
    # NOTE(review): presumably per-stage resampling factors, where [1, 1]
    # would mean no change in frame rate - confirm against the consumer.
    sample_ratios: [1, 1]
    use_tanh_at_final: false

  # Postnet settings. Uses the same vocos widths as the encoder and prenet
  # sections but half the depth (6 layers instead of 12), and has no
  # sample_ratios or condition_dim keys.
  postnet:
    input_channels: 1024
    vocos_dim: 384
    vocos_intermediate_dim: 2048
    vocos_num_layers: 6
    out_channels: 1024
    use_tanh_at_final: false