File size: 2,270 Bytes
cbb6f34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# Generator model definition: a model name plus the config mapping for it.
# NOTE(review): presumably `name` is resolved to a class by the training code
# and `config` is passed to its constructor - confirm against the loader.
generator:
  name: SoundStream
  config:
    n_filters: 32
    # NOTE(review): presumably the latent / embedding width - confirm.
    D: 256
    # NOTE(review): units are not stated in this file (kbps?) - confirm.
    target_bandwidths: [0.5, 1, 1.5, 2, 4]
    # The product of the ratios is 8 * 5 * 4 * 2 = 320.
    # NOTE(review): if these are per-stage strides, 16000 / 320 gives
    # 50 frames per second at the sample_rate below - confirm.
    ratios: [8, 5, 4, 2]
    sample_rate: 16000
    bins: 1024
    # NOTE(review): the key is spelled `semantic_techer` (looks like a typo
    # for "teacher"). Do not rename it here unless the code that reads this
    # key is changed at the same time.
    semantic_techer: hubert_base_general
# The only entry, `mfd`, matches a top-level section of this file; the `mpd`
# and `msd` sections are also defined in this file but are not listed here.
# NOTE(review): presumably this list selects the active discriminators -
# confirm against the training code.
d_list: [mfd]
# Settings for the discriminator section named `mfd` (listed in d_list).
mfd:
  name: MultiFrequencyDiscriminator
  config:
    # hop_lengths and hidden_channels both have six entries.
    # NOTE(review): presumably paired index-by-index - confirm.
    hop_lengths: [32, 64, 128, 256, 512, 1024]
    hidden_channels: [64, 128, 256, 512, 512, 512]
    domain: double
    mel_scale: true
    sample_rate: 16000  # same value as generator.config.sample_rate
# Settings for the discriminator section named `mpd`.
# This section is not listed in d_list in this file.
mpd:
  name: MultiPeriodDiscriminator
  config:
    # The first five primes.
    period_sizes: [2, 3, 5, 7, 11]
    period_kernel_size: 5
# Settings for the discriminator section named `msd`.
# This section is not listed in d_list in this file.
msd:
  name: MultiScaleDiscriminator
  config:
    num_scales: 3
    # NOTE(review): presumably pooling is applied between successive scales -
    # confirm against the discriminator implementation.
    pool_kernel_size: 4
    pool_stride: 2
# Optimizer settings, one entry per key `g` and `d`; both entries are
# identical in this file.
# NOTE(review): `g` / `d` presumably mean generator / discriminator, mirroring
# g_criterion / d_criterion under `criterion` - confirm.
optimizer:
  g:
    name: AdamW
    config:
      lr: 0.0002
      betas: [0.8, 0.99]
      # Keep the decimal point and signed exponent: some YAML 1.1 loaders
      # (e.g. PyYAML) read forms such as `1e-6` as a string, not a float.
      eps: 1.0e-06
  d:
    name: AdamW
    config:
      lr: 0.0002
      betas: [0.8, 0.99]
      # Same spelling rule as the `g` entry above.
      eps: 1.0e-06
# Learning-rate scheduler settings, keyed `g` and `d` like the optimizer
# section; both entries are identical in this file.
# NOTE(review): how often the scheduler is stepped (per epoch or per
# iteration) is not visible in this file - confirm in the training loop.
lr_scheduler:
  g:
    name: ExponentialLR
    config:
      gamma: 0.999
  d:
    name: ExponentialLR
    config:
      gamma: 0.999
# Loss configuration: a g_criterion entry, a d_criterion entry, and two loss
# weights that sit beside them (not inside either config).
criterion:
  g_criterion:
    name: losses.generator_loss.GeneratorSTFTLoss
    config:
      # use_mel_loss is false, yet mel_loss_weight and mel_scale_loss are
      # still set below.
      # NOTE(review): presumably they are ignored while disabled - confirm.
      use_mel_loss: false
      adv_criterion: MSEGLoss
      mel_loss_weight: 45
      use_feature_match: true
      feat_match_loss_weight: 20
      use_full_stft_loss: true
      use_sub_stft_loss: true
      full_stft_loss_weight: 1
      sub_stft_loss_weight: 1
      mel_scale_loss:
        sampling_rate: 16000
        n_fft: 1024
        num_mels: 80
        hop_size: 160
        win_size: 800
        fmin: 0
      # fft_sizes, win_sizes and hop_sizes each have three entries.
      # NOTE(review): presumably one STFT resolution per index - confirm.
      full_multi_scale_stft_loss:
        fft_sizes: [512, 1024, 2048]
        win_sizes: [480, 960, 1200]
        hop_sizes: [120, 240, 300]
      sub_multi_scale_stft_loss:
        num_bands: 6
        # 256 appears twice in fft_sizes, while win_sizes and hop_sizes differ
        # per index.
        # NOTE(review): confirm the repeated 256 is intentional.
        fft_sizes: [128, 256, 256]
        win_sizes: [80, 120, 200]
        hop_sizes: [20, 40, 50]
  d_criterion:
    name: losses.discriminator_loss.MSEDiscriminatorLoss
    # Explicit null: no config mapping is supplied for this criterion.
    config: null
  commit_loss_weight: 1.0
  codebook_loss_weight: 100
# NOTE(review): presumably a scale factor applied when normalising audio
# amplitude - where and how it is applied is not visible in this file; confirm.
audio_norm_scale: 0.95