add config
Browse files
config/config_spect_c16.yaml
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
data:
|
2 |
+
root_dir: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus' # root directory of the training data (either a relative or an absolute path works)
|
3 |
+
wav_dir: 'wav16'
|
4 |
+
spect_dir: 'spect'
|
5 |
+
f0_norm_dir: 'f0_norm'
|
6 |
+
avg_speaker_embs_file: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/fast_resnet34_avg_embs.pkl'
|
7 |
+
speaker_embs_gmm_file: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/fast_resnet34_emb_gmms.pkl'
|
8 |
+
seen_speakers_train_utts: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/seen_speakers_train_utts.pkl'
|
9 |
+
seen_speakers_test_utts: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/seen_speakers_test_utts.pkl'
|
10 |
+
f0_metadata_file: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/speaker_f0_metadata.pkl'
|
11 |
+
#############################
|
12 |
+
train:
|
13 |
+
num_workers: 8
|
14 |
+
num_gpus: 1
|
15 |
+
batch_size: 32
|
16 |
+
optimizer: 'adam'
|
17 |
+
seed: 1234
|
18 |
+
adam:
|
19 |
+
lr: 0.0001
|
20 |
+
beta1: 0.5
|
21 |
+
beta2: 0.9
|
22 |
+
stft_lamb: 2.5
|
23 |
+
use_wav2vec: False
|
24 |
+
use_gmm_emb: True
|
25 |
+
warp_lq: True
|
26 |
+
use_ssc: False
|
27 |
+
#############################
|
28 |
+
audio:
|
29 |
+
feat_dim: 80
|
30 |
+
n_mel_channels: 80
|
31 |
+
f0_norm_dim: 257
|
32 |
+
spk_emb_dim: 512
|
33 |
+
spk_quant_f0_dim: 64
|
34 |
+
segment_length: 16384
|
35 |
+
pad_short: 2000
|
36 |
+
filter_length: 1024
|
37 |
+
hop_length: 256
|
38 |
+
win_length: 1024
|
39 |
+
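wav2vec_hop_length: None # NOTE(review): YAML reads a bare None as the string "None", not null (null is spelled `null` or `~`) - confirm the loader expects this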
|
40 |
+
sampling_rate: 16000
|
41 |
+
mel_fmin: 0.0
|
42 |
+
mel_fmax: 8000.0
|
43 |
+
#############################
|
44 |
+
gen:
|
45 |
+
noise_dim: 64
|
46 |
+
channel_size: 16
|
47 |
+
dilations: [1, 3, 9, 27]
|
48 |
+
strides: [8, 8, 4]
|
49 |
+
lReLU_slope: 0.2
|
50 |
+
kpnet_conv_size: 3
|
51 |
+
#############################
|
52 |
+
ssc:
|
53 |
+
se:
|
54 |
+
spk_emb_dim: 512
|
55 |
+
num_filters: [16, 32, 64, 128]
|
56 |
+
layers: [3, 4, 6, 3]
|
57 |
+
pretrained_weight_path: "./weights/resnet34sel_pretrained.pt"
|
58 |
+
stft_annealing_step: 2000
|
59 |
+
pos_ssc_lamb: 0.9
|
60 |
+
neg_ssc_lamb: 0.0
|
61 |
+
ssc_annealing_step: 2000
|
62 |
+
num_ssc_samples: 8
|
63 |
+
finetune_epochs: 3
|
64 |
+
#############################
|
65 |
+
mpd:
|
66 |
+
periods: [2, 3, 5, 7, 11]
|
67 |
+
kernel_size: 5
|
68 |
+
stride: 3
|
69 |
+
use_spectral_norm: False
|
70 |
+
lReLU_slope: 0.2
|
71 |
+
#############################
|
72 |
+
mrd:
|
73 |
+
resolutions: "[(5, 25), (10, 50), (2, 10)]" # (hop_length_ms, win_length_ms)
|
74 |
+
use_spectral_norm: False
|
75 |
+
lReLU_slope: 0.2
|
76 |
+
#############################
|
77 |
+
log:
|
78 |
+
summary_interval: 10
|
79 |
+
validation_interval: 1
|
80 |
+
save_interval: 1
|
81 |
+
num_audio: 5
|
82 |
+
chkpt_dir: 'chkpt'
|
83 |
+
log_dir: 'logs'
|
84 |
+
ssc_validation_interval_steps: 400
|
config/config_spect_c16_ssc.yaml
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
data:
|
2 |
+
root_dir: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus' # root directory of the training data (either a relative or an absolute path works)
|
3 |
+
wav_dir: 'wav16'
|
4 |
+
spect_dir: 'spect'
|
5 |
+
f0_norm_dir: 'f0_norm'
|
6 |
+
avg_speaker_embs_file: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/fast_resnet34_avg_embs.pkl'
|
7 |
+
speaker_embs_gmm_file: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/fast_resnet34_emb_gmms.pkl'
|
8 |
+
seen_speakers_train_utts: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/seen_speakers_train_utts.pkl'
|
9 |
+
seen_speakers_test_utts: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/seen_speakers_test_utts.pkl'
|
10 |
+
f0_metadata_file: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/speaker_f0_metadata.pkl'
|
11 |
+
#############################
|
12 |
+
train:
|
13 |
+
num_workers: 8
|
14 |
+
num_gpus: 1
|
15 |
+
batch_size: 16
|
16 |
+
optimizer: 'adam'
|
17 |
+
seed: 1234
|
18 |
+
adam:
|
19 |
+
lr: 0.00005
|
20 |
+
beta1: 0.5
|
21 |
+
beta2: 0.9
|
22 |
+
stft_lamb: 2.5
|
23 |
+
use_wav2vec: False
|
24 |
+
use_gmm_emb: True
|
25 |
+
warp_lq: True
|
26 |
+
use_ssc: True
|
27 |
+
#############################
|
28 |
+
audio:
|
29 |
+
feat_dim: 80
|
30 |
+
n_mel_channels: 80
|
31 |
+
f0_norm_dim: 257
|
32 |
+
spk_emb_dim: 512
|
33 |
+
spk_quant_f0_dim: 64
|
34 |
+
segment_length: 16384
|
35 |
+
pad_short: 2000
|
36 |
+
filter_length: 1024
|
37 |
+
hop_length: 256
|
38 |
+
win_length: 1024
|
39 |
+
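wav2vec_hop_length: None # NOTE(review): YAML reads a bare None as the string "None", not null (null is spelled `null` or `~`) - confirm the loader expects this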
|
40 |
+
sampling_rate: 16000
|
41 |
+
mel_fmin: 0.0
|
42 |
+
mel_fmax: 8000.0
|
43 |
+
#############################
|
44 |
+
gen:
|
45 |
+
noise_dim: 64
|
46 |
+
channel_size: 16
|
47 |
+
dilations: [1, 3, 9, 27]
|
48 |
+
strides: [8, 8, 4]
|
49 |
+
lReLU_slope: 0.2
|
50 |
+
kpnet_conv_size: 3
|
51 |
+
#############################
|
52 |
+
ssc:
|
53 |
+
se:
|
54 |
+
spk_emb_dim: 512
|
55 |
+
num_filters: [16, 32, 64, 128]
|
56 |
+
layers: [3, 4, 6, 3]
|
57 |
+
pretrained_weight_path: "./weights/resnet34sel_pretrained.pt"
|
58 |
+
stft_annealing_step: 2000
|
59 |
+
pos_ssc_lamb: 0.9
|
60 |
+
neg_ssc_lamb: 0.0
|
61 |
+
ssc_annealing_step: 2000
|
62 |
+
num_ssc_samples: 8
|
63 |
+
finetune_epochs: 3
|
64 |
+
#############################
|
65 |
+
mpd:
|
66 |
+
periods: [2, 3, 5, 7, 11]
|
67 |
+
kernel_size: 5
|
68 |
+
stride: 3
|
69 |
+
use_spectral_norm: False
|
70 |
+
lReLU_slope: 0.2
|
71 |
+
#############################
|
72 |
+
mrd:
|
73 |
+
resolutions: "[(5, 25), (10, 50), (2, 10)]" # (hop_length_ms, win_length_ms)
|
74 |
+
use_spectral_norm: False
|
75 |
+
lReLU_slope: 0.2
|
76 |
+
#############################
|
77 |
+
log:
|
78 |
+
summary_interval: 10
|
79 |
+
validation_interval: 1
|
80 |
+
save_interval: 1
|
81 |
+
num_audio: 5
|
82 |
+
chkpt_dir: 'chkpt'
|
83 |
+
log_dir: 'logs'
|
84 |
+
ssc_validation_interval_steps: 400
|
config/config_wav2vec_ecapa_c16.yaml
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
data:
|
2 |
+
root_dir: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus' # root directory of the training data (either a relative or an absolute path works)
|
3 |
+
wav_dir: 'wav16'
|
4 |
+
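spect_dir: None # NOTE(review): YAML reads a bare None as the string "None", not null (null is spelled `null` or `~`) - confirm the loader expects this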
|
5 |
+
f0_norm_dir: 'f0_norm_wav2vec'
|
6 |
+
avg_speaker_embs_file: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/ecapa_tdnn_avg_embs.pkl'
|
7 |
+
speaker_embs_gmm_file: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/ecapa_tdnn_emb_gmms.pkl'
|
8 |
+
seen_speakers_train_utts: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/seen_speakers_train_utts.pkl'
|
9 |
+
seen_speakers_test_utts: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/seen_speakers_test_utts.pkl'
|
10 |
+
f0_metadata_file: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/speaker_f0_metadata.pkl'
|
11 |
+
#############################
|
12 |
+
train:
|
13 |
+
num_workers: 8
|
14 |
+
num_gpus: 1
|
15 |
+
batch_size: 32
|
16 |
+
optimizer: 'adam'
|
17 |
+
seed: 1234
|
18 |
+
adam:
|
19 |
+
lr: 0.0001
|
20 |
+
beta1: 0.5
|
21 |
+
beta2: 0.9
|
22 |
+
stft_lamb: 2.5
|
23 |
+
use_wav2vec: True
|
24 |
+
use_gmm_emb: True
|
25 |
+
warp_lq: False
|
26 |
+
use_ssc: False
|
27 |
+
#############################
|
28 |
+
audio:
|
29 |
+
feat_dim: 1024
|
30 |
+
n_mel_channels: 80
|
31 |
+
f0_norm_dim: 257
|
32 |
+
spk_emb_dim: 192
|
33 |
+
spk_quant_f0_dim: 64
|
34 |
+
segment_length: 16080
|
35 |
+
pad_short: 2000
|
36 |
+
filter_length: 1024
|
37 |
+
hop_length: 256
|
38 |
+
win_length: 1024
|
39 |
+
wav2vec_hop_length: 320
|
40 |
+
sampling_rate: 16000
|
41 |
+
mel_fmin: 0.0
|
42 |
+
mel_fmax: 8000.0
|
43 |
+
#############################
|
44 |
+
gen:
|
45 |
+
noise_dim: 50
|
46 |
+
channel_size: 16
|
47 |
+
dilations: [1, 3, 9, 27]
|
48 |
+
strides: [8, 8, 5]
|
49 |
+
lReLU_slope: 0.2
|
50 |
+
kpnet_conv_size: 3
|
51 |
+
#############################
|
52 |
+
ssc:
|
53 |
+
se:
|
54 |
+
spk_emb_dim: 512
|
55 |
+
num_filters: [16, 32, 64, 128]
|
56 |
+
layers: [3, 4, 6, 3]
|
57 |
+
pretrained_weight_path: "./weights/resnet34sel_pretrained.pt"
|
58 |
+
stft_annealing_step: 2000
|
59 |
+
pos_ssc_lamb: 0.9
|
60 |
+
neg_ssc_lamb: 0.0
|
61 |
+
ssc_annealing_step: 2000
|
62 |
+
num_ssc_samples: 8
|
63 |
+
finetune_epochs: 3
|
64 |
+
#############################
|
65 |
+
mpd:
|
66 |
+
periods: [2, 3, 5, 7, 11]
|
67 |
+
kernel_size: 5
|
68 |
+
stride: 3
|
69 |
+
use_spectral_norm: False
|
70 |
+
lReLU_slope: 0.2
|
71 |
+
#############################
|
72 |
+
mrd:
|
73 |
+
resolutions: "[(5, 25), (10, 50), (2, 10)]" # (hop_length_ms, win_length_ms)
|
74 |
+
use_spectral_norm: False
|
75 |
+
lReLU_slope: 0.2
|
76 |
+
#############################
|
77 |
+
log:
|
78 |
+
summary_interval: 10
|
79 |
+
validation_interval: 1
|
80 |
+
save_interval: 1
|
81 |
+
num_audio: 5
|
82 |
+
chkpt_dir: 'chkpt'
|
83 |
+
log_dir: 'logs'
|
84 |
+
ssc_validation_interval_steps: 400
|
config/config_wav2vec_ecapa_c32.yaml
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
data:
|
2 |
+
root_dir: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus' # root directory of the training data (either a relative or an absolute path works)
|
3 |
+
wav_dir: 'wav16'
|
4 |
+
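spect_dir: None # NOTE(review): YAML reads a bare None as the string "None", not null (null is spelled `null` or `~`) - confirm the loader expects this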
|
5 |
+
f0_norm_dir: 'f0_norm_wav2vec'
|
6 |
+
avg_speaker_embs_file: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/ecapa_tdnn_avg_embs.pkl'
|
7 |
+
speaker_embs_gmm_file: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/ecapa_tdnn_emb_gmms.pkl'
|
8 |
+
seen_speakers_train_utts: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/seen_speakers_train_utts.pkl'
|
9 |
+
seen_speakers_test_utts: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/seen_speakers_test_utts.pkl'
|
10 |
+
f0_metadata_file: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/speaker_f0_metadata.pkl'
|
11 |
+
#############################
|
12 |
+
train:
|
13 |
+
num_workers: 8
|
14 |
+
num_gpus: 1
|
15 |
+
batch_size: 32
|
16 |
+
optimizer: 'adam'
|
17 |
+
seed: 1234
|
18 |
+
adam:
|
19 |
+
lr: 0.0001
|
20 |
+
beta1: 0.5
|
21 |
+
beta2: 0.9
|
22 |
+
stft_lamb: 2.5
|
23 |
+
use_wav2vec: True
|
24 |
+
use_gmm_emb: True
|
25 |
+
warp_lq: False
|
26 |
+
use_ssc: False
|
27 |
+
#############################
|
28 |
+
audio:
|
29 |
+
feat_dim: 1024
|
30 |
+
n_mel_channels: 80
|
31 |
+
f0_norm_dim: 257
|
32 |
+
spk_emb_dim: 192
|
33 |
+
spk_quant_f0_dim: 64
|
34 |
+
segment_length: 16080
|
35 |
+
pad_short: 2000
|
36 |
+
filter_length: 1024
|
37 |
+
hop_length: 256
|
38 |
+
win_length: 1024
|
39 |
+
wav2vec_hop_length: 320
|
40 |
+
sampling_rate: 16000
|
41 |
+
mel_fmin: 0.0
|
42 |
+
mel_fmax: 8000.0
|
43 |
+
#############################
|
44 |
+
gen:
|
45 |
+
noise_dim: 50
|
46 |
+
channel_size: 32
|
47 |
+
dilations: [1, 3, 9, 27]
|
48 |
+
strides: [8, 8, 5]
|
49 |
+
lReLU_slope: 0.2
|
50 |
+
kpnet_conv_size: 3
|
51 |
+
#############################
|
52 |
+
ssc:
|
53 |
+
se:
|
54 |
+
spk_emb_dim: 512
|
55 |
+
num_filters: [16, 32, 64, 128]
|
56 |
+
layers: [3, 4, 6, 3]
|
57 |
+
pretrained_weight_path: "./weights/resnet34sel_pretrained.pt"
|
58 |
+
stft_annealing_step: 2000
|
59 |
+
pos_ssc_lamb: 0.9
|
60 |
+
neg_ssc_lamb: 0.0
|
61 |
+
ssc_annealing_step: 2000
|
62 |
+
num_ssc_samples: 8
|
63 |
+
finetune_epochs: 3
|
64 |
+
#############################
|
65 |
+
mpd:
|
66 |
+
periods: [2, 3, 5, 7, 11]
|
67 |
+
kernel_size: 5
|
68 |
+
stride: 3
|
69 |
+
use_spectral_norm: False
|
70 |
+
lReLU_slope: 0.2
|
71 |
+
#############################
|
72 |
+
mrd:
|
73 |
+
resolutions: "[(5, 25), (10, 50), (2, 10)]" # (hop_length_ms, win_length_ms)
|
74 |
+
use_spectral_norm: False
|
75 |
+
lReLU_slope: 0.2
|
76 |
+
#############################
|
77 |
+
log:
|
78 |
+
summary_interval: 10
|
79 |
+
validation_interval: 1
|
80 |
+
save_interval: 1
|
81 |
+
num_audio: 5
|
82 |
+
chkpt_dir: 'chkpt'
|
83 |
+
log_dir: 'logs'
|
84 |
+
ssc_validation_interval_steps: 400
|