mrfakename committed on
Commit
780c9a9
·
verified ·
1 Parent(s): 16f3dc7

add config

Browse files
config/config_spect_c16.yaml ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data:
2
+ root_dir: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus' # root path of the training data (a relative or an absolute path is fine)
3
+ wav_dir: 'wav16'
4
+ spect_dir: 'spect'
5
+ f0_norm_dir: 'f0_norm'
6
+ avg_speaker_embs_file: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/fast_resnet34_avg_embs.pkl'
7
+ speaker_embs_gmm_file: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/fast_resnet34_emb_gmms.pkl'
8
+ seen_speakers_train_utts: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/seen_speakers_train_utts.pkl'
9
+ seen_speakers_test_utts: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/seen_speakers_test_utts.pkl'
10
+ f0_metadata_file: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/speaker_f0_metadata.pkl'
11
+ #############################
12
+ train:
13
+ num_workers: 8
14
+ num_gpus: 1
15
+ batch_size: 32
16
+ optimizer: 'adam'
17
+ seed: 1234
18
+ adam:
19
+ lr: 0.0001
20
+ beta1: 0.5
21
+ beta2: 0.9
22
+ stft_lamb: 2.5
23
+ use_wav2vec: False
24
+ use_gmm_emb: True
25
+ warp_lq: True
26
+ use_ssc: False
27
+ #############################
28
+ audio:
29
+ feat_dim: 80
30
+ n_mel_channels: 80
31
+ f0_norm_dim: 257
32
+ spk_emb_dim: 512
33
+ spk_quant_f0_dim: 64
34
+ segment_length: 16384
35
+ pad_short: 2000
36
+ filter_length: 1024
37
+ hop_length: 256
38
+ win_length: 1024
39
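+ wav2vec_hop_length: None # NOTE(review): YAML reads a bare None as the string "None", not null - confirm the loader expects that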
40
+ sampling_rate: 16000
41
+ mel_fmin: 0.0
42
+ mel_fmax: 8000.0
43
+ #############################
44
+ gen:
45
+ noise_dim: 64
46
+ channel_size: 16
47
+ dilations: [1, 3, 9, 27]
48
+ strides: [8, 8, 4]
49
+ lReLU_slope: 0.2
50
+ kpnet_conv_size: 3
51
+ #############################
52
+ ssc:
53
+ se:
54
+ spk_emb_dim: 512
55
+ num_filters: [16, 32, 64, 128]
56
+ layers: [3, 4, 6, 3]
57
+ pretrained_weight_path: "./weights/resnet34sel_pretrained.pt"
58
+ stft_annealing_step: 2000
59
+ pos_ssc_lamb: 0.9
60
+ neg_ssc_lamb: 0.0
61
+ ssc_annealing_step: 2000
62
+ num_ssc_samples: 8
63
+ finetune_epochs: 3
64
+ #############################
65
+ mpd:
66
+ periods: [2, 3, 5, 7, 11]
67
+ kernel_size: 5
68
+ stride: 3
69
+ use_spectral_norm: False
70
+ lReLU_slope: 0.2
71
+ #############################
72
+ mrd:
73
+ resolutions: "[(5, 25), (10, 50), (2, 10)]" # (hop_length_ms, win_length_ms)
74
+ use_spectral_norm: False
75
+ lReLU_slope: 0.2
76
+ #############################
77
+ log:
78
+ summary_interval: 10
79
+ validation_interval: 1
80
+ save_interval: 1
81
+ num_audio: 5
82
+ chkpt_dir: 'chkpt'
83
+ log_dir: 'logs'
84
+ ssc_validation_interval_steps: 400
config/config_spect_c16_ssc.yaml ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data:
2
+ root_dir: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus' # root path of the training data (a relative or an absolute path is fine)
3
+ wav_dir: 'wav16'
4
+ spect_dir: 'spect'
5
+ f0_norm_dir: 'f0_norm'
6
+ avg_speaker_embs_file: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/fast_resnet34_avg_embs.pkl'
7
+ speaker_embs_gmm_file: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/fast_resnet34_emb_gmms.pkl'
8
+ seen_speakers_train_utts: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/seen_speakers_train_utts.pkl'
9
+ seen_speakers_test_utts: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/seen_speakers_test_utts.pkl'
10
+ f0_metadata_file: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/speaker_f0_metadata.pkl'
11
+ #############################
12
+ train:
13
+ num_workers: 8
14
+ num_gpus: 1
15
+ batch_size: 16
16
+ optimizer: 'adam'
17
+ seed: 1234
18
+ adam:
19
+ lr: 0.00005
20
+ beta1: 0.5
21
+ beta2: 0.9
22
+ stft_lamb: 2.5
23
+ use_wav2vec: False
24
+ use_gmm_emb: True
25
+ warp_lq: True
26
+ use_ssc: True
27
+ #############################
28
+ audio:
29
+ feat_dim: 80
30
+ n_mel_channels: 80
31
+ f0_norm_dim: 257
32
+ spk_emb_dim: 512
33
+ spk_quant_f0_dim: 64
34
+ segment_length: 16384
35
+ pad_short: 2000
36
+ filter_length: 1024
37
+ hop_length: 256
38
+ win_length: 1024
39
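+ wav2vec_hop_length: None # NOTE(review): YAML reads a bare None as the string "None", not null - confirm the loader expects that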
40
+ sampling_rate: 16000
41
+ mel_fmin: 0.0
42
+ mel_fmax: 8000.0
43
+ #############################
44
+ gen:
45
+ noise_dim: 64
46
+ channel_size: 16
47
+ dilations: [1, 3, 9, 27]
48
+ strides: [8, 8, 4]
49
+ lReLU_slope: 0.2
50
+ kpnet_conv_size: 3
51
+ #############################
52
+ ssc:
53
+ se:
54
+ spk_emb_dim: 512
55
+ num_filters: [16, 32, 64, 128]
56
+ layers: [3, 4, 6, 3]
57
+ pretrained_weight_path: "./weights/resnet34sel_pretrained.pt"
58
+ stft_annealing_step: 2000
59
+ pos_ssc_lamb: 0.9
60
+ neg_ssc_lamb: 0.0
61
+ ssc_annealing_step: 2000
62
+ num_ssc_samples: 8
63
+ finetune_epochs: 3
64
+ #############################
65
+ mpd:
66
+ periods: [2, 3, 5, 7, 11]
67
+ kernel_size: 5
68
+ stride: 3
69
+ use_spectral_norm: False
70
+ lReLU_slope: 0.2
71
+ #############################
72
+ mrd:
73
+ resolutions: "[(5, 25), (10, 50), (2, 10)]" # (hop_length_ms, win_length_ms)
74
+ use_spectral_norm: False
75
+ lReLU_slope: 0.2
76
+ #############################
77
+ log:
78
+ summary_interval: 10
79
+ validation_interval: 1
80
+ save_interval: 1
81
+ num_audio: 5
82
+ chkpt_dir: 'chkpt'
83
+ log_dir: 'logs'
84
+ ssc_validation_interval_steps: 400
config/config_wav2vec_ecapa_c16.yaml ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data:
2
+ root_dir: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus' # root path of the training data (a relative or an absolute path is fine)
3
+ wav_dir: 'wav16'
4
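+ spect_dir: None # NOTE(review): YAML reads a bare None as the string "None", not null - confirm the loader expects that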
5
+ f0_norm_dir: 'f0_norm_wav2vec'
6
+ avg_speaker_embs_file: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/ecapa_tdnn_avg_embs.pkl'
7
+ speaker_embs_gmm_file: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/ecapa_tdnn_emb_gmms.pkl'
8
+ seen_speakers_train_utts: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/seen_speakers_train_utts.pkl'
9
+ seen_speakers_test_utts: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/seen_speakers_test_utts.pkl'
10
+ f0_metadata_file: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/speaker_f0_metadata.pkl'
11
+ #############################
12
+ train:
13
+ num_workers: 8
14
+ num_gpus: 1
15
+ batch_size: 32
16
+ optimizer: 'adam'
17
+ seed: 1234
18
+ adam:
19
+ lr: 0.0001
20
+ beta1: 0.5
21
+ beta2: 0.9
22
+ stft_lamb: 2.5
23
+ use_wav2vec: True
24
+ use_gmm_emb: True
25
+ warp_lq: False
26
+ use_ssc: False
27
+ #############################
28
+ audio:
29
+ feat_dim: 1024
30
+ n_mel_channels: 80
31
+ f0_norm_dim: 257
32
+ spk_emb_dim: 192
33
+ spk_quant_f0_dim: 64
34
+ segment_length: 16080
35
+ pad_short: 2000
36
+ filter_length: 1024
37
+ hop_length: 256
38
+ win_length: 1024
39
+ wav2vec_hop_length: 320
40
+ sampling_rate: 16000
41
+ mel_fmin: 0.0
42
+ mel_fmax: 8000.0
43
+ #############################
44
+ gen:
45
+ noise_dim: 50
46
+ channel_size: 16
47
+ dilations: [1, 3, 9, 27]
48
+ strides: [8, 8, 5]
49
+ lReLU_slope: 0.2
50
+ kpnet_conv_size: 3
51
+ #############################
52
+ ssc:
53
+ se:
54
+ spk_emb_dim: 512
55
+ num_filters: [16, 32, 64, 128]
56
+ layers: [3, 4, 6, 3]
57
+ pretrained_weight_path: "./weights/resnet34sel_pretrained.pt"
58
+ stft_annealing_step: 2000
59
+ pos_ssc_lamb: 0.9
60
+ neg_ssc_lamb: 0.0
61
+ ssc_annealing_step: 2000
62
+ num_ssc_samples: 8
63
+ finetune_epochs: 3
64
+ #############################
65
+ mpd:
66
+ periods: [2, 3, 5, 7, 11]
67
+ kernel_size: 5
68
+ stride: 3
69
+ use_spectral_norm: False
70
+ lReLU_slope: 0.2
71
+ #############################
72
+ mrd:
73
+ resolutions: "[(5, 25), (10, 50), (2, 10)]" # (hop_length_ms, win_length_ms)
74
+ use_spectral_norm: False
75
+ lReLU_slope: 0.2
76
+ #############################
77
+ log:
78
+ summary_interval: 10
79
+ validation_interval: 1
80
+ save_interval: 1
81
+ num_audio: 5
82
+ chkpt_dir: 'chkpt'
83
+ log_dir: 'logs'
84
+ ssc_validation_interval_steps: 400
config/config_wav2vec_ecapa_c32.yaml ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data:
2
+ root_dir: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus' # root path of the training data (a relative or an absolute path is fine)
3
+ wav_dir: 'wav16'
4
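+ spect_dir: None # NOTE(review): YAML reads a bare None as the string "None", not null - confirm the loader expects that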
5
+ f0_norm_dir: 'f0_norm_wav2vec'
6
+ avg_speaker_embs_file: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/ecapa_tdnn_avg_embs.pkl'
7
+ speaker_embs_gmm_file: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/ecapa_tdnn_emb_gmms.pkl'
8
+ seen_speakers_train_utts: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/seen_speakers_train_utts.pkl'
9
+ seen_speakers_test_utts: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/seen_speakers_test_utts.pkl'
10
+ f0_metadata_file: '/u/wjkang/data/VCTK-Corpus/VCTK-Corpus/metadata/speaker_f0_metadata.pkl'
11
+ #############################
12
+ train:
13
+ num_workers: 8
14
+ num_gpus: 1
15
+ batch_size: 32
16
+ optimizer: 'adam'
17
+ seed: 1234
18
+ adam:
19
+ lr: 0.0001
20
+ beta1: 0.5
21
+ beta2: 0.9
22
+ stft_lamb: 2.5
23
+ use_wav2vec: True
24
+ use_gmm_emb: True
25
+ warp_lq: False
26
+ use_ssc: False
27
+ #############################
28
+ audio:
29
+ feat_dim: 1024
30
+ n_mel_channels: 80
31
+ f0_norm_dim: 257
32
+ spk_emb_dim: 192
33
+ spk_quant_f0_dim: 64
34
+ segment_length: 16080
35
+ pad_short: 2000
36
+ filter_length: 1024
37
+ hop_length: 256
38
+ win_length: 1024
39
+ wav2vec_hop_length: 320
40
+ sampling_rate: 16000
41
+ mel_fmin: 0.0
42
+ mel_fmax: 8000.0
43
+ #############################
44
+ gen:
45
+ noise_dim: 50
46
+ channel_size: 32
47
+ dilations: [1, 3, 9, 27]
48
+ strides: [8, 8, 5]
49
+ lReLU_slope: 0.2
50
+ kpnet_conv_size: 3
51
+ #############################
52
+ ssc:
53
+ se:
54
+ spk_emb_dim: 512
55
+ num_filters: [16, 32, 64, 128]
56
+ layers: [3, 4, 6, 3]
57
+ pretrained_weight_path: "./weights/resnet34sel_pretrained.pt"
58
+ stft_annealing_step: 2000
59
+ pos_ssc_lamb: 0.9
60
+ neg_ssc_lamb: 0.0
61
+ ssc_annealing_step: 2000
62
+ num_ssc_samples: 8
63
+ finetune_epochs: 3
64
+ #############################
65
+ mpd:
66
+ periods: [2, 3, 5, 7, 11]
67
+ kernel_size: 5
68
+ stride: 3
69
+ use_spectral_norm: False
70
+ lReLU_slope: 0.2
71
+ #############################
72
+ mrd:
73
+ resolutions: "[(5, 25), (10, 50), (2, 10)]" # (hop_length_ms, win_length_ms)
74
+ use_spectral_norm: False
75
+ lReLU_slope: 0.2
76
+ #############################
77
+ log:
78
+ summary_interval: 10
79
+ validation_interval: 1
80
+ save_interval: 1
81
+ num_audio: 5
82
+ chkpt_dir: 'chkpt'
83
+ log_dir: 'logs'
84
+ ssc_validation_interval_steps: 400