yuancwang committed on
Commit
5548515
·
1 Parent(s): 4387736
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +62 -0
  2. config/audioldm.json +92 -0
  3. config/autoencoderkl.json +69 -0
  4. config/base.json +220 -0
  5. config/comosvc.json +216 -0
  6. config/diffusion.json +227 -0
  7. config/fs2.json +118 -0
  8. config/ns2.json +88 -0
  9. config/transformer.json +180 -0
  10. config/tts.json +23 -0
  11. config/valle.json +53 -0
  12. config/vits.json +101 -0
  13. config/vitssvc.json +192 -0
  14. config/vocoder.json +84 -0
  15. egs/datasets/README.md +381 -0
  16. egs/metrics/README.md +94 -0
  17. egs/metrics/run.sh +42 -0
  18. egs/svc/DiffComoSVC/README.md +234 -0
  19. egs/svc/DiffComoSVC/exp_config.json +143 -0
  20. egs/svc/DiffComoSVC/run.sh +1 -0
  21. egs/svc/MultipleContentsSVC/README.md +153 -0
  22. egs/svc/MultipleContentsSVC/exp_config.json +126 -0
  23. egs/svc/MultipleContentsSVC/run.sh +1 -0
  24. egs/svc/README.md +34 -0
  25. egs/svc/TransformerSVC/README.md +164 -0
  26. egs/svc/TransformerSVC/exp_config.json +108 -0
  27. egs/svc/TransformerSVC/run.sh +1 -0
  28. egs/svc/VitsSVC/README.md +125 -0
  29. egs/svc/VitsSVC/exp_config.json +162 -0
  30. egs/svc/VitsSVC/run.sh +1 -0
  31. egs/svc/_template/run.sh +150 -0
  32. egs/tta/README.md +19 -0
  33. egs/tta/RECIPE.md +156 -0
  34. egs/tta/audioldm/exp_config.json +90 -0
  35. egs/tta/audioldm/exp_config_base.json +11 -0
  36. egs/tta/audioldm/exp_config_latent_4_10_78.json +88 -0
  37. egs/tta/audioldm/run_inference.sh +52 -0
  38. egs/tta/audioldm/run_inference_latent_4_10_78.sh +52 -0
  39. egs/tta/audioldm/run_train.sh +26 -0
  40. egs/tta/audioldm/run_train_latent_4_10_78.sh +26 -0
  41. egs/tta/autoencoderkl/exp_config.json +49 -0
  42. egs/tta/autoencoderkl/exp_config_base.json +11 -0
  43. egs/tta/autoencoderkl/exp_config_latent_4_10_78.json +59 -0
  44. egs/tta/autoencoderkl/run_train.sh +26 -0
  45. egs/tta/autoencoderkl/run_train_latent_4_10_78.sh +26 -0
  46. egs/tts/FastSpeech2/README.md +132 -0
  47. egs/tts/FastSpeech2/exp_config.json +21 -0
  48. egs/tts/FastSpeech2/prepare_mfa.sh +14 -0
  49. egs/tts/FastSpeech2/run.sh +150 -0
  50. egs/tts/NaturalSpeech2/exp_config.json +39 -0
.gitignore ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Mac OS files
2
+ .DS_Store
3
+
4
+ # IDEs
5
+ .idea
6
+ .vs
7
+ .vscode
8
+ .cache
9
+
10
+ # GitHub files
11
+ .github
12
+
13
+ # Byte-compiled / optimized / DLL / cached files
14
+ __pycache__/
15
+ *.py[cod]
16
+ *$py.class
17
+ *.pyc
18
+ .temp
19
+ *.c
20
+ *.so
21
+ *.o
22
+
23
+ # Developing mode
24
+ _*.sh
25
+ _*.json
26
+ *.lst
27
+ yard*
28
+ *.out
29
+ evaluation/evalset_selection
30
+ mfa
31
+ egs/svc/*wavmark
32
+ egs/svc/custom
33
+ egs/svc/*/dev*
34
+ egs/svc/dev_exp_config.json
35
+ bins/svc/demo*
36
+ bins/svc/preprocess_custom.py
37
+ data
38
+ ckpts
39
+
40
+ # Data and ckpt
41
+ *.pkl
42
+ *.pt
43
+ *.npy
44
+ *.npz
45
+ !modules/whisper_extractor/assets/mel_filters.npz
46
+ *.tar.gz
47
+ *.ckpt
48
+ *.wav
49
+ *.flac
50
+ pretrained/wenet/*conformer_exp
51
+
52
+ # Runtime data dirs
53
+ processed_data
54
+ data
55
+ model_ckpt
56
+ logs
57
+ *.ipynb
58
+ *.lst
59
+ source_audio
60
+ result
61
+ conversion_results
62
+ get_available_gpu.py
config/audioldm.json ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "config/base.json",
3
+ "model_type": "AudioLDM",
4
+ "task_type": "tta",
5
+ "dataset": [
6
+ "AudioCaps"
7
+ ],
8
+ "preprocess": {
9
+ // feature used for model training
10
+ "use_spkid": false,
11
+ "use_uv": false,
12
+ "use_frame_pitch": false,
13
+ "use_phone_pitch": false,
14
+ "use_frame_energy": false,
15
+ "use_phone_energy": false,
16
+ "use_mel": false,
17
+ "use_audio": false,
18
+ "use_label": false,
19
+ "use_one_hot": false,
20
+ "cond_mask_prob": 0.1
21
+ },
22
+ // model
23
+ "model": {
24
+ "audioldm": {
25
+ "image_size": 32,
26
+ "in_channels": 4,
27
+ "out_channels": 4,
28
+ "model_channels": 256,
29
+ "attention_resolutions": [
30
+ 4,
31
+ 2,
32
+ 1
33
+ ],
34
+ "num_res_blocks": 2,
35
+ "channel_mult": [
36
+ 1,
37
+ 2,
38
+ 4
39
+ ],
40
+ "num_heads": 8,
41
+ "use_spatial_transformer": true,
42
+ "transformer_depth": 1,
43
+ "context_dim": 768,
44
+ "use_checkpoint": true,
45
+ "legacy": false
46
+ },
47
+ "autoencoderkl": {
48
+ "ch": 128,
49
+ "ch_mult": [
50
+ 1,
51
+ 1,
52
+ 2,
53
+ 2,
54
+ 4
55
+ ],
56
+ "num_res_blocks": 2,
57
+ "in_channels": 1,
58
+ "z_channels": 4,
59
+ "out_ch": 1,
60
+ "double_z": true
61
+ },
62
+ "noise_scheduler": {
63
+ "num_train_timesteps": 1000,
64
+ "beta_start": 0.00085,
65
+ "beta_end": 0.012,
66
+ "beta_schedule": "scaled_linear",
67
+ "clip_sample": false,
68
+ "steps_offset": 1,
69
+ "set_alpha_to_one": false,
70
+ "skip_prk_steps": true,
71
+ "prediction_type": "epsilon"
72
+ }
73
+ },
74
+ // train
75
+ "train": {
76
+ "lronPlateau": {
77
+ "factor": 0.9,
78
+ "patience": 100,
79
+ "min_lr": 4.0e-5,
80
+ "verbose": true
81
+ },
82
+ "adam": {
83
+ "lr": 5.0e-5,
84
+ "betas": [
85
+ 0.9,
86
+ 0.999
87
+ ],
88
+ "weight_decay": 1.0e-2,
89
+ "eps": 1.0e-8
90
+ }
91
+ }
92
+ }
config/autoencoderkl.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "config/base.json",
3
+ "model_type": "AutoencoderKL",
4
+ "task_type": "tta",
5
+ "dataset": [
6
+ "AudioCaps"
7
+ ],
8
+ "preprocess": {
9
+ // feature used for model training
10
+ "use_spkid": false,
11
+ "use_uv": false,
12
+ "use_frame_pitch": false,
13
+ "use_phone_pitch": false,
14
+ "use_frame_energy": false,
15
+ "use_phone_energy": false,
16
+ "use_mel": false,
17
+ "use_audio": false,
18
+ "use_label": false,
19
+ "use_one_hot": false
20
+ },
21
+ // model
22
+ "model": {
23
+ "autoencoderkl": {
24
+ "ch": 128,
25
+ "ch_mult": [
26
+ 1,
27
+ 1,
28
+ 2,
29
+ 2,
30
+ 4
31
+ ],
32
+ "num_res_blocks": 2,
33
+ "in_channels": 1,
34
+ "z_channels": 4,
35
+ "out_ch": 1,
36
+ "double_z": true
37
+ },
38
+ "loss": {
39
+ "kl_weight": 1e-8,
40
+ "disc_weight": 0.5,
41
+ "disc_factor": 1.0,
42
+ "logvar_init": 0.0,
43
+ "min_adapt_d_weight": 0.0,
44
+ "max_adapt_d_weight": 10.0,
45
+ "disc_start": 50001,
46
+ "disc_in_channels": 1,
47
+ "disc_num_layers": 3,
48
+ "use_actnorm": false
49
+ }
50
+ },
51
+ // train
52
+ "train": {
53
+ "lronPlateau": {
54
+ "factor": 0.9,
55
+ "patience": 100,
56
+ "min_lr": 4.0e-5,
57
+ "verbose": true
58
+ },
59
+ "adam": {
60
+ "lr": 4.0e-4,
61
+ "betas": [
62
+ 0.9,
63
+ 0.999
64
+ ],
65
+ "weight_decay": 1.0e-2,
66
+ "eps": 1.0e-8
67
+ }
68
+ }
69
+ }
config/base.json ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "supported_model_type": [
3
+ "GANVocoder",
4
+ "Fastspeech2",
5
+ "DiffSVC",
6
+ "Transformer",
7
+ "EDM",
8
+ "CD"
9
+ ],
10
+ "task_type": "",
11
+ "dataset": [],
12
+ "use_custom_dataset": false,
13
+ "preprocess": {
14
+ "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon"
15
+ // trim audio silence
16
+ "data_augment": false,
17
+ "trim_silence": false,
18
+ "num_silent_frames": 8,
19
+ "trim_fft_size": 512, // fft size used in trimming
20
+ "trim_hop_size": 128, // hop size used in trimming
21
+ "trim_top_db": 30, // top dB used in trimming; sensitive to each dataset
22
+ // acoustic features
23
+ "extract_mel": false,
24
+ "mel_extract_mode": "",
25
+ "extract_linear_spec": false,
26
+ "extract_mcep": false,
27
+ "extract_pitch": false,
28
+ "extract_acoustic_token": false,
29
+ "pitch_remove_outlier": false,
30
+ "extract_uv": false,
31
+ "pitch_norm": false,
32
+ "extract_audio": false,
33
+ "extract_label": false,
34
+ "pitch_extractor": "parselmouth", // pyin, dio, pyworld, pyreaper, parselmouth, CWT (Continuous Wavelet Transform)
35
+ "extract_energy": false,
36
+ "energy_remove_outlier": false,
37
+ "energy_norm": false,
38
+ "energy_extract_mode": "from_mel",
39
+ "extract_duration": false,
40
+ "extract_amplitude_phase": false,
41
+ "mel_min_max_norm": false,
42
+ // linguistic features
43
+ "extract_phone": false,
44
+ "lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
45
+ // content features
46
+ "extract_whisper_feature": false,
47
+ "extract_contentvec_feature": false,
48
+ "extract_mert_feature": false,
49
+ "extract_wenet_feature": false,
50
+ // Settings for data preprocessing
51
+ "n_mel": 80,
52
+ "win_size": 480,
53
+ "hop_size": 120,
54
+ "sample_rate": 24000,
55
+ "n_fft": 1024,
56
+ "fmin": 0,
57
+ "fmax": 12000,
58
+ "min_level_db": -115,
59
+ "ref_level_db": 20,
60
+ "bits": 8,
61
+ // Directory names of processed data or extracted features
62
+ "processed_dir": "processed_data",
63
+ "trimmed_wav_dir": "trimmed_wavs", // directory name of silence-trimmed wav
64
+ "raw_data": "raw_data",
65
+ "phone_dir": "phones",
66
+ "wav_dir": "wavs", // directory name of processed wav (such as downsampled waveform)
67
+ "audio_dir": "audios",
68
+ "log_amplitude_dir": "log_amplitudes",
69
+ "phase_dir": "phases",
70
+ "real_dir": "reals",
71
+ "imaginary_dir": "imaginarys",
72
+ "label_dir": "labels",
73
+ "linear_dir": "linears",
74
+ "mel_dir": "mels", // directory name of extracted mel features
75
+ "mcep_dir": "mcep", // directory name of extracted mcep features
76
+ "dur_dir": "durs",
77
+ "symbols_dict": "symbols.dict",
78
+ "lab_dir": "labs", // directory name of extracted label features
79
+ "wenet_dir": "wenet", // directory name of extracted wenet features
80
+ "contentvec_dir": "contentvec", // directory name of extracted contentvec features
81
+ "pitch_dir": "pitches", // directory name of extracted pitch features
82
+ "energy_dir": "energys", // directory name of extracted energy features
83
+ "phone_pitch_dir": "phone_pitches", // directory name of extracted pitch features
84
+ "phone_energy_dir": "phone_energys", // directory name of extracted energy features
85
+ "uv_dir": "uvs", // directory name of extracted unvoiced features
86
+ "duration_dir": "duration", // ground-truth duration file
87
+ "phone_seq_file": "phone_seq_file", // phoneme sequence file
88
+ "file_lst": "file.lst",
89
+ "train_file": "train.json", // training set, the json file contains detailed information about the dataset, including dataset name, utterance id, duration of the utterance
90
+ "valid_file": "valid.json", // validation set
91
+ "spk2id": "spk2id.json", // used for multi-speaker dataset
92
+ "utt2spk": "utt2spk", // used for multi-speaker dataset
93
+ "emo2id": "emo2id.json", // used for multi-emotion dataset
94
+ "utt2emo": "utt2emo", // used for multi-emotion dataset
95
+ // Features used for model training
96
+ "use_text": false,
97
+ "use_phone": false,
98
+ "use_phn_seq": false,
99
+ "use_lab": false,
100
+ "use_linear": false,
101
+ "use_mel": false,
102
+ "use_min_max_norm_mel": false,
103
+ "use_wav": false,
104
+ "use_phone_pitch": false,
105
+ "use_log_scale_pitch": false,
106
+ "use_phone_energy": false,
107
+ "use_phone_duration": false,
108
+ "use_log_scale_energy": false,
109
+ "use_wenet": false,
110
+ "use_dur": false,
111
+ "use_spkid": false, // True: use speaker id for multi-speaker dataset
112
+ "use_emoid": false, // True: use emotion id for multi-emotion dataset
113
+ "use_frame_pitch": false,
114
+ "use_uv": false,
115
+ "use_frame_energy": false,
116
+ "use_frame_duration": false,
117
+ "use_audio": false,
118
+ "use_label": false,
119
+ "use_one_hot": false,
120
+ "use_amplitude_phase": false,
121
+ "data_augment": false,
122
+ "align_mel_duration": false
123
+ },
124
+ "train": {
125
+ "ddp": true,
126
+ "random_seed": 970227,
127
+ "batch_size": 16,
128
+ "max_steps": 1000000,
129
+ // Trackers
130
+ "tracker": [
131
+ "tensorboard"
132
+ // "wandb",
133
+ // "cometml",
134
+ // "mlflow",
135
+ ],
136
+ "max_epoch": -1,
137
+ // -1 means no limit
138
+ "save_checkpoint_stride": [
139
+ 5,
140
+ 20
141
+ ],
142
+ // unit is epoch
143
+ "keep_last": [
144
+ 3,
145
+ -1
146
+ ],
147
+ // -1 means infinite, if one number will broadcast
148
+ "run_eval": [
149
+ false,
150
+ true
151
+ ],
152
+ // if one number will broadcast
153
+ // Fix the random seed
154
+ "random_seed": 10086,
155
+ // Optimizer
156
+ "optimizer": "AdamW",
157
+ "adamw": {
158
+ "lr": 4.0e-4
159
+ // nn model lr
160
+ },
161
+ // LR Scheduler
162
+ "scheduler": "ReduceLROnPlateau",
163
+ "reducelronplateau": {
164
+ "factor": 0.8,
165
+ "patience": 10,
166
+ // unit is epoch
167
+ "min_lr": 1.0e-4
168
+ },
169
+ // Batchsampler
170
+ "sampler": {
171
+ "holistic_shuffle": true,
172
+ "drop_last": true
173
+ },
174
+ // Dataloader
175
+ "dataloader": {
176
+ "num_worker": 32,
177
+ "pin_memory": true
178
+ },
179
+ "gradient_accumulation_step": 1,
180
+ "total_training_steps": 50000,
181
+ "save_summary_steps": 500,
182
+ "save_checkpoints_steps": 10000,
183
+ "valid_interval": 10000,
184
+ "keep_checkpoint_max": 5,
185
+ "multi_speaker_training": false, // True: train multi-speaker model; False: train single-speaker model
186
+ "max_epoch": -1,
187
+ // -1 means no limit
188
+ "save_checkpoint_stride": [
189
+ 5,
190
+ 20
191
+ ],
192
+ // unit is epoch
193
+ "keep_last": [
194
+ 3,
195
+ -1
196
+ ],
197
+ // -1 means infinite, if one number will broadcast
198
+ "run_eval": [
199
+ false,
200
+ true
201
+ ],
202
+ // Batchsampler
203
+ "sampler": {
204
+ "holistic_shuffle": true,
205
+ "drop_last": true
206
+ },
207
+ // Dataloader
208
+ "dataloader": {
209
+ "num_worker": 32,
210
+ "pin_memory": true
211
+ },
212
+ // Trackers
213
+ "tracker": [
214
+ "tensorboard"
215
+ // "wandb",
216
+ // "cometml",
217
+ // "mlflow",
218
+ ],
219
+ },
220
+ }
config/comosvc.json ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "config/base.json",
3
+ "model_type": "DiffComoSVC",
4
+ "task_type": "svc",
5
+ "use_custom_dataset": false,
6
+ "preprocess": {
7
+ // data augmentations
8
+ "use_pitch_shift": false,
9
+ "use_formant_shift": false,
10
+ "use_time_stretch": false,
11
+ "use_equalizer": false,
12
+ // acoustic features
13
+ "extract_mel": true,
14
+ "mel_min_max_norm": true,
15
+ "extract_pitch": true,
16
+ "pitch_extractor": "parselmouth",
17
+ "extract_uv": true,
18
+ "extract_energy": true,
19
+ // content features
20
+ "extract_whisper_feature": false,
21
+ "whisper_sample_rate": 16000,
22
+ "extract_contentvec_feature": false,
23
+ "contentvec_sample_rate": 16000,
24
+ "extract_wenet_feature": false,
25
+ "wenet_sample_rate": 16000,
26
+ "extract_mert_feature": false,
27
+ "mert_sample_rate": 16000,
28
+ // Default config for whisper
29
+ "whisper_frameshift": 0.01,
30
+ "whisper_downsample_rate": 2,
31
+ // Default config for content vector
32
+ "contentvec_frameshift": 0.02,
33
+ // Default config for mert
34
+ "mert_model": "m-a-p/MERT-v1-330M",
35
+ "mert_feature_layer": -1,
36
+ "mert_hop_size": 320,
37
+ // 24k
38
+ "mert_frameshit": 0.01333,
39
+ // 10ms
40
+ "wenet_frameshift": 0.01,
41
+ // wenetspeech is 4, gigaspeech is 6
42
+ "wenet_downsample_rate": 4,
43
+ // Default config
44
+ "n_mel": 100,
45
+ "win_size": 1024,
46
+ // todo
47
+ "hop_size": 256,
48
+ "sample_rate": 24000,
49
+ "n_fft": 1024,
50
+ // todo
51
+ "fmin": 0,
52
+ "fmax": 12000,
53
+ // todo
54
+ "f0_min": 50,
55
+ // ~C2
56
+ "f0_max": 1100,
57
+ //1100, // ~C6(1100), ~G5(800)
58
+ "pitch_bin": 256,
59
+ "pitch_max": 1100.0,
60
+ "pitch_min": 50.0,
61
+ "is_label": true,
62
+ "is_mu_law": true,
63
+ "bits": 8,
64
+ "mel_min_max_stats_dir": "mel_min_max_stats",
65
+ "whisper_dir": "whisper",
66
+ "contentvec_dir": "contentvec",
67
+ "wenet_dir": "wenet",
68
+ "mert_dir": "mert",
69
+ // Extract content features using dataloader
70
+ "pin_memory": true,
71
+ "num_workers": 8,
72
+ "content_feature_batch_size": 16,
73
+ // Features used for model training
74
+ "use_mel": true,
75
+ "use_min_max_norm_mel": true,
76
+ "use_frame_pitch": true,
77
+ "use_uv": true,
78
+ "use_frame_energy": true,
79
+ "use_log_scale_pitch": false,
80
+ "use_log_scale_energy": false,
81
+ "use_spkid": true,
82
+ // Meta file
83
+ "train_file": "train.json",
84
+ "valid_file": "test.json",
85
+ "spk2id": "singers.json",
86
+ "utt2spk": "utt2singer"
87
+ },
88
+ "model": {
89
+ "teacher_model_path": "[Your Teacher Model Path].bin",
90
+ "condition_encoder": {
91
+ "merge_mode": "add",
92
+ "input_melody_dim": 1,
93
+ "use_log_f0": true,
94
+ "n_bins_melody": 256,
95
+ //# Quantization (0 for not quantization)
96
+ "output_melody_dim": 384,
97
+ "input_loudness_dim": 1,
98
+ "use_log_loudness": true,
99
+ "n_bins_loudness": 256,
100
+ "output_loudness_dim": 384,
101
+ "use_whisper": false,
102
+ "use_contentvec": false,
103
+ "use_wenet": false,
104
+ "use_mert": false,
105
+ "whisper_dim": 1024,
106
+ "contentvec_dim": 256,
107
+ "mert_dim": 256,
108
+ "wenet_dim": 512,
109
+ "content_encoder_dim": 384,
110
+ "output_singer_dim": 384,
111
+ "singer_table_size": 512,
112
+ "output_content_dim": 384,
113
+ "use_spkid": true
114
+ },
115
+ "comosvc": {
116
+ "distill": false,
117
+ // conformer encoder
118
+ "input_dim": 384,
119
+ "output_dim": 100,
120
+ "n_heads": 2,
121
+ "n_layers": 6,
122
+ "filter_channels": 512,
123
+ "dropout": 0.1,
124
+ // karras diffusion
125
+ "P_mean": -1.2,
126
+ "P_std": 1.2,
127
+ "sigma_data": 0.5,
128
+ "sigma_min": 0.002,
129
+ "sigma_max": 80,
130
+ "rho": 7,
131
+ "n_timesteps": 40,
132
+ },
133
+ "diffusion": {
134
+ // Diffusion steps encoder
135
+ "step_encoder": {
136
+ "dim_raw_embedding": 128,
137
+ "dim_hidden_layer": 512,
138
+ "activation": "SiLU",
139
+ "num_layer": 2,
140
+ "max_period": 10000
141
+ },
142
+ // Diffusion decoder
143
+ "model_type": "bidilconv",
144
+ // bidilconv, unet2d, TODO: unet1d
145
+ "bidilconv": {
146
+ "base_channel": 384,
147
+ "n_res_block": 20,
148
+ "conv_kernel_size": 3,
149
+ "dilation_cycle_length": 4,
150
+ // specially, 1 means no dilation
151
+ "conditioner_size": 100
152
+ }
153
+ },
154
+ },
155
+ "train": {
156
+ // Basic settings
157
+ "fast_steps": 0,
158
+ "batch_size": 32,
159
+ "gradient_accumulation_step": 1,
160
+ "max_epoch": -1,
161
+ // -1 means no limit
162
+ "save_checkpoint_stride": [
163
+ 10,
164
+ 100
165
+ ],
166
+ // unit is epoch
167
+ "keep_last": [
168
+ 3,
169
+ -1
170
+ ],
171
+ // -1 means infinite, if one number will broadcast
172
+ "run_eval": [
173
+ false,
174
+ true
175
+ ],
176
+ // if one number will broadcast
177
+ // Fix the random seed
178
+ "random_seed": 10086,
179
+ // Batchsampler
180
+ "sampler": {
181
+ "holistic_shuffle": true,
182
+ "drop_last": true
183
+ },
184
+ // Dataloader
185
+ "dataloader": {
186
+ "num_worker": 32,
187
+ "pin_memory": true
188
+ },
189
+ // Trackers
190
+ "tracker": [
191
+ "tensorboard"
192
+ // "wandb",
193
+ // "cometml",
194
+ // "mlflow",
195
+ ],
196
+ // Optimizer
197
+ "optimizer": "AdamW",
198
+ "adamw": {
199
+ "lr": 4.0e-4
200
+ // nn model lr
201
+ },
202
+ // LR Scheduler
203
+ "scheduler": "ReduceLROnPlateau",
204
+ "reducelronplateau": {
205
+ "factor": 0.8,
206
+ "patience": 10,
207
+ // unit is epoch
208
+ "min_lr": 1.0e-4
209
+ }
210
+ },
211
+ "inference": {
212
+ "comosvc": {
213
+ "inference_steps": 40
214
+ }
215
+ }
216
+ }
config/diffusion.json ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ // FIXME: THESE ARE LEGACY
3
+ "base_config": "config/base.json",
4
+ "model_type": "diffusion",
5
+ "task_type": "svc",
6
+ "use_custom_dataset": false,
7
+ "preprocess": {
8
+ // data augmentations
9
+ "use_pitch_shift": false,
10
+ "use_formant_shift": false,
11
+ "use_time_stretch": false,
12
+ "use_equalizer": false,
13
+ // acoustic features
14
+ "extract_mel": true,
15
+ "mel_min_max_norm": true,
16
+ "extract_pitch": true,
17
+ "pitch_extractor": "parselmouth",
18
+ "extract_uv": true,
19
+ "extract_energy": true,
20
+ // content features
21
+ "extract_whisper_feature": false,
22
+ "whisper_sample_rate": 16000,
23
+ "extract_contentvec_feature": false,
24
+ "contentvec_sample_rate": 16000,
25
+ "extract_wenet_feature": false,
26
+ "wenet_sample_rate": 16000,
27
+ "extract_mert_feature": false,
28
+ "mert_sample_rate": 16000,
29
+ // Default config for whisper
30
+ "whisper_frameshift": 0.01,
31
+ "whisper_downsample_rate": 2,
32
+ // Default config for content vector
33
+ "contentvec_frameshift": 0.02,
34
+ // Default config for mert
35
+ "mert_model": "m-a-p/MERT-v1-330M",
36
+ "mert_feature_layer": -1,
37
+ "mert_hop_size": 320,
38
+ // 24k
39
+ "mert_frameshit": 0.01333,
40
+ // 10ms
41
+ "wenet_frameshift": 0.01,
42
+ // wenetspeech is 4, gigaspeech is 6
43
+ "wenet_downsample_rate": 4,
44
+ // Default config
45
+ "n_mel": 100,
46
+ "win_size": 1024,
47
+ // todo
48
+ "hop_size": 256,
49
+ "sample_rate": 24000,
50
+ "n_fft": 1024,
51
+ // todo
52
+ "fmin": 0,
53
+ "fmax": 12000,
54
+ // todo
55
+ "f0_min": 50,
56
+ // ~C2
57
+ "f0_max": 1100,
58
+ //1100, // ~C6(1100), ~G5(800)
59
+ "pitch_bin": 256,
60
+ "pitch_max": 1100.0,
61
+ "pitch_min": 50.0,
62
+ "is_label": true,
63
+ "is_mu_law": true,
64
+ "bits": 8,
65
+ "mel_min_max_stats_dir": "mel_min_max_stats",
66
+ "whisper_dir": "whisper",
67
+ "contentvec_dir": "contentvec",
68
+ "wenet_dir": "wenet",
69
+ "mert_dir": "mert",
70
+ // Extract content features using dataloader
71
+ "pin_memory": true,
72
+ "num_workers": 8,
73
+ "content_feature_batch_size": 16,
74
+ // Features used for model training
75
+ "use_mel": true,
76
+ "use_min_max_norm_mel": true,
77
+ "use_frame_pitch": true,
78
+ "use_uv": true,
79
+ "use_frame_energy": true,
80
+ "use_log_scale_pitch": false,
81
+ "use_log_scale_energy": false,
82
+ "use_spkid": true,
83
+ // Meta file
84
+ "train_file": "train.json",
85
+ "valid_file": "test.json",
86
+ "spk2id": "singers.json",
87
+ "utt2spk": "utt2singer"
88
+ },
89
+ "model": {
90
+ "condition_encoder": {
91
+ "merge_mode": "add",
92
+ "input_melody_dim": 1,
93
+ "use_log_f0": true,
94
+ "n_bins_melody": 256,
95
+ //# Quantization (0 for not quantization)
96
+ "output_melody_dim": 384,
97
+ "input_loudness_dim": 1,
98
+ "use_log_loudness": true,
99
+ "n_bins_loudness": 256,
100
+ "output_loudness_dim": 384,
101
+ "use_whisper": false,
102
+ "use_contentvec": false,
103
+ "use_wenet": false,
104
+ "use_mert": false,
105
+ "whisper_dim": 1024,
106
+ "contentvec_dim": 256,
107
+ "mert_dim": 256,
108
+ "wenet_dim": 512,
109
+ "content_encoder_dim": 384,
110
+ "output_singer_dim": 384,
111
+ "singer_table_size": 512,
112
+ "output_content_dim": 384,
113
+ "use_spkid": true
114
+ },
115
+ // FIXME: FOLLOWING ARE NEW!!
116
+ "diffusion": {
117
+ "scheduler": "ddpm",
118
+ "scheduler_settings": {
119
+ "num_train_timesteps": 1000,
120
+ "beta_start": 1.0e-4,
121
+ "beta_end": 0.02,
122
+ "beta_schedule": "linear"
123
+ },
124
+ // Diffusion steps encoder
125
+ "step_encoder": {
126
+ "dim_raw_embedding": 128,
127
+ "dim_hidden_layer": 512,
128
+ "activation": "SiLU",
129
+ "num_layer": 2,
130
+ "max_period": 10000
131
+ },
132
+ // Diffusion decoder
133
+ "model_type": "bidilconv",
134
+ // bidilconv, unet2d, TODO: unet1d
135
+ "bidilconv": {
136
+ "base_channel": 384,
137
+ "n_res_block": 20,
138
+ "conv_kernel_size": 3,
139
+ "dilation_cycle_length": 4,
140
+ // specially, 1 means no dilation
141
+ "conditioner_size": 384
142
+ },
143
+ "unet2d": {
144
+ "in_channels": 1,
145
+ "out_channels": 1,
146
+ "down_block_types": [
147
+ "CrossAttnDownBlock2D",
148
+ "CrossAttnDownBlock2D",
149
+ "CrossAttnDownBlock2D",
150
+ "DownBlock2D"
151
+ ],
152
+ "mid_block_type": "UNetMidBlock2DCrossAttn",
153
+ "up_block_types": [
154
+ "UpBlock2D",
155
+ "CrossAttnUpBlock2D",
156
+ "CrossAttnUpBlock2D",
157
+ "CrossAttnUpBlock2D"
158
+ ],
159
+ "only_cross_attention": false
160
+ }
161
+ }
162
+ },
163
+ // FIXME: FOLLOWING ARE NEW!!
164
+ "train": {
165
+ // Basic settings
166
+ "batch_size": 64,
167
+ "gradient_accumulation_step": 1,
168
+ "max_epoch": -1,
169
+ // -1 means no limit
170
+ "save_checkpoint_stride": [
171
+ 5,
172
+ 20
173
+ ],
174
+ // unit is epoch
175
+ "keep_last": [
176
+ 3,
177
+ -1
178
+ ],
179
+ // -1 means infinite, if one number will broadcast
180
+ "run_eval": [
181
+ false,
182
+ true
183
+ ],
184
+ // if one number will broadcast
185
+ // Fix the random seed
186
+ "random_seed": 10086,
187
+ // Batchsampler
188
+ "sampler": {
189
+ "holistic_shuffle": true,
190
+ "drop_last": true
191
+ },
192
+ // Dataloader
193
+ "dataloader": {
194
+ "num_worker": 32,
195
+ "pin_memory": true
196
+ },
197
+ // Trackers
198
+ "tracker": [
199
+ "tensorboard"
200
+ // "wandb",
201
+ // "cometml",
202
+ // "mlflow",
203
+ ],
204
+ // Optimizer
205
+ "optimizer": "AdamW",
206
+ "adamw": {
207
+ "lr": 4.0e-4
208
+ // nn model lr
209
+ },
210
+ // LR Scheduler
211
+ "scheduler": "ReduceLROnPlateau",
212
+ "reducelronplateau": {
213
+ "factor": 0.8,
214
+ "patience": 10,
215
+ // unit is epoch
216
+ "min_lr": 1.0e-4
217
+ }
218
+ },
219
+ "inference": {
220
+ "diffusion": {
221
+ "scheduler": "pndm",
222
+ "scheduler_settings": {
223
+ "num_inference_timesteps": 1000
224
+ }
225
+ }
226
+ }
227
+ }
config/fs2.json ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "config/tts.json",
3
+ "model_type": "FastSpeech2",
4
+ "task_type": "tts",
5
+ "dataset": ["LJSpeech"],
6
+ "preprocess": {
7
+ // acoustic features
8
+ "extract_audio": true,
9
+ "extract_mel": true,
10
+ "mel_extract_mode": "taco",
11
+ "mel_min_max_norm": false,
12
+ "extract_pitch": true,
13
+ "extract_uv": false,
14
+ "pitch_extractor": "dio",
15
+ "extract_energy": true,
16
+ "energy_extract_mode": "from_tacotron_stft",
17
+ "extract_duration": true,
18
+ "use_phone": true,
19
+ "pitch_norm": true,
20
+ "energy_norm": true,
21
+ "pitch_remove_outlier": true,
22
+ "energy_remove_outlier": true,
23
+
24
+ // Default config
25
+ "n_mel": 80,
26
+ "win_size": 1024, // todo
27
+ "hop_size": 256,
28
+ "sample_rate": 22050,
29
+ "n_fft": 1024, // todo
30
+ "fmin": 0,
31
+ "fmax": 8000, // todo
32
+ "raw_data": "raw_data",
33
+ "text_cleaners": ["english_cleaners"],
34
+ "f0_min": 71, // ~C2
35
+ "f0_max": 800, //1100, // ~C6(1100), ~G5(800)
36
+ "pitch_bin": 256,
37
+ "pitch_max": 1100.0,
38
+ "pitch_min": 50.0,
39
+ "is_label": true,
40
+ "is_mu_law": true,
41
+ "bits": 8,
42
+
43
+ "mel_min_max_stats_dir": "mel_min_max_stats",
44
+ "whisper_dir": "whisper",
45
+ "content_vector_dir": "content_vector",
46
+ "wenet_dir": "wenet",
47
+ "mert_dir": "mert",
48
+ "spk2id":"spk2id.json",
49
+ "utt2spk":"utt2spk",
50
+
51
+ // Features used for model training
52
+ "use_mel": true,
53
+ "use_min_max_norm_mel": false,
54
+ "use_frame_pitch": false,
55
+ "use_frame_energy": false,
56
+ "use_phone_pitch": true,
57
+ "use_phone_energy": true,
58
+ "use_log_scale_pitch": false,
59
+ "use_log_scale_energy": false,
60
+ "use_spkid": false,
61
+ "align_mel_duration": true,
62
+ "text_cleaners": ["english_cleaners"],
63
+ "phone_extractor": "lexicon", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)"
64
+ },
65
+ "model": {
66
+ // Settings for transformer
67
+ "transformer": {
68
+ "encoder_layer": 4,
69
+ "encoder_head": 2,
70
+ "encoder_hidden": 256,
71
+ "decoder_layer": 6,
72
+ "decoder_head": 2,
73
+ "decoder_hidden": 256,
74
+ "conv_filter_size": 1024,
75
+ "conv_kernel_size": [9, 1],
76
+ "encoder_dropout": 0.2,
77
+ "decoder_dropout": 0.2
78
+ },
79
+
80
+ // Settings for variance_predictor
81
+ "variance_predictor":{
82
+ "filter_size": 256,
83
+ "kernel_size": 3,
84
+ "dropout": 0.5
85
+ },
86
+ "variance_embedding":{
87
+ "pitch_quantization": "linear", // support 'linear' or 'log', 'log' is allowed only if the pitch values are not normalized during preprocessing
88
+ "energy_quantization": "linear", // support 'linear' or 'log', 'log' is allowed only if the energy values are not normalized during preprocessing
89
+ "n_bins": 256
90
+ },
91
+ "max_seq_len": 1000
92
+ },
93
+ "train":{
94
+ "batch_size": 16,
95
+ "sort_sample": true,
96
+ "drop_last": true,
97
+ "group_size": 4,
98
+ "grad_clip_thresh": 1.0,
99
+ "dataloader": {
100
+ "num_worker": 8,
101
+ "pin_memory": true
102
+ },
103
+ "lr_scheduler":{
104
+ "num_warmup": 4000
105
+ },
106
+ // LR Scheduler
107
+ "scheduler": "NoamLR",
108
+ // Optimizer
109
+ "optimizer": "Adam",
110
+ "adam": {
111
+ "lr": 0.0625,
112
+ "betas": [0.9, 0.98],
113
+ "eps": 0.000000001,
114
+ "weight_decay": 0.0
115
+ },
116
+ }
117
+
118
+ }
config/ns2.json ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "config/base.json",
3
+ "model_type": "NaturalSpeech2",
4
+ "dataset": ["LibriTTS"],
5
+ "preprocess": {
6
+ "use_mel": false,
7
+ "use_code": true,
8
+ "use_spkid": true,
9
+ "use_pitch": true,
10
+ "use_duration": true,
11
+ "use_phone": true,
12
+ "use_len": true,
13
+ "use_cross_reference": true,
14
+ "train_file": "train.json",
15
+ "melspec_dir": "mel",
16
+ "code_dir": "code",
17
+ "pitch_dir": "pitch",
18
+ "duration_dir": "duration",
19
+ "clip_mode": "start"
20
+ },
21
+ "model": {
22
+ "latent_dim": 128,
23
+ "prior_encoder": {
24
+ "vocab_size": 100,
25
+ "pitch_min": 50,
26
+ "pitch_max": 1100,
27
+ "pitch_bins_num": 512,
28
+ "encoder": {
29
+ "encoder_layer": 6,
30
+ "encoder_hidden": 512,
31
+ "encoder_head": 8,
32
+ "conv_filter_size": 2048,
33
+ "conv_kernel_size": 9,
34
+ "encoder_dropout": 0.2,
35
+ "use_cln": true
36
+ },
37
+ "duration_predictor": {
38
+ "input_size": 512,
39
+ "filter_size": 512,
40
+ "kernel_size": 3,
41
+ "conv_layers": 30,
42
+ "cross_attn_per_layer": 3,
43
+ "attn_head": 8,
44
+ "drop_out": 0.5
45
+ },
46
+ "pitch_predictor": {
47
+ "input_size": 512,
48
+ "filter_size": 512,
49
+ "kernel_size": 5,
50
+ "conv_layers": 30,
51
+ "cross_attn_per_layer": 3,
52
+ "attn_head": 8,
53
+ "drop_out": 0.5
54
+ }
55
+ },
56
+ "diffusion": {
57
+ "wavenet": {
58
+ "input_size": 128,
59
+ "hidden_size": 512,
60
+ "out_size": 128,
61
+ "num_layers": 40,
62
+ "cross_attn_per_layer": 3,
63
+ "dilation_cycle": 2,
64
+ "attn_head": 8,
65
+ "drop_out": 0.2
66
+ },
67
+ "beta_min": 0.05,
68
+ "beta_max": 20,
69
+ "sigma": 1.0,
70
+ "noise_factor": 1.0,
71
+ "ode_solver": "euler"
72
+ },
73
+ "prompt_encoder": {
74
+ "encoder_layer": 6,
75
+ "encoder_hidden": 512,
76
+ "encoder_head": 8,
77
+ "conv_filter_size": 2048,
78
+ "conv_kernel_size": 9,
79
+ "encoder_dropout": 0.2,
80
+ "use_cln": false
81
+ },
82
+ "query_emb": {
83
+ "query_token_num": 32,
84
+ "hidden_size": 512,
85
+ "head_num": 8
86
+ }
87
+ }
88
+ }
config/transformer.json ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "config/base.json",
3
+ "model_type": "Transformer",
4
+ "task_type": "svc",
5
+ "use_custom_dataset": false,
6
+ "preprocess": {
7
+ // data augmentations
8
+ "use_pitch_shift": false,
9
+ "use_formant_shift": false,
10
+ "use_time_stretch": false,
11
+ "use_equalizer": false,
12
+ // acoustic features
13
+ "extract_mel": true,
14
+ "mel_min_max_norm": true,
15
+ "extract_pitch": true,
16
+ "pitch_extractor": "parselmouth",
17
+ "extract_uv": true,
18
+ "extract_energy": true,
19
+ // content features
20
+ "extract_whisper_feature": false,
21
+ "whisper_sample_rate": 16000,
22
+ "extract_contentvec_feature": false,
23
+ "contentvec_sample_rate": 16000,
24
+ "extract_wenet_feature": false,
25
+ "wenet_sample_rate": 16000,
26
+ "extract_mert_feature": false,
27
+ "mert_sample_rate": 16000,
28
+ // Default config for whisper
29
+ "whisper_frameshift": 0.01,
30
+ "whisper_downsample_rate": 2,
31
+ // Default config for content vector
32
+ "contentvec_frameshift": 0.02,
33
+ // Default config for mert
34
+ "mert_model": "m-a-p/MERT-v1-330M",
35
+ "mert_feature_layer": -1,
36
+ "mert_hop_size": 320,
37
+ // 24k
38
+ "mert_frameshit": 0.01333,
39
+ // 10ms
40
+ "wenet_frameshift": 0.01,
41
+ // wenetspeech is 4, gigaspeech is 6
42
+ "wenet_downsample_rate": 4,
43
+ // Default config
44
+ "n_mel": 100,
45
+ "win_size": 1024,
46
+ // todo
47
+ "hop_size": 256,
48
+ "sample_rate": 24000,
49
+ "n_fft": 1024,
50
+ // todo
51
+ "fmin": 0,
52
+ "fmax": 12000,
53
+ // todo
54
+ "f0_min": 50,
55
+ // ~C2
56
+ "f0_max": 1100,
57
+ //1100, // ~C6(1100), ~G5(800)
58
+ "pitch_bin": 256,
59
+ "pitch_max": 1100.0,
60
+ "pitch_min": 50.0,
61
+ "is_label": true,
62
+ "is_mu_law": true,
63
+ "bits": 8,
64
+ "mel_min_max_stats_dir": "mel_min_max_stats",
65
+ "whisper_dir": "whisper",
66
+ "contentvec_dir": "contentvec",
67
+ "wenet_dir": "wenet",
68
+ "mert_dir": "mert",
69
+ // Extract content features using dataloader
70
+ "pin_memory": true,
71
+ "num_workers": 8,
72
+ "content_feature_batch_size": 16,
73
+ // Features used for model training
74
+ "use_mel": true,
75
+ "use_min_max_norm_mel": true,
76
+ "use_frame_pitch": true,
77
+ "use_uv": true,
78
+ "use_frame_energy": true,
79
+ "use_log_scale_pitch": false,
80
+ "use_log_scale_energy": false,
81
+ "use_spkid": true,
82
+ // Meta file
83
+ "train_file": "train.json",
84
+ "valid_file": "test.json",
85
+ "spk2id": "singers.json",
86
+ "utt2spk": "utt2singer"
87
+ },
88
+ "model": {
89
+ "condition_encoder": {
90
+ "merge_mode": "add",
91
+ "input_melody_dim": 1,
92
+ "use_log_f0": true,
93
+ "n_bins_melody": 256,
94
+ //# Quantization (0 for not quantization)
95
+ "output_melody_dim": 384,
96
+ "input_loudness_dim": 1,
97
+ "use_log_loudness": true,
98
+ "n_bins_loudness": 256,
99
+ "output_loudness_dim": 384,
100
+ "use_whisper": false,
101
+ "use_contentvec": true,
102
+ "use_wenet": false,
103
+ "use_mert": false,
104
+ "whisper_dim": 1024,
105
+ "contentvec_dim": 256,
106
+ "mert_dim": 256,
107
+ "wenet_dim": 512,
108
+ "content_encoder_dim": 384,
109
+ "output_singer_dim": 384,
110
+ "singer_table_size": 512,
111
+ "output_content_dim": 384,
112
+ "use_spkid": true
113
+ },
114
+ "transformer": {
115
+ "type": "conformer",
116
+ // 'conformer' or 'transformer'
117
+ "input_dim": 384,
118
+ "output_dim": 100,
119
+ "n_heads": 2,
120
+ "n_layers": 6,
121
+ "filter_channels": 512,
122
+ "dropout": 0.1,
123
+ }
124
+ },
125
+ "train": {
126
+ // Basic settings
127
+ "batch_size": 64,
128
+ "gradient_accumulation_step": 1,
129
+ "max_epoch": -1,
130
+ // -1 means no limit
131
+ "save_checkpoint_stride": [
132
+ 10,
133
+ 100
134
+ ],
135
+ // unit is epoch
136
+ "keep_last": [
137
+ 3,
138
+ -1
139
+ ],
140
+ // -1 means infinite, if one number will broadcast
141
+ "run_eval": [
142
+ false,
143
+ true
144
+ ],
145
+ // if one number will broadcast
146
+ // Fix the random seed
147
+ "random_seed": 10086,
148
+ // Batchsampler
149
+ "sampler": {
150
+ "holistic_shuffle": true,
151
+ "drop_last": true
152
+ },
153
+ // Dataloader
154
+ "dataloader": {
155
+ "num_worker": 32,
156
+ "pin_memory": true
157
+ },
158
+ // Trackers
159
+ "tracker": [
160
+ "tensorboard"
161
+ // "wandb",
162
+ // "cometml",
163
+ // "mlflow",
164
+ ],
165
+ // Optimizer
166
+ "optimizer": "AdamW",
167
+ "adamw": {
168
+ "lr": 4.0e-4
169
+ // nn model lr
170
+ },
171
+ // LR Scheduler
172
+ "scheduler": "ReduceLROnPlateau",
173
+ "reducelronplateau": {
174
+ "factor": 0.8,
175
+ "patience": 10,
176
+ // unit is epoch
177
+ "min_lr": 1.0e-4
178
+ }
179
+ }
180
+ }
config/tts.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "config/base.json",
3
+ "supported_model_type": [
4
+ "Fastspeech2",
5
+ "VITS",
6
+ "VALLE",
7
+ ],
8
+ "task_type": "tts",
9
+ "preprocess": {
10
+ "language": "en-us",
11
+ // linguistic features
12
+ "extract_phone": true,
13
+ "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)"
14
+ "lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
15
+ // Directory names of processed data or extracted features
16
+ "phone_dir": "phones",
17
+ "use_phone": true,
18
+ },
19
+ "model": {
20
+ "text_token_num": 512,
21
+ }
22
+
23
+ }
config/valle.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "config/tts.json",
3
+ "model_type": "VALLE",
4
+ "task_type": "tts",
5
+ "dataset": [
6
+ "libritts"
7
+ ],
8
+ "preprocess": {
9
+ "extract_phone": true,
10
+ "phone_extractor": "espeak", // phoneme extractor: espeak, pypinyin, pypinyin_initials_finals or lexicon
11
+ "extract_acoustic_token": true,
12
+ "acoustic_token_extractor": "Encodec", // acoustic token extractor: encodec, dac(todo)
13
+ "acoustic_token_dir": "acoutic_tokens",
14
+ "use_text": false,
15
+ "use_phone": true,
16
+ "use_acoustic_token": true,
17
+ "symbols_dict": "symbols.dict",
18
+ "min_duration": 0.5, // the duration lowerbound to filter the audio with duration < min_duration
19
+ "max_duration": 14, // the duration upper bound to filter the audio with duration > max_duration.
20
+ "sample_rate": 24000,
21
+ "codec_hop_size": 320
22
+ },
23
+ "model": {
24
+ "text_token_num": 512,
25
+ "audio_token_num": 1024,
26
+ "decoder_dim": 1024, // embedding dimension of the decoder model
27
+ "nhead": 16, // number of attention heads in the decoder layers
28
+ "num_decoder_layers": 12, // number of decoder layers
29
+ "norm_first": true, // pre or post Normalization.
30
+ "add_prenet": false, // whether add PreNet after Inputs
31
+ "prefix_mode": 0, // mode for how to prefix VALL-E NAR Decoder, 0: no prefix, 1: 0 to random, 2: random to random, 4: chunk of pre or post utterance
32
+ "share_embedding": true, // share the parameters of the output projection layer with the parameters of the acoustic embedding
33
+ "nar_scale_factor": 1, // model scale factor which will be assigned different meanings in different models
34
+ "prepend_bos": false, // whether prepend <BOS> to the acoustic tokens -> AR Decoder inputs
35
+ "num_quantizers": 8, // number of the audio quantization layers
36
+ // "scaling_xformers": false, // Apply Reworked Conformer scaling on Transformers
37
+ },
38
+ "train": {
39
+ "ddp": false,
40
+ "train_stage": 1, // 0: train all modules, For VALL_E, support 1: AR Decoder 2: NAR Decoder(s)
41
+ "max_epoch": 20,
42
+ "optimizer": "AdamW",
43
+ "scheduler": "cosine",
44
+ "warmup_steps": 16000, // number of steps that affects how rapidly the learning rate decreases
45
+ "base_lr": 1e-4, // base learning rate.
46
+ "valid_interval": 1000,
47
+ "log_epoch_step": 1000,
48
+ "save_checkpoint_stride": [
49
+ 1,
50
+ 1
51
+ ]
52
+ }
53
+ }
config/vits.json ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "config/tts.json",
3
+ "model_type": "VITS",
4
+ "task_type": "tts",
5
+ "preprocess": {
6
+ "extract_phone": true,
7
+ "extract_mel": true,
8
+ "n_mel": 80,
9
+ "fmin": 0,
10
+ "fmax": null,
11
+ "extract_linear_spec": true,
12
+ "extract_audio": true,
13
+ "use_linear": true,
14
+ "use_mel": true,
15
+ "use_audio": true,
16
+ "use_text": false,
17
+ "use_phone": true,
18
+ "lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
19
+ "n_fft": 1024,
20
+ "win_size": 1024,
21
+ "hop_size": 256,
22
+ "segment_size": 8192,
23
+ "text_cleaners": [
24
+ "english_cleaners"
25
+ ]
26
+ },
27
+ "model": {
28
+ "text_token_num": 512,
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0.1,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [
38
+ 3,
39
+ 7,
40
+ 11
41
+ ],
42
+ "resblock_dilation_sizes": [
43
+ [
44
+ 1,
45
+ 3,
46
+ 5
47
+ ],
48
+ [
49
+ 1,
50
+ 3,
51
+ 5
52
+ ],
53
+ [
54
+ 1,
55
+ 3,
56
+ 5
57
+ ]
58
+ ],
59
+ "upsample_rates": [
60
+ 8,
61
+ 8,
62
+ 2,
63
+ 2
64
+ ],
65
+ "upsample_initial_channel": 512,
66
+ "upsample_kernel_sizes": [
67
+ 16,
68
+ 16,
69
+ 4,
70
+ 4
71
+ ],
72
+ "n_layers_q": 3,
73
+ "use_spectral_norm": false,
74
+ "n_speakers": 0, // number of speakers, will be automatically set if n_speakers is 0 and multi_speaker_training is true
75
+ "gin_channels": 256,
76
+ "use_sdp": true
77
+ },
78
+ "train": {
79
+ "fp16_run": true,
80
+ "learning_rate": 2e-4,
81
+ "betas": [
82
+ 0.8,
83
+ 0.99
84
+ ],
85
+ "eps": 1e-9,
86
+ "batch_size": 16,
87
+ "lr_decay": 0.999875,
88
+ // "segment_size": 8192,
89
+ "init_lr_ratio": 1,
90
+ "warmup_epochs": 0,
91
+ "c_mel": 45,
92
+ "c_kl": 1.0,
93
+ "AdamW": {
94
+ "betas": [
95
+ 0.8,
96
+ 0.99
97
+ ],
98
+ "eps": 1e-9,
99
+ }
100
+ }
101
+ }
config/vitssvc.json ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "config/base.json",
3
+ "model_type": "VITS",
4
+ "task_type": "svc",
5
+ "preprocess": {
6
+ "extract_phone": false,
7
+ "extract_mel": true,
8
+ "extract_linear_spec": true,
9
+ "extract_audio": true,
10
+ "use_linear": true,
11
+ "use_mel": true,
12
+ "use_audio": true,
13
+ "use_text": false,
14
+ "use_phone": true,
15
+
16
+ "fmin": 0,
17
+ "fmax": null,
18
+ "f0_min": 50,
19
+ "f0_max": 1100,
20
+ // f0_bin in sovits
21
+ "pitch_bin": 256,
22
+ // filter_length in sovits
23
+ "n_fft": 2048,
24
+ // hop_length in sovits
25
+ "hop_size": 512,
26
+ // win_length in sovits
27
+ "win_size": 2048,
28
+ "segment_size": 8192,
29
+ "n_mel": 100,
30
+ "sample_rate": 44100,
31
+
32
+ "mel_min_max_stats_dir": "mel_min_max_stats",
33
+ "whisper_dir": "whisper",
34
+ "contentvec_dir": "contentvec",
35
+ "wenet_dir": "wenet",
36
+ "mert_dir": "mert",
37
+ },
38
+ "model": {
39
+ "condition_encoder": {
40
+ "merge_mode": "add",
41
+ "input_melody_dim": 1,
42
+ "use_log_f0": true,
43
+ "n_bins_melody": 256,
44
+ //# Quantization (0 for not quantization)
45
+ "output_melody_dim": 196,
46
+ "input_loudness_dim": 1,
47
+ "use_log_loudness": false,
48
+ "n_bins_loudness": 256,
49
+ "output_loudness_dim": 196,
50
+ "use_whisper": false,
51
+ "use_contentvec": false,
52
+ "use_wenet": false,
53
+ "use_mert": false,
54
+ "whisper_dim": 1024,
55
+ "contentvec_dim": 256,
56
+ "mert_dim": 256,
57
+ "wenet_dim": 512,
58
+ "content_encoder_dim": 196,
59
+ "output_singer_dim": 196,
60
+ "singer_table_size": 512,
61
+ "output_content_dim": 196,
62
+ "use_spkid": true
63
+ },
64
+ "vits": {
65
+ "filter_channels": 256,
66
+ "gin_channels": 256,
67
+ "hidden_channels": 192,
68
+ "inter_channels": 192,
69
+ "kernel_size": 3,
70
+ "n_flow_layer": 4,
71
+ "n_heads": 2,
72
+ "n_layers": 6,
73
+ "n_layers_q": 3,
74
+ "n_speakers": 512,
75
+ "p_dropout": 0.1,
76
+ "ssl_dim": 256,
77
+ "use_spectral_norm": false,
78
+ },
79
+ "generator": "hifigan",
80
+ "generator_config": {
81
+ "hifigan": {
82
+ "resblock": "1",
83
+ "resblock_kernel_sizes": [
84
+ 3,
85
+ 7,
86
+ 11
87
+ ],
88
+ "upsample_rates": [
89
+ 8,8,2,2,2
90
+ ],
91
+ "upsample_kernel_sizes": [
92
+ 16,16,4,4,4
93
+ ],
94
+ "upsample_initial_channel": 512,
95
+ "resblock_dilation_sizes": [
96
+ [1,3,5],
97
+ [1,3,5],
98
+ [1,3,5]
99
+ ]
100
+ },
101
+ "melgan": {
102
+ "ratios": [8, 8, 2, 2, 2],
103
+ "ngf": 32,
104
+ "n_residual_layers": 3,
105
+ "num_D": 3,
106
+ "ndf": 16,
107
+ "n_layers": 4,
108
+ "downsampling_factor": 4
109
+ },
110
+ "bigvgan": {
111
+ "resblock": "1",
112
+ "activation": "snakebeta",
113
+ "snake_logscale": true,
114
+ "upsample_rates": [
115
+ 8,8,2,2,2,
116
+ ],
117
+ "upsample_kernel_sizes": [
118
+ 16,16,4,4,4,
119
+ ],
120
+ "upsample_initial_channel": 512,
121
+ "resblock_kernel_sizes": [
122
+ 3,
123
+ 7,
124
+ 11
125
+ ],
126
+ "resblock_dilation_sizes": [
127
+ [1,3,5],
128
+ [1,3,5],
129
+ [1,3,5]
130
+ ]
131
+ },
132
+ "nsfhifigan": {
133
+ "resblock": "1",
134
+ "harmonic_num": 8,
135
+ "upsample_rates": [
136
+ 8,8,2,2,2,
137
+ ],
138
+ "upsample_kernel_sizes": [
139
+ 16,16,4,4,4,
140
+ ],
141
+ "upsample_initial_channel": 768,
142
+ "resblock_kernel_sizes": [
143
+ 3,
144
+ 7,
145
+ 11
146
+ ],
147
+ "resblock_dilation_sizes": [
148
+ [1,3,5],
149
+ [1,3,5],
150
+ [1,3,5]
151
+ ]
152
+ },
153
+ "apnet": {
154
+ "ASP_channel": 512,
155
+ "ASP_resblock_kernel_sizes": [3,7,11],
156
+ "ASP_resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
157
+ "ASP_input_conv_kernel_size": 7,
158
+ "ASP_output_conv_kernel_size": 7,
159
+
160
+ "PSP_channel": 512,
161
+ "PSP_resblock_kernel_sizes": [3,7,11],
162
+ "PSP_resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
163
+ "PSP_input_conv_kernel_size": 7,
164
+ "PSP_output_R_conv_kernel_size": 7,
165
+ "PSP_output_I_conv_kernel_size": 7,
166
+ }
167
+ },
168
+ },
169
+ "train": {
170
+ "fp16_run": true,
171
+ "learning_rate": 2e-4,
172
+ "betas": [
173
+ 0.8,
174
+ 0.99
175
+ ],
176
+ "eps": 1e-9,
177
+ "batch_size": 16,
178
+ "lr_decay": 0.999875,
179
+ // "segment_size": 8192,
180
+ "init_lr_ratio": 1,
181
+ "warmup_epochs": 0,
182
+ "c_mel": 45,
183
+ "c_kl": 1.0,
184
+ "AdamW": {
185
+ "betas": [
186
+ 0.8,
187
+ 0.99
188
+ ],
189
+ "eps": 1e-9,
190
+ }
191
+ }
192
+ }
config/vocoder.json ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "config/base.json",
3
+ "dataset": [
4
+ "LJSpeech",
5
+ "LibriTTS",
6
+ "opencpop",
7
+ "m4singer",
8
+ "svcc",
9
+ "svcceval",
10
+ "pjs",
11
+ "opensinger",
12
+ "popbutfy",
13
+ "nus48e",
14
+ "popcs",
15
+ "kising",
16
+ "csd",
17
+ "opera",
18
+ "vctk",
19
+ "lijian",
20
+ "cdmusiceval"
21
+ ],
22
+ "task_type": "vocoder",
23
+ "preprocess": {
24
+ // acoustic features
25
+ "extract_mel": true,
26
+ "extract_pitch": false,
27
+ "extract_uv": false,
28
+ "extract_audio": true,
29
+ "extract_label": false,
30
+ "extract_one_hot": false,
31
+ "extract_amplitude_phase": false,
32
+ "pitch_extractor": "parselmouth",
33
+ // Settings for data preprocessing
34
+ "n_mel": 100,
35
+ "win_size": 1024,
36
+ "hop_size": 256,
37
+ "sample_rate": 24000,
38
+ "n_fft": 1024,
39
+ "fmin": 0,
40
+ "fmax": 12000,
41
+ "f0_min": 50,
42
+ "f0_max": 1100,
43
+ "pitch_bin": 256,
44
+ "pitch_max": 1100.0,
45
+ "pitch_min": 50.0,
46
+ "is_mu_law": false,
47
+ "bits": 8,
48
+ "cut_mel_frame": 32,
49
+ // Directory names of processed data or extracted features
50
+ "spk2id": "singers.json",
51
+ // Features used for model training
52
+ "use_mel": true,
53
+ "use_frame_pitch": false,
54
+ "use_uv": false,
55
+ "use_audio": true,
56
+ "use_label": false,
57
+ "use_one_hot": false,
58
+ "train_file": "train.json",
59
+ "valid_file": "test.json"
60
+ },
61
+ "train": {
62
+ "random_seed": 114514,
63
+ "batch_size": 64,
64
+ "gradient_accumulation_step": 1,
65
+ "max_epoch": 1000000,
66
+ "save_checkpoint_stride": [
67
+ 20
68
+ ],
69
+ "run_eval": [
70
+ true
71
+ ],
72
+ "sampler": {
73
+ "holistic_shuffle": true,
74
+ "drop_last": true
75
+ },
76
+ "dataloader": {
77
+ "num_worker": 4,
78
+ "pin_memory": true
79
+ },
80
+ "tracker": [
81
+ "tensorboard"
82
+ ],
83
+ }
84
+ }
egs/datasets/README.md ADDED
@@ -0,0 +1,381 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Datasets Format
2
+
3
+ Amphion supports the following academic datasets (sorted alphabetically):
4
+
5
+ - [Datasets Format](#datasets-format)
6
+ - [AudioCaps](#audiocaps)
7
+ - [CSD](#csd)
8
+ - [KiSing](#kising)
9
+ - [LibriTTS](#libritts)
10
+ - [LJSpeech](#ljspeech)
11
+ - [M4Singer](#m4singer)
12
+ - [NUS-48E](#nus-48e)
13
+ - [Opencpop](#opencpop)
14
+ - [OpenSinger](#opensinger)
15
+ - [Opera](#opera)
16
+ - [PopBuTFy](#popbutfy)
17
+ - [PopCS](#popcs)
18
+ - [PJS](#pjs)
19
+ - [SVCC](#svcc)
20
+ - [VCTK](#vctk)
21
+
22
+ The download link and the file structure tree of each dataset are displayed as follows.
23
+
24
+ ## AudioCaps
25
+
26
+ AudioCaps is a dataset of around 44K audio-caption pairs, where each audio clip corresponds to a caption with rich semantic information. You can download the dataset [here](https://github.com/cdjkim/audiocaps). The file structure tree is like:
27
+
28
+ ```plaintext
29
+ [AudioCaps dataset path]
30
+ ┣ AudioCpas
31
+ ┃   ┣ wav
32
+ ┃ ┃ ┣ ---1_cCGK4M_0_10000.wav
33
+ ┃ ┃ ┣ ---lTs1dxhU_30000_40000.wav
34
+ ┃ ┃ ┣ ...
35
+ ```
36
+
37
+ ## CSD
38
+
39
+ The official CSD dataset can be downloaded [here](https://zenodo.org/records/4785016). The file structure tree is like:
40
+
41
+ ```plaintext
42
+ [CSD dataset path]
43
+ ┣ english
44
+ ┣ korean
45
+ ┣ utterances
46
+ ┃ ┣ en001a
47
+ ┃ ┃ ┣ {UtteranceID}.wav
48
+ ┃ ┣ en001b
49
+ ┃ ┣ en002a
50
+ ┃ ┣ en002b
51
+ ┃ ┣ ...
52
+ ┣ README
53
+ ```
54
+
55
+ ## KiSing
56
+
57
+ The official KiSing dataset can be downloaded [here](http://shijt.site/index.php/2021/05/16/kising-the-first-open-source-mandarin-singing-voice-synthesis-corpus/). The file structure tree is like:
58
+
59
+ ```plaintext
60
+ [KiSing dataset path]
61
+ ┣ clean
62
+ ┃ ┣ 421
63
+ ┃ ┣ 422
64
+ ┃ ┣ ...
65
+ ```
66
+
67
+ ## LibriTTS
68
+
69
+ The official LibriTTS dataset can be downloaded [here](https://www.openslr.org/60/). The file structure tree is like:
70
+
71
+ ```plaintext
72
+ [LibriTTS dataset path]
73
+ ┣ BOOKS.txt
74
+ ┣ CHAPTERS.txt
75
+ ┣ eval_sentences10.tsv
76
+ ┣ LICENSE.txt
77
+ ┣ NOTE.txt
78
+ ┣ reader_book.tsv
79
+ ┣ README_librispeech.txt
80
+ ┣ README_libritts.txt
81
+ ┣ speakers.tsv
82
+ ┣ SPEAKERS.txt
83
+ ┣ dev-clean (Subset)
84
+ ┃ ┣ 1272{Speaker_ID}
85
+ ┃ ┃ ┣ 128104 {Chapter_ID}
86
+ ┃ ┃ ┃ ┣ 1272_128104_000001_000000.normalized.txt
87
+ ┃ ┃ ┃ ┣ 1272_128104_000001_000000.original.txt
88
+ ┃ ┃ ┃ ┣ 1272_128104_000001_000000.wav
89
+ ┃ ┃ ┃ ┣ ...
90
+ ┃ ┃ ┃ ┣ 1272_128104.book.tsv
91
+ ┃ ┃ ┃ ┣ 1272_128104.trans.tsv
92
+ ┃ ┃ ┣ ...
93
+ ┃ ┣ ...
94
+ ┣ dev-other (Subset)
95
+ ┃ ┣ 116 (Speaker)
96
+ ┃ ┃ ┣ 288045 {Chapter_ID}
97
+ ┃ ┃ ┃ ┣ 116_288045_000003_000000.normalized.txt
98
+ ┃ ┃ ┃ ┣ 116_288045_000003_000000.original.txt
99
+ ┃ ┃ ┃ ┣ 116_288045_000003_000000.wav
100
+ ┃ ┃ ┃ ┣ ...
101
+ ┃ ┃ ┃ ┣ 116_288045.book.tsv
102
+ ┃ ┃ ┃ ┣ 116_288045.trans.tsv
103
+ ┃ ┃ ┣ ...
104
+ ┃ ┣ ...
105
+ ┃ ┣ ...
106
+ ┣ test-clean (Subset)
107
+ ┃ ┣ {Speaker_ID}
108
+ ┃ ┃ ┣ {Chapter_ID}
109
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
110
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
111
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
112
+ ┃ ┃ ┃ ┣ ...
113
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
114
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
115
+ ┃ ┃ ┣ ...
116
+ ┃ ┣ ...
117
+ ┣ test-other
118
+ ┃ ┣ {Speaker_ID}
119
+ ┃ ┃ ┣ {Chapter_ID}
120
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
121
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
122
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
123
+ ┃ ┃ ┃ ┣ ...
124
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
125
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
126
+ ┃ ┃ ┣ ...
127
+ ┃ ┣ ...
128
+ ┣ train-clean-100
129
+ ┃ ┣ {Speaker_ID}
130
+ ┃ ┃ ┣ {Chapter_ID}
131
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
132
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
133
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
134
+ ┃ ┃ ┃ ┣ ...
135
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
136
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
137
+ ┃ ┃ ┣ ...
138
+ ┃ ┣ ...
139
+ ┣ train-clean-360
140
+ ┃ ┣ {Speaker_ID}
141
+ ┃ ┃ ┣ {Chapter_ID}
142
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
143
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
144
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
145
+ ┃ ┃ ┃ ┣ ...
146
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
147
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
148
+ ┃ ┃ ┣ ...
149
+ ┃ ┣ ...
150
+ ┣ train-other-500
151
+ ┃ ┣ {Speaker_ID}
152
+ ┃ ┃ ┣ {Chapter_ID}
153
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
154
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
155
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
156
+ ┃ ┃ ┃ ┣ ...
157
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
158
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
159
+ ┃ ┃ ┣ ...
160
+ ┃ ┣ ...
161
+ ```
162
+
163
+
164
+ ## LJSpeech
165
+
166
+ The official LJSpeech dataset can be downloaded [here](https://keithito.com/LJ-Speech-Dataset/). The file structure tree is like:
167
+
168
+ ```plaintext
169
+ [LJSpeech dataset path]
170
+ ┣ metadata.csv
171
+ ┣ wavs
172
+ ┃ ┣ LJ001-0001.wav
173
+ ┃ ┣ LJ001-0002.wav
174
+ ┃ ┣ ...
175
+ ┣ README
176
+ ```
177
+
178
+ ## M4Singer
179
+
180
+ The official M4Singer dataset can be downloaded [here](https://drive.google.com/file/d/1xC37E59EWRRFFLdG3aJkVqwtLDgtFNqW/view). The file structure tree is like:
181
+
182
+ ```plaintext
183
+ [M4Singer dataset path]
184
+ ┣ {Singer_1}#{Song_1}
185
+ ┃ ┣ 0000.mid
186
+ ┃ ┣ 0000.TextGrid
187
+ ┃ ┣ 0000.wav
188
+ ┃ ┣ ...
189
+ ┣ {Singer_1}#{Song_2}
190
+ ┣ ...
191
+ ┣ {Singer_2}#{Song_1}
192
+ ┣ {Singer_2}#{Song_2}
193
+ ┣ ...
194
+ ┗ meta.json
195
+ ```
196
+
197
+ ## NUS-48E
198
+
199
+ The official NUS-48E dataset can be downloaded [here](https://drive.google.com/drive/folders/12pP9uUl0HTVANU3IPLnumTJiRjPtVUMx). The file structure tree is like:
200
+
201
+ ```plaintext
202
+ [NUS-48E dataset path]
203
+ ┣ {SpeakerID}
204
+ ┃ ┣ read
205
+ ┃ ┃ ┣ {SongID}.txt
206
+ ┃ ┃ ┣ {SongID}.wav
207
+ ┃ ┃ ┣ ...
208
+ ┃ ┣ sing
209
+ ┃ ┃ ┣ {SongID}.txt
210
+ ┃ ┃ ┣ {SongID}.wav
211
+ ┃ ┃ ┣ ...
212
+ ┣ ...
213
+ ┣ README.txt
214
+
215
+ ```
216
+
217
+ ## Opencpop
218
+
219
+ The official Opencpop dataset can be downloaded [here](https://wenet.org.cn/opencpop/). The file structure tree is like:
220
+
221
+ ```plaintext
222
+ [Opencpop dataset path]
223
+ ┣ midis
224
+ ┃ ┣ 2001.midi
225
+ ┃ ┣ 2002.midi
226
+ ┃ ┣ 2003.midi
227
+ ┃ ┣ ...
228
+ ┣ segments
229
+ ┃ ┣ wavs
230
+ ┃ ┃ ┣ 2001000001.wav
231
+ ┃ ┃ ┣ 2001000002.wav
232
+ ┃ ┃ ┣ 2001000003.wav
233
+ ┃ ┃ ┣ ...
234
+ ┃ ┣ test.txt
235
+ ┃ ┣ train.txt
236
+ ┃ ┗ transcriptions.txt
237
+ ┣ textgrids
238
+ ┃ ┣ 2001.TextGrid
239
+ ┃ ┣ 2002.TextGrid
240
+ ┃ ┣ 2003.TextGrid
241
+ ┃ ┣ ...
242
+ ┣ wavs
243
+ ┃ ┣ 2001.wav
244
+ ┃ ┣ 2002.wav
245
+ ┃ ┣ 2003.wav
246
+ ┃ ┣ ...
247
+ ┣ TERMS_OF_ACCESS
248
+ ┗ readme.md
249
+ ```
250
+
251
+ ## OpenSinger
252
+
253
+ The official OpenSinger dataset can be downloaded [here](https://drive.google.com/file/d/1EofoZxvalgMjZqzUEuEdleHIZ6SHtNuK/view). The file structure tree is like:
254
+
255
+ ```plaintext
256
+ [OpenSinger dataset path]
257
+ ┣ ManRaw
258
+ ┃ ┣ {Singer_1}_{Song_1}
259
+ ┃ ┃ ┣ {Singer_1}_{Song_1}_0.lab
260
+ ┃ ┃ ┣ {Singer_1}_{Song_1}_0.txt
261
+ ┃ ┃ ┣ {Singer_1}_{Song_1}_0.wav
262
+ ┃ ┃ ┣ ...
263
+ ┃ ┣ {Singer_1}_{Song_2}
264
+ ┃ ┣ ...
265
+ ┣ WomanRaw
266
+ ┣ LICENSE
267
+ ┗ README.md
268
+ ```
269
+
270
+ ## Opera
271
+
272
+ The official Opera dataset can be downloaded [here](http://isophonics.net/SingingVoiceDataset). The file structure tree is like:
273
+
274
+ ```plaintext
275
+ [Opera dataset path]
276
+ ┣ monophonic
277
+ ┃ ┣ chinese
278
+ ┃ ┃ ┣ {Gender}_{SingerID}
279
+ ┃ ┃ ┃ ┣ {Emotion}_{SongID}.wav
280
+ ┃ ┃ ┃ ┣ ...
281
+ ┃ ┃ ┣ ...
282
+ ┃ ┣ western
283
+ ┣ polyphonic
284
+ ┃ ┣ chinese
285
+ ┃ ┣ western
286
+ ┣ CrossculturalDataSet.xlsx
287
+ ```
288
+
289
+ ## PopBuTFy
290
+
291
+ The official PopBuTFy dataset can be downloaded [here](https://github.com/MoonInTheRiver/NeuralSVB). The file structure tree is like:
292
+
293
+ ```plaintext
294
+ [PopBuTFy dataset path]
295
+ ┣ data
296
+ ┃ ┣ {SingerID}#singing#{SongName}_Amateur
297
+ ┃ ┃ ┣ {SingerID}#singing#{SongName}_Amateur_{UtteranceID}.mp3
298
+ ┃ ┃ ┣ ...
299
+ ┃ ┣ {SingerID}#singing#{SongName}_Professional
300
+ ┃ ┃ ┣ {SingerID}#singing#{SongName}_Professional_{UtteranceID}.mp3
301
+ ┃ ┃ ┣ ...
302
+ ┣ text_labels
303
+ ┗ TERMS_OF_ACCESS
304
+ ```
305
+
306
+ ## PopCS
307
+
308
+ The official PopCS dataset can be downloaded [here](https://github.com/MoonInTheRiver/DiffSinger/blob/master/resources/apply_form.md). The file structure tree is like:
309
+
310
+ ```plaintext
311
+ [PopCS dataset path]
312
+ ┣ popcs
313
+ ┃ ┣ popcs-{SongName}
314
+ ┃ ┃ ┣ {UtteranceID}_ph.txt
315
+ ┃ ┃ ┣ {UtteranceID}_wf0.wav
316
+ ┃ ┃ ┣ {UtteranceID}.TextGrid
317
+ ┃ ┃ ┣ {UtteranceID}.txt
318
+ ┃ ┃ ┣ ...
319
+ ┃ ┣ ...
320
+ ┗ TERMS_OF_ACCESS
321
+ ```
322
+
323
+ ## PJS
324
+
325
+ The official PJS dataset can be downloaded [here](https://sites.google.com/site/shinnosuketakamichi/research-topics/pjs_corpus). The file structure tree is like:
326
+
327
+ ```plaintext
328
+ [PJS dataset path]
329
+ ┣ PJS_corpus_ver1.1
330
+ ┃ ┣ background_noise
331
+ ┃ ┣ pjs{SongID}
332
+ ┃ ┃ ┣ pjs{SongID}_song.wav
333
+ ┃ ┃ ┣ pjs{SongID}_speech.wav
334
+ ┃ ┃ ┣ pjs{SongID}.lab
335
+ ┃ ┃ ┣ pjs{SongID}.mid
336
+ ┃ ┃ ┣ pjs{SongID}.musicxml
337
+ ┃ ┃ ┣ pjs{SongID}.txt
338
+ ┃ ┣ ...
339
+ ```
340
+
341
+ ## SVCC
342
+
343
+ The official SVCC dataset can be downloaded [here](https://github.com/lesterphillip/SVCC23_FastSVC/tree/main/egs/generate_dataset). The file structure tree is like:
344
+
345
+ ```plaintext
346
+ [SVCC dataset path]
347
+ ┣ Data
348
+ ┃ ┣ CDF1
349
+ ┃ ┃ ┣ 10001.wav
350
+ ┃ ┃ ┣ 10002.wav
351
+ ┃ ┃ ┣ ...
352
+ ┃ ┣ CDM1
353
+ ┃ ┣ IDF1
354
+ ┃ ┣ IDM1
355
+ ┗ README.md
356
+ ```
357
+
358
+ ## VCTK
359
+
360
+ The official VCTK dataset can be downloaded [here](https://datashare.ed.ac.uk/handle/10283/3443). The file structure tree is like:
361
+
362
+ ```plaintext
363
+ [VCTK dataset path]
364
+ ┣ txt
365
+ ┃ ┣ {Speaker_1}
366
+ ┃ ┃ ┣ {Speaker_1}_001.txt
367
+ ┃ ┃ ┣ {Speaker_1}_002.txt
368
+ ┃ ┃ ┣ ...
369
+ ┃ ┣ {Speaker_2}
370
+ ┃ ┣ ...
371
+ ┣ wav48_silence_trimmed
372
+ ┃ ┣ {Speaker_1}
373
+ ┃ ┃ ┣ {Speaker_1}_001_mic1.flac
374
+ ┃ ┃ ┣ {Speaker_1}_001_mic2.flac
375
+ ┃ ┃ ┣ {Speaker_1}_002_mic1.flac
376
+ ┃ ┃ ┣ ...
377
+ ┃ ┣ {Speaker_2}
378
+ ┃ ┣ ...
379
+ ┣ speaker-info.txt
380
+ ┗ update.txt
381
+ ```
egs/metrics/README.md ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Amphion Evaluation Recipe
2
+
3
+ ## Supported Evaluation Metrics
4
+
5
+ Until now, Amphion Evaluation has supported the following objective metrics:
6
+
7
+ - **F0 Modeling**:
8
+ - F0 Pearson Coefficients (FPC)
9
+ - F0 Periodicity Root Mean Square Error (PeriodicityRMSE)
10
+ - F0 Root Mean Square Error (F0RMSE)
11
+ - Voiced/Unvoiced F1 Score (V/UV F1)
12
+ - **Energy Modeling**:
13
+ - Energy Root Mean Square Error (EnergyRMSE)
14
+ - Energy Pearson Coefficients (EnergyPC)
15
+ - **Intelligibility**:
16
+ - Character Error Rate (CER) based on [Whisper](https://github.com/openai/whisper)
17
+ - Word Error Rate (WER) based on [Whisper](https://github.com/openai/whisper)
18
+ - **Spectrogram Distortion**:
19
+ - Frechet Audio Distance (FAD)
20
+ - Mel Cepstral Distortion (MCD)
21
+ - Multi-Resolution STFT Distance (MSTFT)
22
+ - Perceptual Evaluation of Speech Quality (PESQ)
23
+ - Short Time Objective Intelligibility (STOI)
24
+ - Scale Invariant Signal to Distortion Ratio (SISDR)
25
+ - Scale Invariant Signal to Noise Ratio (SISNR)
26
+ - **Speaker Similarity**:
27
+ - Cosine similarity based on [Rawnet3](https://github.com/Jungjee/RawNet)
28
+ - Cosine similarity based on [WeSpeaker](https://github.com/wenet-e2e/wespeaker) (👨‍💻 developing)
29
+
30
+ We provide a recipe to demonstrate how to objectively evaluate your generated audios. There are three steps in total:
31
+
32
+ 1. Pretrained Models Preparation
33
+ 2. Audio Data Preparation
34
+ 3. Evaluation
35
+
36
+ ## 1. Pretrained Models Preparation
37
+
38
+ If you want to calculate `RawNet3` based speaker similarity, you need to download the pretrained model first, as illustrated [here](../../pretrained/README.md).
39
+
40
+ ## 2. Audio Data Preparation
41
+
42
+ Prepare reference audios and generated audios in two folders, the `ref_dir` contains the reference audio and the `gen_dir` contains the generated audio. Here is an example.
43
+
44
+ ```plaintext
45
+ ┣ {ref_dir}
46
+ ┃ ┣ sample1.wav
47
+ ┃ ┣ sample2.wav
48
+ ┣ {gen_dir}
49
+ ┃ ┣ sample1.wav
50
+ ┃ ┣ sample2.wav
51
+ ```
52
+
53
+ You have to make sure that the pairwise **reference audio and generated audio are named the same**, as illustrated above (sample1 to sample1, sample2 to sample2).
54
+
55
+ ## 3. Evaluation
56
+
57
+ Run the `run.sh` with specified reference folder, generated folder, dump folder and metrics.
58
+
59
+ ```bash
60
+ cd Amphion
61
+ sh egs/metrics/run.sh \
62
+ --reference_folder [Your path to the reference audios] \
63
+ --generated_folder [Your path to the generated audios] \
64
+ --dump_folder [Your path to dump the objective results] \
65
+ --metrics [The metrics you need] \
66
+ --fs [Optional. To calculate all metrics in the specified sampling rate]
67
+ ```
68
+
69
+ As for the metrics, an example is provided below:
70
+
71
+ ```bash
72
+ --metrics "mcd pesq fad"
73
+ ```
74
+
75
+ All currently available metrics keywords are listed below:
76
+
77
+ | Keys | Description |
78
+ | --------------------- | ------------------------------------------ |
79
+ | `fpc` | F0 Pearson Coefficients |
80
+ | `f0_periodicity_rmse` | F0 Periodicity Root Mean Square Error |
81
+ | `f0rmse` | F0 Root Mean Square Error |
82
+ | `v_uv_f1` | Voiced/Unvoiced F1 Score |
83
+ | `energy_rmse` | Energy Root Mean Square Error |
84
+ | `energy_pc` | Energy Pearson Coefficients |
85
+ | `cer` | Character Error Rate |
86
+ | `wer` | Word Error Rate |
87
+ | `speaker_similarity` | Cos Similarity based on RawNet3 |
88
+ | `fad` | Frechet Audio Distance |
89
+ | `mcd` | Mel Cepstral Distortion |
90
+ | `mstft` | Multi-Resolution STFT Distance |
91
+ | `pesq` | Perceptual Evaluation of Speech Quality |
92
+ | `si_sdr` | Scale Invariant Signal to Distortion Ratio |
93
+ | `si_snr` | Scale Invariant Signal to Noise Ratio |
94
+ | `stoi` | Short Time Objective Intelligibility |
egs/metrics/run.sh ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd "$(dirname "$0")" && pwd)
8
+ work_dir=$(dirname "$(dirname "$exp_dir")")
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Parse the Given Parameters from the Command Line ###########
15
+ options=$(getopt -o c:n:s --long gpu:,reference_folder:,generated_folder:,dump_folder:,metrics:,fs: -- "$@") || exit 1
16
+ eval set -- "$options"
17
+
18
+ while true; do
19
+ case $1 in
20
+ # Reference Audio Folder
21
+ --reference_folder) shift; ref_dir=$1 ; shift ;;
22
+ # Generated Audio Folder
23
+ --generated_folder) shift; deg_dir=$1 ; shift ;;
24
+ # Result Dumping Folder
25
+ --dump_folder) shift; dump_dir=$1 ; shift ;;
26
+ # Metrics to Compute
27
+ --metrics) shift; metrics=$1 ; shift ;;
28
+ # Sampling Rate
29
+ --fs) shift; fs=$1 ; shift ;;
30
+ --gpu) shift; gpu=$1 ; shift ;; # GPU ids, exported below as CUDA_VISIBLE_DEVICES
31
+ --) shift ; break ;;
32
+ *) echo "Invalid option: $1" >&2; exit 1 ;;
33
+ esac
34
+ done
35
+ # $metrics stays unquoted so a space-separated list splits into separate arguments; --fs is passed only when set.
36
+ ######## Calculate Objective Metrics ###########
37
+ CUDA_VISIBLE_DEVICES=$gpu python "$work_dir"/bins/calc_metrics.py \
38
+ --ref_dir "$ref_dir" \
39
+ --deg_dir "$deg_dir" \
40
+ --dump_dir "$dump_dir" \
41
+ --metrics $metrics \
42
+ ${fs:+--fs $fs}
egs/svc/DiffComoSVC/README.md ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Accelerating Diffusion-based Singing Voice Conversion through Consistency Distillation
2
+ <br>
3
+ <div align="center">
4
+ <img src="../../../imgs/svc/DiffComoSVC.png" width="90%">
5
+ </div>
6
+ <br>
7
+
8
+ This is an implementation of [Consistency Models](https://arxiv.org/abs/2303.01469) for accelerating diffusion-based singing voice conversion. The overall architecture follows "[Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (NeurIPS 2023 Workshop on Machine Learning for Audio), with only a slight modification applied to the acoustic model. Specifically,
9
+
10
+ * The acoustic model consists of a conformer, which generates a coarse spectrogram, and a diffusion decoder based on Bidirectional Non-Causal Dilated CNN, which refines that coarse spectrogram for better quality. This is similar to [CoMoSpeech: One-Step Speech and Singing Voice Synthesis via Consistency Model](https://comospeech.github.io/).
11
+ * To accelerate diffusion model, we apply consistency distillation from [Consistency Models](https://arxiv.org/abs/2303.01469). For teacher model, the diffusion schedule of the diffusion decoder follows [karras diffusion](https://arxiv.org/abs/2206.00364). For distilling teacher model, the condition encoder and the conformer part of acoustic model are frozen while the diffusion decoder model is updated via exponential moving average. See Figure above for details.
12
+
13
+ There are five stages in total:
14
+
15
+ 1. Data preparation
16
+ 2. Features extraction
17
+ 3. Teacher Model Training
18
+ 4. Consistency Distillation
19
+ 5. Inference/conversion
20
+
21
+ ## 1. Data Preparation
22
+
23
+ ### Dataset Download
24
+
25
+ By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).
26
+
27
+ ### Configuration
28
+
29
+ Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
30
+
31
+ ```json
32
+ "dataset": [
33
+ "m4singer",
34
+ "opencpop",
35
+ "opensinger",
36
+ "svcc",
37
+ "vctk"
38
+ ],
39
+ "dataset_path": {
40
+ // TODO: Fill in your dataset path
41
+ "m4singer": "[M4Singer dataset path]",
42
+ "opencpop": "[Opencpop dataset path]",
43
+ "opensinger": "[OpenSinger dataset path]",
44
+ "svcc": "[SVCC dataset path]",
45
+ "vctk": "[VCTK dataset path]"
46
+ },
47
+ ```
48
+
49
+ ## 2. Features Extraction
50
+
51
+ ### Content-based Pretrained Models Download
52
+
53
+ By default, we utilize the Whisper and ContentVec to extract content features. How to download them is detailed [here](../../../pretrained/README.md).
54
+
55
+ ### Configuration
56
+
57
+ Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:
58
+
59
+ ```json
60
+ // TODO: Fill in the output log path
61
+ "log_dir": "[Your path to save logs and checkpoints]",
62
+ "preprocess": {
63
+ // TODO: Fill in the output data path
64
+ "processed_dir": "[Your path to save processed data]",
65
+ ...
66
+ },
67
+ ```
68
+
69
+ ### Run
70
+
71
+ Run `run.sh` for the preprocessing stage (set `--stage 1`).
72
+
73
+ ```bash
74
+ cd Amphion
75
+ sh egs/svc/DiffComoSVC/run.sh --stage 1
76
+ ```
77
+
78
+ Note: `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "1"`.
79
+
80
+ ## 3. Teacher Model Training
81
+
82
+ ### Configuration
83
+
84
+ Set `distill` in `config/comosvc.json` to `false` for teacher model training. You can also specify the detailed configuration for the conformer encoder and the diffusion process here:
85
+
86
+ ```JSON
87
+ "comosvc":{
88
+ "distill": false,
89
+ // conformer encoder
90
+ "input_dim": 384,
91
+ "output_dim": 100,
92
+ "n_heads": 2,
93
+ "n_layers": 6,
94
+ "filter_channels":512,
95
+ // karras diffusion
96
+ "P_mean": -1.2,
97
+ "P_std": 1.2,
98
+ "sigma_data": 0.5,
99
+ "sigma_min": 0.002,
100
+ "sigma_max": 80,
101
+ "rho": 7,
102
+ "n_timesteps": 40,
103
+ },
104
+ ```
105
+
106
+ We provide the default hyperparameters in `exp_config.json`. They work on a single NVIDIA GPU with 24 GB of memory. You can adjust them based on your GPU machines.
107
+
108
+ ```json
109
+ "train": {
110
+ "batch_size": 32,
111
+ ...
112
+ "adamw": {
113
+ "lr": 2.0e-4
114
+ },
115
+ ...
116
+ }
117
+ ```
118
+
119
+ ### Run
120
+
121
+ Run `run.sh` for the training stage (set `--stage 2`). Specify an experiment name to run the following command. The tensorboard logs and checkpoints will be saved in `[Your path to save logs and checkpoints]/[YourExptName]`.
122
+
123
+ ```bash
124
+ cd Amphion
125
+ sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName]
126
+ ```
127
+
128
+ Note: `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can specify it when running `run.sh`, for example:
129
+
130
+ ```bash
131
+ cd Amphion
132
+ sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName] --gpu "0,1,2,3"
133
+ ```
134
+
135
+ ## 4. Consistency Distillation
136
+
137
+ ### Configuration
138
+
139
+ Set `distill` in `config/comosvc.json` to `true` for consistency distillation, and specify the `teacher_model_path` of the trained teacher model. You can also specify the detailed configuration for the conformer encoder and the diffusion process here:
140
+
141
+ ```JSON
142
+ "model": {
143
+ "teacher_model_path":"[Your_teacher_model_checkpoint].bin",
144
+ ...
145
+ "comosvc":{
146
+ "distill": true,
147
+ // conformer encoder
148
+ "input_dim": 384,
149
+ "output_dim": 100,
150
+ "n_heads": 2,
151
+ "n_layers": 6,
152
+ "filter_channels":512,
153
+ // karras diffusion
154
+ "P_mean": -1.2,
155
+ "P_std": 1.2,
156
+ "sigma_data": 0.5,
157
+ "sigma_min": 0.002,
158
+ "sigma_max": 80,
159
+ "rho": 7,
160
+ "n_timesteps": 40,
161
+ },
162
+ ```
163
+
164
+ We provide the default hyperparameters in `exp_config.json`. They work on a single NVIDIA GPU with 24 GB of memory. You can adjust them based on your GPU machines.
165
+
166
+ ```json
167
+ "train": {
168
+ "batch_size": 32,
169
+ ...
170
+ "adamw": {
171
+ "lr": 2.0e-4
172
+ },
173
+ ...
174
+ }
175
+ ```
176
+
177
+ ### Run
178
+
179
+ Run `run.sh` for the training stage (set `--stage 2`). Specify an experiment name to run the following command. The tensorboard logs and checkpoints will be saved in `[Your path to save logs and checkpoints]/[YourExptName]`.
180
+
181
+ ```bash
182
+ cd Amphion
183
+ sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName]
184
+ ```
185
+
186
+ Note: `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can specify it when running `run.sh`, for example:
187
+
188
+ ```bash
189
+ cd Amphion
190
+ sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName] --gpu "0,1,2,3"
191
+ ```
192
+
193
+ ## 5. Inference/Conversion
194
+
195
+ ### Pretrained Vocoder Download
196
+
197
+ We fine-tune the official BigVGAN pretrained model with over 120 hours of singing voice data. The benefits of fine-tuning have been investigated in our paper (see this [demo page](https://www.zhangxueyao.com/data/MultipleContentsSVC/vocoder.html)). The final pretrained singing voice vocoder is released [here](../../../pretrained/README.md#amphion-singing-bigvgan) (called `Amphion Singing BigVGAN`).
198
+
199
+ ### Run
200
+
201
+ For inference/conversion, you need to specify the following configurations when running `run.sh`:
202
+
203
+ | Parameters | Description | Example |
204
+ | --------------------------------------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ |
205
+ | `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `[Your path to save logs and checkpoints]/[YourExptName]` |
206
+ | `--infer_output_dir` | The output directory to save inferred audios. | `[Your path to save logs and checkpoints]/[YourExptName]/result` |
207
+ | `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `[Your path to save processed data]/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
208
+ | `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `[Your path to save logs and checkpoints]/[YourExptName]/singers.json` to choose a trained speaker. | For opencpop dataset, the speaker name would be `opencpop_female1`. |
209
+ | `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |
210
+
211
+ For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run:
212
+
213
+ ```bash
214
+ cd Amphion
215
+ sh egs/svc/DiffComoSVC/run.sh --stage 3 --gpu "0" \
216
+ --infer_expt_dir [Your path to save logs and checkpoints]/[YourExptName] \
217
+ --infer_output_dir [Your path to save logs and checkpoints]/[YourExptName]/result \
218
+ --infer_source_audio_dir [Your Audios Folder] \
219
+ --infer_target_speaker "opencpop_female1" \
220
+ --infer_key_shift "autoshift"
221
+ ```
222
+ Specifically, you can configure the number of inference steps for the teacher model by setting `inference` in `exp_config.json` (the student model always uses one-step sampling):
223
+ ```json
224
+ "inference": {
225
+ "comosvc": {
226
+ "inference_steps": 40
227
+ }
228
+ }
229
+ ```
230
+
231
+ # Reference
232
+ https://github.com/zhenye234/CoMoSpeech
233
+
234
+ https://github.com/openai/consistency_models
egs/svc/DiffComoSVC/exp_config.json ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "config/comosvc.json",
3
+ "model_type": "DiffComoSVC",
4
+ "dataset": [
5
+ "m4singer",
6
+ "opencpop",
7
+ "opensinger",
8
+ "svcc",
9
+ "vctk"
10
+ ],
11
+ "dataset_path": {
12
+ // TODO: Fill in your dataset path
13
+ "m4singer": "[M4Singer dataset path]",
14
+ "opencpop": "[Opencpop dataset path]",
15
+ "opensinger": "[OpenSinger dataset path]",
16
+ "svcc": "[SVCC dataset path]",
17
+ "vctk": "[VCTK dataset path]"
18
+ },
19
+ // TODO: Fill in the output log path
20
+ "log_dir": "[Your path to save logs and checkpoints]",
21
+ "preprocess": {
22
+ // TODO: Fill in the output data path
23
+ "processed_dir": "[Your path to save processed data]",
24
+ // Config for features extraction
25
+ "extract_mel": true,
26
+ "extract_pitch": true,
27
+ "extract_energy": true,
28
+ "extract_whisper_feature": true,
29
+ "extract_contentvec_feature": true,
30
+ "extract_wenet_feature": false,
31
+ "whisper_batch_size": 30, // decrease it if your GPU is out of memory
32
+ "contentvec_batch_size": 1,
33
+ // Fill in the content-based pretrained model's path
34
+ "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
35
+ "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
36
+ "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
37
+ "whisper_model": "medium",
38
+ "whisper_model_path": "pretrained/whisper/medium.pt",
39
+ // Config for features usage
40
+ "use_mel": true,
41
+ "use_min_max_norm_mel": true,
42
+ "use_frame_pitch": true,
43
+ "use_frame_energy": true,
44
+ "use_spkid": true,
45
+ "use_whisper": true,
46
+ "use_contentvec": true,
47
+ "use_wenet": false,
48
+ "n_mel": 100,
49
+ "sample_rate": 24000
50
+ },
51
+ "model": {
52
+ "teacher_model_path":"[Your_teacher_model_checkpoint].bin",
53
+ "condition_encoder": {
54
+ // Config for features usage
55
+ "use_whisper": true,
56
+ "use_contentvec": true,
57
+ "use_wenet": false,
58
+ "whisper_dim": 1024,
59
+ "contentvec_dim": 256,
60
+ "wenet_dim": 512,
61
+ "use_singer_encoder": false,
62
+ "pitch_min": 50,
63
+ "pitch_max": 1100
64
+ },
65
+ "comosvc":{
66
+ "distill": false,
67
+ // conformer encoder
68
+ "input_dim": 384,
69
+ "output_dim": 100,
70
+ "n_heads": 2,
71
+ "n_layers": 6,
72
+ "filter_channels":512,
73
+ "dropout":0.1,
74
+ // karras diffusion
75
+ "P_mean": -1.2,
76
+ "P_std": 1.2,
77
+ "sigma_data": 0.5,
78
+ "sigma_min": 0.002,
79
+ "sigma_max": 80,
80
+ "rho": 7,
81
+ "n_timesteps": 40,
82
+ },
83
+ "diffusion": {
84
+ // Diffusion steps encoder
85
+ "step_encoder": {
86
+ "dim_raw_embedding": 128,
87
+ "dim_hidden_layer": 512,
88
+ "activation": "SiLU",
89
+ "num_layer": 2,
90
+ "max_period": 10000
91
+ },
92
+ // Diffusion decoder
93
+ "model_type": "bidilconv",
94
+ // bidilconv, unet2d, TODO: unet1d
95
+ "bidilconv": {
96
+ "base_channel": 384,
97
+ "n_res_block": 20,
98
+ "conv_kernel_size": 3,
99
+ "dilation_cycle_length": 4,
100
+ // specially, 1 means no dilation
101
+ "conditioner_size": 100
102
+ }
103
+ }
104
+ },
105
+ "train": {
106
+ "batch_size": 64,
107
+ "gradient_accumulation_step": 1,
108
+ "max_epoch": -1, // -1 means no limit
109
+ "save_checkpoint_stride": [
110
+ 50,
111
+ 50
112
+ ],
113
+ "keep_last": [
114
+ 5,
115
+ -1
116
+ ],
117
+ "run_eval": [
118
+ false,
119
+ true
120
+ ],
121
+ "adamw": {
122
+ "lr": 4.0e-4
123
+ },
124
+ "reducelronplateau": {
125
+ "factor": 0.8,
126
+ "patience": 10,
127
+ "min_lr": 1.0e-4
128
+ },
129
+ "dataloader": {
130
+ "num_worker": 8,
131
+ "pin_memory": true
132
+ },
133
+ "sampler": {
134
+ "holistic_shuffle": false,
135
+ "drop_last": true
136
+ }
137
+ },
138
+ "inference": {
139
+ "comosvc": {
140
+ "inference_steps": 40
141
+ }
142
+ }
143
+ }
egs/svc/DiffComoSVC/run.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ ../_template/run.sh
egs/svc/MultipleContentsSVC/README.md ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion
2
+
3
+ [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2310.11160)
4
+ [![demo](https://img.shields.io/badge/SVC-Demo-red)](https://www.zhangxueyao.com/data/MultipleContentsSVC/index.html)
5
+
6
+ <br>
7
+ <div align="center">
8
+ <img src="../../../imgs/svc/MultipleContentsSVC.png" width="85%">
9
+ </div>
10
+ <br>
11
+
12
+ This is the official implementation of the paper "[Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (NeurIPS 2023 Workshop on Machine Learning for Audio). Specifically,
13
+
14
+ - The multiple content features are from [Whisper](https://github.com/openai/whisper) and [ContentVec](https://github.com/auspicious3000/contentvec).
15
+ - The acoustic model is based on Bidirectional Non-Causal Dilated CNN (called `DiffWaveNetSVC` in Amphion), which is similar to [WaveNet](https://arxiv.org/pdf/1609.03499.pdf), [DiffWave](https://openreview.net/forum?id=a-xFK8Ymz5J), and [DiffSVC](https://ieeexplore.ieee.org/document/9688219).
16
+ - The vocoder uses the [BigVGAN](https://github.com/NVIDIA/BigVGAN) architecture, and we fine-tuned it on over 120 hours of singing voice data.
17
+
18
+ There are four stages in total:
19
+
20
+ 1. Data preparation
21
+ 2. Features extraction
22
+ 3. Training
23
+ 4. Inference/conversion
24
+
25
+ > **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
26
+ > ```bash
27
+ > cd Amphion
28
+ > ```
29
+
30
+ ## 1. Data Preparation
31
+
32
+ ### Dataset Download
33
+
34
+ By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).
35
+
36
+ ### Configuration
37
+
38
+ Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
39
+
40
+ ```json
41
+ "dataset": [
42
+ "m4singer",
43
+ "opencpop",
44
+ "opensinger",
45
+ "svcc",
46
+ "vctk"
47
+ ],
48
+ "dataset_path": {
49
+ // TODO: Fill in your dataset path
50
+ "m4singer": "[M4Singer dataset path]",
51
+ "opencpop": "[Opencpop dataset path]",
52
+ "opensinger": "[OpenSinger dataset path]",
53
+ "svcc": "[SVCC dataset path]",
54
+ "vctk": "[VCTK dataset path]"
55
+ },
56
+ ```
57
+
58
+ ## 2. Features Extraction
59
+
60
+ ### Content-based Pretrained Models Download
61
+
62
+ By default, we utilize the Whisper and ContentVec to extract content features. How to download them is detailed [here](../../../pretrained/README.md).
63
+
64
+ ### Configuration
65
+
66
+ Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:
67
+
68
+ ```json
69
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
70
+ "log_dir": "ckpts/svc",
71
+ "preprocess": {
72
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
73
+ "processed_dir": "data",
74
+ ...
75
+ },
76
+ ```
77
+
78
+ ### Run
79
+
80
+ Run `run.sh` for the preprocessing stage (set `--stage 1`).
81
+
82
+ ```bash
83
+ sh egs/svc/MultipleContentsSVC/run.sh --stage 1
84
+ ```
85
+
86
+ > **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "1"`.
87
+
88
+ ## 3. Training
89
+
90
+ ### Configuration
91
+
92
+ We provide the default hyperparameters in `exp_config.json`. They work on a single NVIDIA GPU with 24 GB of memory. You can adjust them based on your GPU machines.
93
+
94
+ ```json
95
+ "train": {
96
+ "batch_size": 32,
97
+ ...
98
+ "adamw": {
99
+ "lr": 2.0e-4
100
+ },
101
+ ...
102
+ }
103
+ ```
104
+
105
+ ### Run
106
+
107
+ Run `run.sh` for the training stage (set `--stage 2`). Specify an experiment name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`.
108
+
109
+ ```bash
110
+ sh egs/svc/MultipleContentsSVC/run.sh --stage 2 --name [YourExptName]
111
+ ```
112
+
113
+ > **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "0,1,2,3"`.
114
+
115
+ ## 4. Inference/Conversion
116
+
117
+ ### Pretrained Vocoder Download
118
+
119
+ We fine-tune the official BigVGAN pretrained model with over 120 hours of singing voice data. The benefits of fine-tuning have been investigated in our paper (see this [demo page](https://www.zhangxueyao.com/data/MultipleContentsSVC/vocoder.html)). The final pretrained singing voice vocoder is released [here](../../../pretrained/README.md#amphion-singing-bigvgan) (called `Amphion Singing BigVGAN`).
120
+
121
+ ### Run
122
+
123
+ For inference/conversion, you need to specify the following configurations when running `run.sh`:
124
+
125
+ | Parameters | Description | Example |
126
+ | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
127
+ | `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/svc/[YourExptName]` |
128
+ | `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/svc/[YourExptName]/result` |
129
+ | `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `Amphion/data/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
130
+ | `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `Amphion/ckpts/svc/[YourExptName]/singers.json` to choose a trained speaker. | For opencpop dataset, the speaker name would be `opencpop_female1`. |
131
+ | `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |
132
+
133
+ For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run:
134
+
135
+ ```bash
136
+ sh egs/svc/MultipleContentsSVC/run.sh --stage 3 --gpu "0" \
137
+ --infer_expt_dir Amphion/ckpts/svc/[YourExptName] \
138
+ --infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \
139
+ --infer_source_audio_dir [Your Audios Folder] \
140
+ --infer_target_speaker "opencpop_female1" \
141
+ --infer_key_shift "autoshift"
142
+ ```
143
+
144
+ ## Citations
145
+
146
+ ```bibtex
147
+ @article{zhang2023leveraging,
148
+ title={Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion},
149
+ author={Zhang, Xueyao and Gu, Yicheng and Chen, Haopeng and Fang, Zihao and Zou, Lexiao and Xue, Liumeng and Wu, Zhizheng},
150
+ journal={Machine Learning for Audio Workshop, NeurIPS 2023},
151
+ year={2023}
152
+ }
153
+ ```
egs/svc/MultipleContentsSVC/exp_config.json ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "config/diffusion.json",
3
+ "model_type": "DiffWaveNetSVC",
4
+ "dataset": [
5
+ "m4singer",
6
+ "opencpop",
7
+ "opensinger",
8
+ "svcc",
9
+ "vctk"
10
+ ],
11
+ "dataset_path": {
12
+ // TODO: Fill in your dataset path
13
+ "m4singer": "[M4Singer dataset path]",
14
+ "opencpop": "[Opencpop dataset path]",
15
+ "opensinger": "[OpenSinger dataset path]",
16
+ "svcc": "[SVCC dataset path]",
17
+ "vctk": "[VCTK dataset path]"
18
+ },
19
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
20
+ "log_dir": "ckpts/svc",
21
+ "preprocess": {
22
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
23
+ "processed_dir": "data",
24
+ // Config for features extraction
25
+ "extract_mel": true,
26
+ "extract_pitch": true,
27
+ "extract_energy": true,
28
+ "extract_whisper_feature": true,
29
+ "extract_contentvec_feature": true,
30
+ "extract_wenet_feature": false,
31
+ "whisper_batch_size": 30, // decrease it if your GPU is out of memory
32
+ "contentvec_batch_size": 1,
33
+ // Fill in the content-based pretrained model's path
34
+ "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
35
+ "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
36
+ "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
37
+ "whisper_model": "medium",
38
+ "whisper_model_path": "pretrained/whisper/medium.pt",
39
+ // Config for features usage
40
+ "use_mel": true,
41
+ "use_min_max_norm_mel": true,
42
+ "use_frame_pitch": true,
43
+ "use_frame_energy": true,
44
+ "use_spkid": true,
45
+ "use_whisper": true,
46
+ "use_contentvec": true,
47
+ "use_wenet": false,
48
+ "n_mel": 100,
49
+ "sample_rate": 24000
50
+ },
51
+ "model": {
52
+ "condition_encoder": {
53
+ // Config for features usage
54
+ "use_whisper": true,
55
+ "use_contentvec": true,
56
+ "use_wenet": false,
57
+ "whisper_dim": 1024,
58
+ "contentvec_dim": 256,
59
+ "wenet_dim": 512,
60
+ "use_singer_encoder": false,
61
+ "pitch_min": 50,
62
+ "pitch_max": 1100
63
+ },
64
+ "diffusion": {
65
+ "scheduler": "ddpm",
66
+ "scheduler_settings": {
67
+ "num_train_timesteps": 1000,
68
+ "beta_start": 1.0e-4,
69
+ "beta_end": 0.02,
70
+ "beta_schedule": "linear"
71
+ },
72
+ // Diffusion steps encoder
73
+ "step_encoder": {
74
+ "dim_raw_embedding": 128,
75
+ "dim_hidden_layer": 512,
76
+ "activation": "SiLU",
77
+ "num_layer": 2,
78
+ "max_period": 10000
79
+ },
80
+ // Diffusion decoder
81
+ "model_type": "bidilconv",
82
+ // bidilconv, unet2d, TODO: unet1d
83
+ "bidilconv": {
84
+ "base_channel": 512,
85
+ "n_res_block": 40,
86
+ "conv_kernel_size": 3,
87
+ "dilation_cycle_length": 4,
88
+ // specially, 1 means no dilation
89
+ "conditioner_size": 384
90
+ }
91
+ }
92
+ },
93
+ "train": {
94
+ "batch_size": 32,
95
+ "gradient_accumulation_step": 1,
96
+ "max_epoch": -1, // -1 means no limit
97
+ "save_checkpoint_stride": [
98
+ 3,
99
+ 50
100
+ ],
101
+ "keep_last": [
102
+ 3,
103
+ 2
104
+ ],
105
+ "run_eval": [
106
+ true,
107
+ true
108
+ ],
109
+ "adamw": {
110
+ "lr": 2.0e-4
111
+ },
112
+ "reducelronplateau": {
113
+ "factor": 0.8,
114
+ "patience": 30,
115
+ "min_lr": 1.0e-4
116
+ },
117
+ "dataloader": {
118
+ "num_worker": 8,
119
+ "pin_memory": true
120
+ },
121
+ "sampler": {
122
+ "holistic_shuffle": false,
123
+ "drop_last": true
124
+ }
125
+ }
126
+ }
egs/svc/MultipleContentsSVC/run.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ ../_template/run.sh
egs/svc/README.md ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Amphion Singing Voice Conversion (SVC) Recipe
2
+
3
+ ## Quick Start
4
+
5
+ We provide a **[beginner recipe](MultipleContentsSVC)** to demonstrate how to train a cutting edge SVC model. Specifically, it is also an official implementation of the paper "[Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (NeurIPS 2023 Workshop on Machine Learning for Audio). Some demos can be seen [here](https://www.zhangxueyao.com/data/MultipleContentsSVC/index.html).
6
+
7
+ ## Supported Model Architectures
8
+
9
+ The main idea of SVC is to first disentangle the speaker-agnostic representations from the source audio, and then inject the desired speaker information to synthesize the target, which usually utilizes an acoustic decoder and a subsequent waveform synthesizer (vocoder):
10
+
11
+ <br>
12
+ <div align="center">
13
+ <img src="../../imgs/svc/pipeline.png" width="70%">
14
+ </div>
15
+ <br>
16
+
17
+ Until now, Amphion SVC has supported the following features and models:
18
+
19
+ - **Speaker-agnostic Representations**:
20
+ - Content Features: Sourcing from [WeNet](https://github.com/wenet-e2e/wenet), [Whisper](https://github.com/openai/whisper), and [ContentVec](https://github.com/auspicious3000/contentvec).
21
+ - Prosody Features: F0 and energy.
22
+ - **Speaker Embeddings**:
23
+ - Speaker Look-Up Table.
24
+ - Reference Encoder (👨‍💻 developing): It can be used for zero-shot SVC.
25
+ - **Acoustic Decoders**:
26
+ - Diffusion-based models:
27
+ - **[DiffWaveNetSVC](MultipleContentsSVC)**: The encoder is based on Bidirectional Non-Causal Dilated CNN, which is similar to [WaveNet](https://arxiv.org/pdf/1609.03499.pdf), [DiffWave](https://openreview.net/forum?id=a-xFK8Ymz5J), and [DiffSVC](https://ieeexplore.ieee.org/document/9688219).
28
+ - **[DiffComoSVC](DiffComoSVC)** (👨‍💻 developing): The diffusion framework is based on [Consistency Model](https://proceedings.mlr.press/v202/song23a.html). It can significantly accelerate the inference process of the diffusion model.
29
+ - Transformer-based models:
30
+ - **[TransformerSVC](TransformerSVC)**: Encoder-only and Non-autoregressive Transformer Architecture.
31
+ - VAE- and Flow-based models:
32
+ - **[VitsSVC](VitsSVC)**: It is designed as a [VITS](https://arxiv.org/abs/2106.06103)-like model whose textual input is replaced by the content features, which is similar to [so-vits-svc](https://github.com/svc-develop-team/so-vits-svc).
33
+ - **Waveform Synthesizers (Vocoders)**:
34
+ - The supported vocoders can be seen in [Amphion Vocoder Recipe](../vocoder/README.md).
egs/svc/TransformerSVC/README.md ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Transformer for Singing Voice Conversion
2
+
3
+ This is an implementation of **vanilla transformer encoder**/**conformer** as acoustic model for singing voice conversion.
4
+
5
+ There are four stages in total:
6
+
7
+ 1. Data preparation
8
+ 2. Features extraction
9
+ 3. Training
10
+ 4. Inference/conversion
11
+
12
+ > **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
13
+ > ```bash
14
+ > cd Amphion
15
+ > ```
16
+
17
+ ## 1. Data Preparation
18
+
19
+ ### Dataset Download
20
+
21
+ By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).
22
+
23
+ ### Configuration
24
+
25
+ Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
26
+
27
+ ```json
28
+ "dataset": [
29
+ "m4singer",
30
+ "opencpop",
31
+ "opensinger",
32
+ "svcc",
33
+ "vctk"
34
+ ],
35
+ "dataset_path": {
36
+ // TODO: Fill in your dataset path
37
+ "m4singer": "[M4Singer dataset path]",
38
+ "opencpop": "[Opencpop dataset path]",
39
+ "opensinger": "[OpenSinger dataset path]",
40
+ "svcc": "[SVCC dataset path]",
41
+ "vctk": "[VCTK dataset path]"
42
+ },
43
+ ```
44
+
45
+ ## 2. Features Extraction
46
+
47
+ ### Content-based Pretrained Models Download
48
+
49
+ By default, we utilize the Whisper and ContentVec to extract content features. How to download them is detailed [here](../../../pretrained/README.md).
50
+
51
+ ### Configuration
52
+
53
+ Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:
54
+
55
+ ```json
56
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
57
+ "log_dir": "ckpts/svc",
58
+ "preprocess": {
59
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
60
+ "processed_dir": "data",
61
+ ...
62
+ },
63
+ ```
64
+
65
+ ### Run
66
+
67
+ Run the `run.sh` as the preprocess stage (set `--stage 1`).
68
+
69
+ ```bash
70
+ sh egs/svc/TransformerSVC/run.sh --stage 1
71
+ ```
72
+
73
+ > **NOTE:** The `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "1"`.
74
+
75
+ ## 3. Training
76
+
77
+ ### Configuration
78
+ Specify the detailed configuration for transformer block in `exp_config.json`. For key `type`, `conformer` and `transformer` are supported:
79
+ ```json
80
+ "model": {
81
+ ...
82
+ "transformer":{
83
+ // 'conformer' or 'transformer'
84
+ "type": "conformer",
85
+ "input_dim": 384,
86
+ "output_dim": 100,
87
+ "n_heads": 2,
88
+ "n_layers": 6,
89
+ "filter_channels":512,
90
+ "dropout":0.1,
91
+ }
92
+ }
93
+ ```
94
+ We provide the default hyperparameters in the `exp_config.json`. They can work on a single NVIDIA-24g GPU. You can adjust them based on your GPU machines.
95
+
96
+ ```json
97
+ "train": {
98
+ "batch_size": 32,
99
+ ...
100
+ "adamw": {
101
+ "lr": 2.0e-4
102
+ },
103
+ ...
104
+ }
105
+ ```
106
+
107
+ ### Run
108
+
109
+ Run the `run.sh` as the training stage (set `--stage 2`). Specify an experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`.
110
+
111
+ ```bash
112
+ sh egs/svc/TransformerSVC/run.sh --stage 2 --name [YourExptName]
113
+ ```
114
+
115
+ > **NOTE:** The `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "0,1,2,3"`.
116
+
117
+ ## 4. Inference/Conversion
118
+
119
+ ### Pretrained Vocoder Download
120
+
121
+ We fine-tune the official BigVGAN pretrained model with over 120 hours of singing voice data. The benefits of fine-tuning have been investigated in our paper (see this [demo page](https://www.zhangxueyao.com/data/MultipleContentsSVC/vocoder.html)). The final pretrained singing voice vocoder is released [here](../../../pretrained/README.md#amphion-singing-bigvgan) (called `Amphion Singing BigVGAN`).
122
+
123
+ ### Run
124
+
125
+ For inference/conversion, you need to specify the following configurations when running `run.sh`:
126
+
127
+ | Parameters | Description | Example |
128
+ | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
129
+ | `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/svc/[YourExptName]` |
130
+ | `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/svc/[YourExptName]/result` |
131
+ | `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `Amphion/data/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
132
+ | `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `Amphion/ckpts/svc/[YourExptName]/singers.json` to choose a trained speaker. | For opencpop dataset, the speaker name would be `opencpop_female1`. |
133
+ | `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |
134
+
135
+ For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run:
136
+
137
+ ```bash
138
+ cd Amphion
139
+ sh egs/svc/TransformerSVC/run.sh --stage 3 --gpu "0" \
140
+ --infer_expt_dir Amphion/ckpts/svc/[YourExptName] \
141
+ --infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \
142
+ --infer_source_audio_dir [Your Audios Folder] \
143
+ --infer_target_speaker "opencpop_female1" \
144
+ --infer_key_shift "autoshift"
145
+ ```
146
+
147
+ ## Citations
148
+
149
+ ```bibtex
150
+ @inproceedings{transformer,
151
+ author = {Ashish Vaswani and
152
+ Noam Shazeer and
153
+ Niki Parmar and
154
+ Jakob Uszkoreit and
155
+ Llion Jones and
156
+ Aidan N. Gomez and
157
+ Lukasz Kaiser and
158
+ Illia Polosukhin},
159
+ title = {Attention is All you Need},
160
+ booktitle = {{NIPS}},
161
+ pages = {5998--6008},
162
+ year = {2017}
163
+ }
164
+ ```
egs/svc/TransformerSVC/exp_config.json ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "config/transformer.json",
3
+ "model_type": "TransformerSVC",
4
+ "dataset": [
5
+ "m4singer",
6
+ "opencpop",
7
+ "opensinger",
8
+ "svcc",
9
+ "vctk"
10
+ ],
11
+ "dataset_path": {
12
+ // TODO: Fill in your dataset path
13
+ "m4singer": "[M4Singer dataset path]",
14
+ "opencpop": "[Opencpop dataset path]",
15
+ "opensinger": "[OpenSinger dataset path]",
16
+ "svcc": "[SVCC dataset path]",
17
+ "vctk": "[VCTK dataset path]"
18
+ },
19
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
20
+ "log_dir": "ckpts/svc",
21
+ "preprocess": {
22
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
23
+ "processed_dir": "data",
24
+ // Config for features extraction
25
+ "extract_mel": true,
26
+ "extract_pitch": true,
27
+ "extract_energy": true,
28
+ "extract_whisper_feature": true,
29
+ "extract_contentvec_feature": true,
30
+ "extract_wenet_feature": false,
31
+ "whisper_batch_size": 30, // decrease it if your GPU is out of memory
32
+ "contentvec_batch_size": 1,
33
+ // Fill in the content-based pretrained model's path
34
+ "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
35
+ "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
36
+ "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
37
+ "whisper_model": "medium",
38
+ "whisper_model_path": "pretrained/whisper/medium.pt",
39
+ // Config for features usage
40
+ "use_mel": true,
41
+ "use_min_max_norm_mel": true,
42
+ "use_frame_pitch": true,
43
+ "use_frame_energy": true,
44
+ "use_spkid": true,
45
+ "use_whisper": true,
46
+ "use_contentvec": true,
47
+ "use_wenet": false,
48
+ "n_mel": 100,
49
+ "sample_rate": 24000
50
+ },
51
+ "model": {
52
+ "condition_encoder": {
53
+ // Config for features usage
54
+ "use_whisper": true,
55
+ "use_contentvec": true,
56
+ "use_wenet": false,
57
+ "whisper_dim": 1024,
58
+ "contentvec_dim": 256,
59
+ "wenet_dim": 512,
60
+ "use_singer_encoder": false,
61
+ "pitch_min": 50,
62
+ "pitch_max": 1100
63
+ },
64
+ "transformer": {
65
+ // 'conformer' or 'transformer'
66
+ "type": "conformer",
67
+ "input_dim": 384,
68
+ "output_dim": 100,
69
+ "n_heads": 2,
70
+ "n_layers": 6,
71
+ "filter_channels": 512,
72
+ "dropout": 0.1,
73
+ }
74
+ },
75
+ "train": {
76
+ "batch_size": 64,
77
+ "gradient_accumulation_step": 1,
78
+ "max_epoch": -1, // -1 means no limit
79
+ "save_checkpoint_stride": [
80
+ 50,
81
+ 50
82
+ ],
83
+ "keep_last": [
84
+ 5,
85
+ -1
86
+ ],
87
+ "run_eval": [
88
+ false,
89
+ true
90
+ ],
91
+ "adamw": {
92
+ "lr": 4.0e-4
93
+ },
94
+ "reducelronplateau": {
95
+ "factor": 0.8,
96
+ "patience": 10,
97
+ "min_lr": 1.0e-4
98
+ },
99
+ "dataloader": {
100
+ "num_worker": 8,
101
+ "pin_memory": true
102
+ },
103
+ "sampler": {
104
+ "holistic_shuffle": false,
105
+ "drop_last": true
106
+ }
107
+ }
108
+ }
egs/svc/TransformerSVC/run.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ ../_template/run.sh
egs/svc/VitsSVC/README.md ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # VITS for Singing Voice Conversion
2
+
3
+ This is an implementation of VITS as acoustic model for end-to-end singing voice conversion. Adapted from [so-vits-svc](https://github.com/svc-develop-team/so-vits-svc), SoftVC content encoder is used to extract content features from the source audio. These feature vectors are directly fed into VITS without the need for conversion to a text-based intermediate representation.
4
+
5
+ There are four stages in total:
6
+
7
+ 1. Data preparation
8
+ 2. Features extraction
9
+ 3. Training
10
+ 4. Inference/conversion
11
+
12
+ > **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
13
+ > ```bash
14
+ > cd Amphion
15
+ > ```
16
+
17
+ ## 1. Data Preparation
18
+
19
+ ### Dataset Download
20
+
21
+ By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).
22
+
23
+ ### Configuration
24
+
25
+ Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
26
+
27
+ ```json
28
+ "dataset": [
29
+ "m4singer",
30
+ "opencpop",
31
+ "opensinger",
32
+ "svcc",
33
+ "vctk"
34
+ ],
35
+ "dataset_path": {
36
+ // TODO: Fill in your dataset path
37
+ "m4singer": "[M4Singer dataset path]",
38
+ "opencpop": "[Opencpop dataset path]",
39
+ "opensinger": "[OpenSinger dataset path]",
40
+ "svcc": "[SVCC dataset path]",
41
+ "vctk": "[VCTK dataset path]"
42
+ },
43
+ ```
44
+
45
+ ## 2. Features Extraction
46
+
47
+ ### Content-based Pretrained Models Download
48
+
49
+ By default, we utilize ContentVec and Whisper to extract content features. How to download them is detailed [here](../../../pretrained/README.md).
50
+
51
+ ### Configuration
52
+
53
+ Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:
54
+
55
+ ```json
56
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
57
+ "log_dir": "ckpts/svc",
58
+ "preprocess": {
59
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
60
+ "processed_dir": "data",
61
+ ...
62
+ },
63
+ ```
64
+
65
+ ### Run
66
+
67
+ Run the `run.sh` as the preprocess stage (set `--stage 1`).
68
+
69
+ ```bash
70
+ sh egs/svc/VitsSVC/run.sh --stage 1
71
+ ```
72
+
73
+ > **NOTE:** The `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "1"`.
74
+
75
+ ## 3. Training
76
+
77
+ ### Configuration
78
+
79
+ We provide the default hyperparameters in the `exp_config.json`. They can work on a single NVIDIA-24g GPU. You can adjust them based on your GPU machines.
80
+
81
+ ```json
82
+ "train": {
83
+ "batch_size": 32,
84
+ ...
85
+ "adamw": {
86
+ "lr": 2.0e-4
87
+ },
88
+ ...
89
+ }
90
+ ```
91
+
92
+ ### Run
93
+
94
+ Run the `run.sh` as the training stage (set `--stage 2`). Specify an experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`.
95
+
96
+ ```bash
97
+ sh egs/svc/VitsSVC/run.sh --stage 2 --name [YourExptName]
98
+ ```
99
+
100
+ > **NOTE:** The `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "0,1,2,3"`.
101
+
102
+ ## 4. Inference/Conversion
103
+
104
+ ### Run
105
+
106
+ For inference/conversion, you need to specify the following configurations when running `run.sh`:
107
+
108
+ | Parameters | Description | Example |
109
+ | --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
110
+ | `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `[Your path to save logs and checkpoints]/[YourExptName]` |
111
+ | `--infer_output_dir` | The output directory to save inferred audios. | `[Your path to save logs and checkpoints]/[YourExptName]/result` |
112
+ | `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `[Your path to save processed data]/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
113
+ | `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `[Your path to save logs and checkpoints]/[YourExptName]/singers.json` to choose a trained speaker. | For opencpop dataset, the speaker name would be `opencpop_female1`. |
114
+ | `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |
115
+
116
+ For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run:
117
+
118
+ ```bash
119
+ sh egs/svc/VitsSVC/run.sh --stage 3 --gpu "0" \
120
+ --infer_expt_dir Amphion/ckpts/svc/[YourExptName] \
121
+ --infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \
122
+ --infer_source_audio_dir [Your Audios Folder] \
123
+ --infer_target_speaker "opencpop_female1" \
124
+ --infer_key_shift "autoshift"
125
+ ```
egs/svc/VitsSVC/exp_config.json ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "config/vitssvc.json",
3
+ "model_type": "VitsSVC",
4
+ "dataset": [
5
+ "m4singer",
6
+ "opencpop",
7
+ "opensinger",
8
+ "svcc",
9
+ "vctk"
10
+ ],
11
+ "dataset_path": {
12
+ // TODO: Fill in your dataset path
13
+ "m4singer": "[M4Singer dataset path]",
14
+ "opencpop": "[Opencpop dataset path]",
15
+ "opensinger": "[OpenSinger dataset path]",
16
+ "svcc": "[SVCC dataset path]",
17
+ "vctk": "[VCTK dataset path]"
18
+ },
19
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
20
+ "log_dir": "ckpts/svc",
21
+ "preprocess": {
22
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
23
+ "processed_dir": "data",
24
+
25
+ "f0_min": 50,
26
+ "f0_max": 1100,
27
+ // f0_bin in sovits
28
+ "pitch_bin": 256,
29
+ // filter_length in sovits
30
+ "n_fft": 2048,
31
+ // hop_length in sovits
32
+ "hop_size": 512,
33
+ // win_length in sovits
34
+ "win_size": 2048,
35
+ "segment_size": 8192,
36
+ "n_mel": 100,
37
+ "sample_rate": 44100,
38
+
39
+ // Config for features extraction
40
+ "extract_mel": true,
41
+ "extract_pitch": true,
42
+ "pitch_extractor": "parselmouth",
43
+ "extract_energy": false,
44
+ "extract_uv": true,
45
+ "extract_linear_spec": true,
46
+ "extract_audio": true,
47
+ // contentvec
48
+ "extract_contentvec_feature": true,
49
+ "contentvec_sample_rate": 16000,
50
+ "contentvec_batch_size": 1,
51
+ "contentvec_frameshift": 0.02,
52
+ // whisper
53
+ "extract_whisper_feature": true,
54
+ "whisper_sample_rate": 16000,
55
+ "whisper_frameshift": 0.01,
56
+ "whisper_downsample_rate": 2,
57
+ // Fill in the content-based pretrained model's path
58
+ "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
59
+ "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
60
+ "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
61
+ "whisper_model": "medium",
62
+ "whisper_model_path": "pretrained/whisper/medium.pt",
63
+ // Config for features usage
64
+ "use_mel": true,
65
+ "use_frame_pitch": true,
66
+ "use_uv": true,
67
+ "use_spkid": true,
68
+ "use_contentvec": true,
69
+ "use_whisper": true,
70
+ "use_text": false,
71
+ "use_phone": false,
72
+
73
+ // Extract content features using dataloader
74
+ "pin_memory": true,
75
+ "num_workers": 8,
76
+ "content_feature_batch_size": 16,
77
+ // Meta file
78
+ "train_file": "train.json",
79
+ "valid_file": "test.json",
80
+ "spk2id": "singers.json",
81
+ "utt2spk": "utt2singer"
82
+ },
83
+ "model": {
84
+ "condition_encoder": {
85
+ // Config for features usage
86
+ "merge_mode": "add",
87
+ "input_melody_dim": 1,
88
+ "use_log_f0": true,
89
+ "n_bins_melody": 256,
90
+ //# Quantization (0 for not quantization)
91
+ "output_melody_dim": 192,
92
+
93
+ "use_contentvec": true,
94
+ "use_whisper": true,
95
+ "use_mert": false,
96
+ "use_wenet": false,
97
+ "whisper_dim": 1024,
98
+ "contentvec_dim": 256,
99
+ "content_encoder_dim": 192,
100
+ "output_singer_dim": 192,
101
+ "singer_table_size": 512,
102
+ "output_content_dim": 192,
103
+ "use_spkid": true,
104
+
105
+ "pitch_max": 1100.0,
106
+ "pitch_min": 50.0,
107
+ },
108
+ "vits": {
109
+ "inter_channels": 192,
110
+ "hidden_channels": 192,
111
+ "filter_channels": 256,
112
+ "n_heads": 2,
113
+ "n_layers": 6,
114
+ "kernel_size": 3,
115
+ "p_dropout": 0.1,
116
+ "ssl_dim": 256,
117
+ "n_flow_layer": 4,
118
+ "n_layers_q": 3,
119
+ "gin_channels": 256,
120
+ "n_speakers": 512,
121
+ "use_spectral_norm": false,
122
+ },
123
+ "generator": "nsfhifigan",
124
+ },
125
+ "train": {
126
+ "batch_size": 32,
127
+ "learning_rate": 2e-4,
128
+ "gradient_accumulation_step": 1,
129
+ "max_epoch": -1, // -1 means no limit
130
+ "save_checkpoint_stride": [
131
+ 3,
132
+ 50
133
+ ],
134
+ "keep_last": [
135
+ 3,
136
+ 2
137
+ ],
138
+ "run_eval": [
139
+ true,
140
+ true
141
+ ],
142
+ "adamw": {
143
+ "lr": 2.0e-4
144
+ },
145
+ "reducelronplateau": {
146
+ "factor": 0.8,
147
+ "patience": 30,
148
+ "min_lr": 1.0e-4
149
+ },
150
+ "dataloader": {
151
+ "num_worker": 8,
152
+ "pin_memory": true
153
+ },
154
+ "sampler": {
155
+ "holistic_shuffle": false,
156
+ "drop_last": true
157
+ }
158
+ },
159
+ "inference": {
160
+ "batch_size": 1,
161
+ }
162
+ }
egs/svc/VitsSVC/run.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ ../_template/run.sh
egs/svc/_template/run.sh ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd "$(dirname "$0")"; pwd)
8
+ work_dir=$(dirname "$(dirname "$(dirname "$exp_dir")")")
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Parse the Given Parameters from the Command ###########
15
+ options=$(getopt -o c:n:s: --long gpu:,config:,name:,stage:,resume:,resume_from_ckpt_path:,resume_type:,infer_expt_dir:,infer_output_dir:,infer_source_file:,infer_source_audio_dir:,infer_target_speaker:,infer_key_shift:,infer_vocoder_dir: -- "$@")
16
+ eval set -- "$options"
17
+
18
+ while true; do
19
+     case $1 in
20
+         # Experimental Configuration File
21
+         -c | --config) shift; exp_config=$1 ; shift ;;
22
+         # Experimental Name
23
+         -n | --name) shift; exp_name=$1 ; shift ;;
24
+         # Running Stage
25
+         -s | --stage) shift; running_stage=$1 ; shift ;;
26
+         # Visible GPU machines. The default value is "0".
27
+         --gpu) shift; gpu=$1 ; shift ;;
28
+
29
+         # [Only for Training] Resume configuration
30
+         --resume) shift; resume=$1 ; shift ;;
31
+         # [Only for Training] The specific checkpoint path that you want to resume from.
32
+         --resume_from_ckpt_path) shift; resume_from_ckpt_path=$1 ; shift ;;
33
+         # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
34
+         --resume_type) shift; resume_type=$1 ; shift ;;
35
+
36
+         # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
37
+         --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
38
+         # [Only for Inference] The output dir to save inferred audios. Its default value is "$infer_expt_dir/result"
39
+         --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
40
+         # [Only for Inference] The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir can be "$work_dir/source_audio" which includes several audio files (*.wav, *.mp3 or *.flac).
41
+         --infer_source_file) shift; infer_source_file=$1 ; shift ;;
42
+         --infer_source_audio_dir) shift; infer_source_audio_dir=$1 ; shift ;;
43
+         # [Only for Inference] Specify the target speaker you want to convert into. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for opencpop dataset, the speaker name would be "opencpop_female1".
44
+         --infer_target_speaker) shift; infer_target_speaker=$1 ; shift ;;
45
+         # [Only for Inference] For advanced users, you can modify the trans_key parameters into an integer (which means the semitones you want to transpose). Its default value is "autoshift".
46
+         --infer_key_shift) shift; infer_key_shift=$1 ; shift ;;
47
+         # [Only for Inference] The vocoder dir. Its default value is Amphion/pretrained/bigvgan. See Amphion/pretrained/README.md to download the pretrained BigVGAN vocoders.
48
+         --infer_vocoder_dir) shift; infer_vocoder_dir=$1 ; shift ;;
49
+
50
+         --) shift ; break ;;
51
+         *) echo "Invalid option: $1"; exit 1 ;;
52
+     esac
53
+ done
54
+
55
+
56
+ ### Value check ###
57
+ if [ -z "$running_stage" ]; then
58
+     echo "[Error] Please specify the running stage"
59
+     exit 1
60
+ fi
61
+
62
+ if [ -z "$exp_config" ]; then
63
+     exp_config="${exp_dir}"/exp_config.json
64
+ fi
65
+ echo "Exprimental Configuration File: $exp_config"
66
+
67
+ if [ -z "$gpu" ]; then
68
+     gpu="0"
69
+ fi
70
+
71
+ ######## Features Extraction ###########
72
+ if [ "$running_stage" -eq 1 ]; then
73
+     CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/svc/preprocess.py \
74
+         --config "$exp_config" \
75
+         --num_workers 4
76
+ fi
77
+
78
+ ######## Training ###########
79
+ if [ "$running_stage" -eq 2 ]; then
80
+     if [ -z "$exp_name" ]; then
81
+         echo "[Error] Please specify the experiments name"
82
+         exit 1
83
+     fi
84
+     echo "Exprimental Name: $exp_name"
85
+
86
+     if [ "$resume" = true ]; then
87
+         echo "Automatically resume from the experimental dir..."
88
+         CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/svc/train.py \
89
+             --config "$exp_config" \
90
+             --exp_name "$exp_name" \
91
+             --log_level info \
92
+             --resume
93
+     else
94
+         CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/svc/train.py \
95
+             --config "$exp_config" \
96
+             --exp_name "$exp_name" \
97
+             --log_level info \
98
+             --resume_from_ckpt_path "$resume_from_ckpt_path" \
99
+             --resume_type "$resume_type"
100
+     fi
101
+ fi
102
+
103
+ ######## Inference/Conversion ###########
104
+ if [ "$running_stage" -eq 3 ]; then
105
+     if [ -z "$infer_expt_dir" ]; then
106
+         echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
107
+         exit 1
108
+     fi
109
+
110
+     if [ -z "$infer_output_dir" ]; then
111
+         infer_output_dir="$infer_expt_dir/result"
112
+     fi
113
+
114
+     if [ -z "$infer_source_file" ] && [ -z "$infer_source_audio_dir" ]; then
115
+         echo "[Error] Please specify the source file/dir. The inference source (can be a json file or a dir). For example, the source_file can be \"[Your path to save processed data]/[YourDataset]/test.json\", and the source_audio_dir should include several audio files (*.wav, *.mp3 or *.flac)."
116
+         exit 1
117
+     fi
118
+
119
+     # Resolve the inference source. When both a json file and an audio dir are
120
+     # given, the json file takes precedence (previously the source was left empty).
121
+     if [ -n "$infer_source_file" ]; then
122
+         infer_source=$infer_source_file
123
+     else
124
+         infer_source=$infer_source_audio_dir
125
+     fi
126
+
127
+     if [ -z "$infer_target_speaker" ]; then
128
+         echo "[Error] Please specify the target speaker. You can refer to \"[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json\". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for opencpop dataset, the speaker name would be \"opencpop_female1\""
129
+         exit 1
130
+     fi
131
+
132
+     if [ -z "$infer_key_shift" ]; then
133
+         infer_key_shift="autoshift"
134
+     fi
135
+
136
+     if [ -z "$infer_vocoder_dir" ]; then
137
+         infer_vocoder_dir="$work_dir"/pretrained/bigvgan
138
+         echo "[Warning] You don't specify the infer_vocoder_dir. It is set $infer_vocoder_dir by default. Make sure that you have followed Amphion/pretrained/README.md to download the pretrained BigVGAN vocoder checkpoint."
139
+     fi
140
+
141
+     CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/svc/inference.py \
142
+         --config "$exp_config" \
143
+         --acoustics_dir "$infer_expt_dir" \
144
+         --vocoder_dir "$infer_vocoder_dir" \
145
+         --target_singer "$infer_target_speaker" \
146
+         --trans_key "$infer_key_shift" \
147
+         --source "$infer_source" \
148
+         --output_dir "$infer_output_dir" \
149
+         --log_level debug
150
+ fi
egs/tta/README.md ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Amphion Text-to-Audio (TTA) Recipe
2
+
3
+ ## Quick Start
4
+
5
+ We provide a **[beginner recipe](RECIPE.md)** to demonstrate how to train a cutting edge TTA model. Specifically, it is designed as a latent diffusion model like [AudioLDM](https://arxiv.org/abs/2301.12503), [Make-an-Audio](https://arxiv.org/abs/2301.12661), and [AUDIT](https://arxiv.org/abs/2304.00830).
6
+
7
+ ## Supported Model Architectures
8
+
9
+ Until now, Amphion has supported a latent diffusion based text-to-audio model:
10
+
11
+ <br>
12
+ <div align="center">
13
+ <img src="../../imgs/tta/DiffusionTTA.png" width="65%">
14
+ </div>
15
+ <br>
16
+
17
+ Similar to [AUDIT](https://arxiv.org/abs/2304.00830), we implement it in two-stage training:
18
+ 1. Training the VAE which is called `AutoencoderKL` in Amphion.
19
+ 2. Training the conditional latent diffusion model which is called `AudioLDM` in Amphion.
egs/tta/RECIPE.md ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Text-to-Audio with Latent Diffusion Model
2
+
3
+ This is the quick tour for training a text-to-audio model with the popular and powerful generative model: [Latent Diffusion Model](https://arxiv.org/abs/2112.10752). Specifically, this recipe is also the official implementation of the text-to-audio generation part of our NeurIPS 2023 paper "[AUDIT: Audio Editing by Following Instructions with Latent Diffusion Models](https://arxiv.org/abs/2304.00830)". You can check the last part of [AUDIT demos](https://audit-demo.github.io/) to see some text-to-audio examples.
4
+
5
+ <br>
6
+ <div align="center">
7
+ <img src="../../imgs/tta/DiffusionTTA.png" width="65%">
8
+ </div>
9
+ <br>
10
+
11
+ We train this latent diffusion model in two stages:
12
+ 1. In the first stage, we aim to obtain a high-quality VAE (called `AutoencoderKL` in Amphion), so that we can project
13
+ the input mel-spectrograms to an efficient, low-dimensional latent space. Specifically, we train the VAE with GAN loss to improve the reconstruction quality.
14
+ 2. In the second stage, we aim to obtain a text-controllable diffusion model (called `AudioLDM` in Amphion). We use a U-Net architecture diffusion model, and use the T5 encoder as the text encoder.
15
+
16
+ There are four stages in total for training the text-to-audio model:
17
+
18
+ 1. Data preparation and processing
19
+ 2. Train the VAE model
20
+ 3. Train the latent diffusion model
21
+ 4. Inference
22
+
23
+ > **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
24
+ > ```bash
25
+ > cd Amphion
26
+ > ```
27
+
28
+ ## Overview
29
+
30
+ ```sh
31
+ # Train the VAE model
32
+ sh egs/tta/autoencoderkl/run_train.sh
33
+
34
+ # Train the latent diffusion model
35
+ sh egs/tta/audioldm/run_train.sh
36
+
37
+ # Inference
38
+ sh egs/tta/audioldm/run_inference.sh
39
+ ```
40
+
41
+ ## 1. Data preparation and processing
42
+
43
+ ### Dataset Download
44
+
45
+ We take [AudioCaps](https://audiocaps.github.io/) as an example. AudioCaps is a dataset of around 44K audio-caption pairs, where each audio clip corresponds to a caption with rich semantic information. You can download the dataset [here](https://github.com/cdjkim/audiocaps).
46
+
47
+ <!-- How to download AudioCaps is detailed [here](../datasets/README.md) -->
48
+ <!-- You can downlaod the dataset [here](https://github.com/cdjkim/audiocaps). -->
49
+
50
+ ### Data Processing
51
+
52
+ - Download AudioCaps dataset to `[Your path to save tta dataset]` and modify `preprocess.processed_dir` in `egs/tta/.../exp_config.json`.
53
+
54
+ ```json
55
+ {
56
+ "dataset": [
57
+ "AudioCaps"
58
+ ],
59
+ "preprocess": {
60
+ // Specify the output root path to save the processed data
61
+ "processed_dir": "[Your path to save tta dataset]",
62
+ ...
63
+ }
64
+ }
65
+ ```
66
+
67
+ The folder structure of your downloaded data should be similar to:
68
+
69
+ ```plaintext
70
+ .../[Your path to save tta dataset]
71
+ ┣ AudioCaps
72
+ ┃   ┣ wav
73
+ ┃ ┃ ┣ ---1_cCGK4M_0_10000.wav
74
+ ┃ ┃ ┣ ---lTs1dxhU_30000_40000.wav
75
+ ┃ ┃ ┣ ...
76
+ ```
77
+
78
+ - Then you may process the data to mel-spectrograms and save them in `.npy` format. If you use the data we provide, we have processed all the wav data.
79
+
80
+ - Generate a json file to save the metadata, the json file is like:
81
+
82
+ ```json
83
+ [
84
+ {
85
+ "Dataset": "AudioCaps",
86
+ "Uid": "---1_cCGK4M_0_10000",
87
+ "Caption": "Idling car, train blows horn and passes"
88
+ },
89
+ {
90
+ "Dataset": "AudioCaps",
91
+ "Uid": "---lTs1dxhU_30000_40000",
92
+ "Caption": "A racing vehicle engine is heard passing by"
93
+ },
94
+ ...
95
+ ]
96
+ ```
97
+ - Finally, the folder structure is like:
98
+
99
+ ```plaintext
100
+ .../[Your path to save tta dataset]
101
+ ┣ AudioCaps
102
+ ┃   ┣ wav
103
+ ┃ ┃ ┣ ---1_cCGK4M_0_10000.wav
104
+ ┃ ┃ ┣ ---lTs1dxhU_30000_40000.wav
105
+ ┃ ┃ ┣ ...
106
+ ┃   ┣ mel
107
+ ┃ ┃ ┣ ---1_cCGK4M_0_10000.npy
108
+ ┃ ┃ ┣ ---lTs1dxhU_30000_40000.npy
109
+ ┃ ┃ ┣ ...
110
+ ┃   ┣ train.json
111
+ ┃   ┣ valid.json
112
+ ┃   ┣ ...
113
+ ```
114
+
115
+ ## 2. Training the VAE Model
116
+
117
+ The first stage model is a VAE model trained with GAN loss (called `AutoencoderKL` in Amphion). Run the following command:
118
+
119
+ ```sh
120
+ sh egs/tta/autoencoderkl/run_train.sh
121
+ ```
122
+
123
+ ## 3. Training the Latent Diffusion Model
124
+
125
+ The second stage model is a conditional diffusion model with a T5 text encoder (called `AudioLDM` in Amphion). Run the following command:
126
+
127
+ ```sh
128
+ sh egs/tta/audioldm/run_train.sh
129
+ ```
130
+
131
+ ## 4. Inference
132
+
133
+ Now you can generate audio with your pre-trained latent diffusion model, run the following commands and modify the `text` argument.
134
+
135
+ ```sh
136
+ sh egs/tta/audioldm/run_inference.sh \
137
+ --text "A man is whistling"
138
+ ```
139
+
140
+ ## Citations
141
+
142
+ ```bibtex
143
+ @article{wang2023audit,
144
+ title={AUDIT: Audio Editing by Following Instructions with Latent Diffusion Models},
145
+ author={Wang, Yuancheng and Ju, Zeqian and Tan, Xu and He, Lei and Wu, Zhizheng and Bian, Jiang and Zhao, Sheng},
146
+ journal={NeurIPS 2023},
147
+ year={2023}
148
+ }
149
+
150
+ @article{liu2023audioldm,
151
+ title={{AudioLDM}: Text-to-Audio Generation with Latent Diffusion Models},
152
+ author={Liu, Haohe and Chen, Zehua and Yuan, Yi and Mei, Xinhao and Liu, Xubo and Mandic, Danilo and Wang, Wenwu and Plumbley, Mark D},
153
+ journal={Proceedings of the International Conference on Machine Learning},
154
+ year={2023}
155
+ }
156
+ ```
egs/tta/audioldm/exp_config.json ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "egs/tta/audioldm/exp_config_base.json",
3
+ "dataset": [
4
+ "AudioCaps"
5
+ ],
6
+ "preprocess": {
7
+ // Specify the output root path to save the processed data
8
+ "processed_dir": "data",
9
+ // For example: "/home/TTADataset/processed_data"
10
+
11
+ // feature
12
+ "use_spkid": false,
13
+ "use_uv": false,
14
+ "use_frame_pitch": false,
15
+ "use_phone_pitch": false,
16
+ "use_frame_energy": false,
17
+ "use_phone_energy": false,
18
+ "use_mel": false,
19
+ "use_audio": false,
20
+ "use_label": false,
21
+ "use_one_hot": false,
22
+ // feature for text to audio
23
+ "use_caption": true,
24
+ "use_melspec": true,
25
+ "use_wav": false,
26
+ // feature dir
27
+ "melspec_dir": "mel",
28
+ "wav_dir": "wav"
29
+ },
30
+ // Specify the output root path to save model ckpts and logs
31
+ "log_dir": "ckpts/tta",
32
+ // For example: "/home/TTADataset/processed_data/logs"
33
+
34
+ // model
35
+ "model": {
36
+ "audioldm": {
37
+ "image_size": 32,
38
+ "in_channels": 4,
39
+ "out_channels": 4,
40
+ "model_channels": 256,
41
+ "attention_resolutions": [4, 2, 1],
42
+ "num_res_blocks": 2,
43
+ "channel_mult": [1, 2, 4],
44
+ "num_heads": 8,
45
+ "use_spatial_transformer": true,
46
+ "transformer_depth": 1,
47
+ "context_dim": 768,
48
+ "use_checkpoint": true,
49
+ "legacy": false
50
+ },
51
+ "autoencoderkl": {
52
+ "ch": 128,
53
+ "ch_mult": [1,1,2,2,4],
54
+ "num_res_blocks": 2,
55
+ "in_channels": 1,
56
+ "z_channels": 4,
57
+ "out_ch": 1,
58
+ "double_z": true
59
+ },
60
+ "noise_scheduler": {
61
+ "num_train_timesteps": 1000,
62
+ "beta_start": 0.00085,
63
+ "beta_end": 0.012,
64
+ "beta_schedule": "scaled_linear",
65
+ "clip_sample": false,
66
+ "steps_offset": 1,
67
+ "set_alpha_to_one": false,
68
+ "skip_prk_steps": true,
69
+ "prediction_type": "epsilon"
70
+ },
71
+ "autoencoder_path": "ckpts/tta/autoencoder_kl_debug/checkpoints/step-0445000_loss-0.3306.pt"
72
+ },
73
+
74
+ // train
75
+ "train": {
76
+ "adam": {
77
+ "lr": 5.0e-5
78
+ },
79
+ "ddp": false,
80
+ "random_seed": 12345,
81
+ "batch_size": 12,
82
+ "epochs": 50000,
83
+ "max_steps": 1000000,
84
+ "total_training_steps": 800000,
85
+ "save_summary_steps": 1000,
86
+ "save_checkpoints_steps": 5000,
87
+ "valid_interval": 5000,
88
+ "keep_checkpoint_max": 100
89
+ }
90
+ }
egs/tta/audioldm/exp_config_base.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "config/audioldm.json",
3
+ "model_type": "AudioLDM",
4
+ "dataset": [
5
+ "AudioCaps"
6
+ ],
7
+ "preprocess": {
8
+ "train_file": "train.json",
9
+ "valid_file": "valid.json"
10
+ }
11
+ }
egs/tta/audioldm/exp_config_latent_4_10_78.json ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "egs/tta/audioldm/exp_config_base.json",
3
+ "dataset": [
4
+ "AudioCaps"
5
+ ],
6
+ "preprocess": {
7
+ // Specify the output root path to save the processed data
8
+ "processed_dir": "data",
9
+
10
+ // feature
11
+ "use_spkid": false,
12
+ "use_uv": false,
13
+ "use_frame_pitch": false,
14
+ "use_phone_pitch": false,
15
+ "use_frame_energy": false,
16
+ "use_phone_energy": false,
17
+ "use_mel": false,
18
+ "use_audio": false,
19
+ "use_label": false,
20
+ "use_one_hot": false,
21
+ // feature for text to audio
22
+ "use_caption": true,
23
+ "use_melspec": true,
24
+ "use_wav": false,
25
+ // feature dir
26
+ "melspec_dir": "mel",
27
+ "wav_dir": "wav"
28
+ },
29
+ // Specify the output root path to save model ckpts and logs
30
+ "log_dir": "ckpts/tta",
31
+
32
+ // model
33
+ "model": {
34
+ "audioldm": {
35
+ "image_size": 32,
36
+ "in_channels": 4,
37
+ "out_channels": 4,
38
+ "model_channels": 256,
39
+ "attention_resolutions": [4, 2, 1],
40
+ "num_res_blocks": 2,
41
+ "channel_mult": [1, 2, 4],
42
+ "num_heads": 8,
43
+ "use_spatial_transformer": true,
44
+ "transformer_depth": 1,
45
+ "context_dim": 768,
46
+ "use_checkpoint": true,
47
+ "legacy": false
48
+ },
49
+ "autoencoderkl": {
50
+ "ch": 128,
51
+ "ch_mult": [1,2,2,4],
52
+ "num_res_blocks": 2,
53
+ "in_channels": 1,
54
+ "z_channels": 4,
55
+ "out_ch": 1,
56
+ "double_z": true
57
+ },
58
+ "noise_scheduler": {
59
+ "num_train_timesteps": 1000,
60
+ "beta_start": 0.00085,
61
+ "beta_end": 0.012,
62
+ "beta_schedule": "scaled_linear",
63
+ "clip_sample": false,
64
+ "steps_offset": 1,
65
+ "set_alpha_to_one": false,
66
+ "skip_prk_steps": true,
67
+ "prediction_type": "epsilon"
68
+ },
69
+ "autoencoder_path": "ckpts/tta/autoencoder_kl_debug_latent_size_4_10_78/checkpoints/step-0390000_loss-0.2876.pt"
70
+ },
71
+
72
+ // train
73
+ "train": {
74
+ "adam": {
75
+ "lr": 2.0e-5
76
+ },
77
+ "ddp": false,
78
+ "random_seed": 12345,
79
+ "batch_size": 12,
80
+ "epochs": 50000,
81
+ "max_steps": 1000000,
82
+ "total_training_steps": 800000,
83
+ "save_summary_steps": 1000,
84
+ "save_checkpoints_steps": 5000,
85
+ "valid_interval": 5000,
86
+ "keep_checkpoint_max": 100
87
+ }
88
+ }
egs/tta/audioldm/run_inference.sh ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $exp_dir)))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Set Experiment Configuration ###########
15
+ exp_config="$exp_dir/exp_config.json"
16
+ exp_name="audioldm_debug_latent_size_4_5_39"
17
+ checkpoint_path="$work_dir/ckpts/tta/audioldm_debug_latent_size_4_5_39/checkpoints/step-0570000_loss-0.2521.pt"
18
+ output_dir="$work_dir/temp"
19
+ vocoder_config_path="$work_dir/ckpts/tta/hifigan_checkpoints/config.json"
20
+ vocoder_path="$work_dir/ckpts/tta/hifigan_checkpoints/g_01250000"
21
+ num_steps=200
22
+ guidance_scale=4.0
23
+
24
+ export CUDA_VISIBLE_DEVICES="0"
25
+
26
+ ######## Parse Command Line Arguments ###########
27
+ while [[ $# -gt 0 ]]
28
+ do
29
+ key="$1"
30
+
31
+ case $key in
32
+ --text)
33
+ text="$2"
34
+ shift # past argument
35
+ shift # past value
36
+ ;;
37
+ *) # unknown option
38
+ shift # past argument
39
+ ;;
40
+ esac
41
+ done
42
+
43
+ ######## Run inference ###########
44
+ python "${work_dir}"/bins/tta/inference.py \
45
+ --config=$exp_config \
46
+ --checkpoint_path=$checkpoint_path \
47
+ --text="$text" \
48
+ --vocoder_path=$vocoder_path \
49
+ --vocoder_config_path=$vocoder_config_path \
50
+ --num_steps=$num_steps \
51
+ --guidance_scale=$guidance_scale \
52
+ --output_dir=$output_dir
egs/tta/audioldm/run_inference_latent_4_10_78.sh ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $exp_dir)))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Set Experiment Configuration ###########
15
+ exp_config="$exp_dir/exp_config_latent_4_10_78.json"
16
+ exp_name="audioldm_debug_latent_size_4_10_78"
17
+ checkpoint_path="$work_dir/ckpts/tta/audioldm_debug_latent_size_4_10_78/checkpoints/step-0325000_loss-0.1936.pt"
18
+ output_dir="$work_dir/temp"
19
+ vocoder_config_path="$work_dir/ckpts/tta/hifigan_checkpoints/config.json"
20
+ vocoder_path="$work_dir/ckpts/tta/hifigan_checkpoints/g_01250000"
21
+ num_steps=200
22
+ guidance_scale=4.0
23
+
24
+ export CUDA_VISIBLE_DEVICES="0"
25
+
26
+ ######## Parse Command Line Arguments ###########
27
+ while [[ $# -gt 0 ]]
28
+ do
29
+ key="$1"
30
+
31
+ case $key in
32
+ --text)
33
+ text="$2"
34
+ shift # past argument
35
+ shift # past value
36
+ ;;
37
+ *) # unknown option
38
+ shift # past argument
39
+ ;;
40
+ esac
41
+ done
42
+
43
+ ######## Run inference ###########
44
+ python "${work_dir}"/bins/tta/inference.py \
45
+ --config=$exp_config \
46
+ --checkpoint_path=$checkpoint_path \
47
+ --text="${text:-A man is whistling}" \
48
+ --vocoder_path=$vocoder_path \
49
+ --vocoder_config_path=$vocoder_config_path \
50
+ --num_steps=$num_steps \
51
+ --guidance_scale=$guidance_scale \
52
+ --output_dir=$output_dir
egs/tta/audioldm/run_train.sh ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $exp_dir)))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Set Experiment Configuration ###########
15
+ exp_config="$exp_dir/exp_config.json"
16
+ exp_name="audioldm_debug_latent_size_4_5_39"
17
+
18
+ num_workers=8
19
+ export CUDA_VISIBLE_DEVICES="0"
20
+
21
+ ######## Train Model ###########
22
+ python "${work_dir}"/bins/tta/train_tta.py \
23
+ --config=$exp_config \
24
+ --num_workers=$num_workers \
25
+ --exp_name=$exp_name \
26
+ --stdout_interval=25
egs/tta/audioldm/run_train_latent_4_10_78.sh ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $exp_dir)))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Set Experiment Configuration ###########
15
+ exp_config="$exp_dir/exp_config_latent_4_10_78.json"
16
+ exp_name="audioldm_debug_latent_size_4_10_78"
17
+
18
+ num_workers=8
19
+ export CUDA_VISIBLE_DEVICES="0"
20
+
21
+ ######## Train Model ###########
22
+ python "${work_dir}"/bins/tta/train_tta.py \
23
+ --config=$exp_config \
24
+ --num_workers=$num_workers \
25
+ --exp_name=$exp_name \
26
+ --stdout_interval=25
egs/tta/autoencoderkl/exp_config.json ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "egs/tta/autoencoderkl/exp_config_base.json",
3
+ "dataset": [
4
+ "AudioCaps"
5
+ ],
6
+ "preprocess": {
7
+ // Specify the output root path to save the processed data
8
+ "processed_dir": "data",
9
+
10
+ // feature
11
+ "use_spk": false,
12
+ "use_spkid": false,
13
+ "use_uv": false,
14
+ "use_frame_pitch": false,
15
+ "use_phone_pitch": false,
16
+ "use_frame_energy": false,
17
+ "use_phone_energy": false,
18
+ "use_mel": false,
19
+ "use_audio": false,
20
+ "use_label": false,
21
+ "use_one_hot": false,
22
+ // feature for text to audio
23
+ "use_caption": true,
24
+ "use_melspec": true,
25
+ "use_wav": false,
26
+ // feature dir
27
+ "melspec_dir": "mel",
28
+ "wav_dir": "wav"
29
+ },
30
+ // Specify the output root path to save model ckpts and logs
31
+ "log_dir": "ckpts/tta",
32
+
33
+ // train
34
+ "train": {
35
+ "adam": {
36
+ "lr": 4.0e-5
37
+ },
38
+ "ddp": false,
39
+ "random_seed": 12345,
40
+ "batch_size": 12,
41
+ "epochs": 50000,
42
+ "max_steps": 1000000,
43
+ "total_training_steps": 800000,
44
+ "save_summary_steps": 1000,
45
+ "save_checkpoints_steps": 5000,
46
+ "valid_interval": 5000,
47
+ "keep_checkpoint_max": 100
48
+ }
49
+ }
egs/tta/autoencoderkl/exp_config_base.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "config/autoencoderkl.json",
3
+ "model_type": "AutoencoderKL",
4
+ "dataset": [
5
+ "AudioCaps"
6
+ ],
7
+ "preprocess": {
8
+ "train_file": "train.json",
9
+ "valid_file": "valid.json"
10
+ }
11
+ }
egs/tta/autoencoderkl/exp_config_latent_4_10_78.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "egs/tta/autoencoderkl/exp_config_base.json",
3
+ "dataset": [
4
+ "AudioCaps"
5
+ ],
6
+ "preprocess": {
7
+ // Specify the output root path to save the processed data
8
+ "processed_dir": "data",
9
+
10
+ // feature
11
+ "use_spkid": false,
12
+ "use_uv": false,
13
+ "use_frame_pitch": false,
14
+ "use_phone_pitch": false,
15
+ "use_frame_energy": false,
16
+ "use_phone_energy": false,
17
+ "use_mel": false,
18
+ "use_audio": false,
19
+ "use_label": false,
20
+ "use_one_hot": false,
21
+ // feature for text to audio
22
+ "use_caption": true,
23
+ "use_melspec": true,
24
+ "use_wav": false,
25
+ // feature dir
26
+ "melspec_dir": "mel",
27
+ "wav_dir": "wav"
28
+ },
29
+ // Specify the output root path to save model ckpts and logs
30
+ "log_dir": "ckpts/tta",
31
+
32
+ "model": {
33
+ "autoencoderkl": {
34
+ "ch": 128,
35
+ "ch_mult": [1,2,2,4],
36
+ "num_res_blocks": 2,
37
+ "in_channels": 1,
38
+ "z_channels": 4,
39
+ "out_ch": 1,
40
+ "double_z": true
41
+ }
42
+ },
43
+ // train
44
+ "train": {
45
+ "adam": {
46
+ "lr": 4.0e-5
47
+ },
48
+ "ddp": false,
49
+ "random_seed": 12345,
50
+ "batch_size": 12,
51
+ "epochs": 50000,
52
+ "max_steps": 1000000,
53
+ "total_training_steps": 800000,
54
+ "save_summary_steps": 1000,
55
+ "save_checkpoints_steps": 5000,
56
+ "valid_interval": 5000,
57
+ "keep_checkpoint_max": 100
58
+ }
59
+ }
egs/tta/autoencoderkl/run_train.sh ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $exp_dir)))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Set Experiment Configuration ###########
15
+ exp_config="$exp_dir/exp_config.json"
16
+ exp_name="autoencoder_kl_debug"
17
+
18
+ num_workers=8
19
+ export CUDA_VISIBLE_DEVICES="0"
20
+
21
+ ######## Train Model ###########
22
+ python "${work_dir}"/bins/tta/train_tta.py \
23
+ --config=$exp_config \
24
+ --num_workers=$num_workers \
25
+ --exp_name=$exp_name \
26
+ --stdout_interval=25
egs/tta/autoencoderkl/run_train_latent_4_10_78.sh ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $exp_dir)))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Set Experiment Configuration ###########
15
+ exp_config="$exp_dir/exp_config_latent_4_10_78.json"
16
+ exp_name="autoencoder_kl_debug_latent_size_4_10_78"
17
+
18
+ num_workers=8
19
+ export CUDA_VISIBLE_DEVICES="0"
20
+
21
+ ######## Train Model ###########
22
+ python "${work_dir}"/bins/tta/train_tta.py \
23
+ --config=$exp_config \
24
+ --num_workers=$num_workers \
25
+ --exp_name=$exp_name \
26
+ --stdout_interval=25
egs/tts/FastSpeech2/README.md ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # FastSpeech2 Recipe
3
+
4
+ In this recipe, we will show how to train [FastSpeech2](https://openreview.net/forum?id=piLPYqxtWuA) using Amphion's infrastructure. FastSpeech2 is a non-autoregressive TTS architecture that utilizes feed-forward Transformer blocks.
5
+
6
+ There are four stages in total:
7
+
8
+ 1. Data preparation
9
+ 2. Features extraction
10
+ 3. Training
11
+ 4. Inference
12
+
13
+ > **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
14
+ > ```bash
15
+ > cd Amphion
16
+ > ```
17
+
18
+ ## 1. Data Preparation
19
+
20
+ ### Dataset Download
21
+ You can use commonly used TTS datasets to train a TTS model, e.g., LJSpeech, VCTK, LibriTTS, etc. We strongly recommend using LJSpeech when training a TTS model for the first time. How to download the datasets is detailed [here](../../datasets/README.md).
22
+
23
+ ### Configuration
24
+
25
+ After downloading the dataset, you can set the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
26
+
27
+ ```json
28
+ "dataset": [
29
+ "LJSpeech",
30
+ ],
31
+ "dataset_path": {
32
+ // TODO: Fill in your dataset path
33
+ "LJSpeech": "[LJSpeech dataset path]",
34
+ },
35
+ ```
36
+
37
+ ## 2. Features Extraction
38
+
39
+ ### Configuration
40
+
41
+ Specify the `processed_dir` and the `log_dir` for saving the processed data and the checkpoints in `exp_config.json`:
42
+
43
+ ```json
44
+ // TODO: Fill in the output log path
45
+ "log_dir": "ckpts/tts",
46
+ "preprocess": {
47
+ // TODO: Fill in the output data path
48
+ "processed_dir": "data",
49
+ ...
50
+ },
51
+ ```
52
+
53
+ ### Run
54
+
55
+ Run the `run.sh` as the preprocess stage (set `--stage 1`):
56
+
57
+ ```bash
58
+ sh egs/tts/FastSpeech2/run.sh --stage 1
59
+ ```
60
+
61
+ ## 3. Training
62
+
63
+ ### Configuration
64
+
65
+ We provide the default hyperparameters in the `exp_config.json`. They can work on a single 24 GB NVIDIA GPU. You can adjust them based on your GPU machines.
66
+
67
+ ```
68
+ "train": {
69
+ "batch_size": 16,
70
+ }
71
+ ```
72
+
73
+ ### Run
74
+
75
+ Run the `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The tensorboard logs and checkpoints will be saved in `ckpts/tts/[YourExptName]`.
76
+
77
+ ```bash
78
+ sh egs/tts/FastSpeech2/run.sh --stage 2 --name [YourExptName]
79
+ ```
80
+
81
+ > **NOTE:** The `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "0,1,2,3"`.
82
+
83
+
84
+ ## 4. Inference
85
+
86
+ ### Configuration
87
+
88
+ For inference, you need to specify the following configurations when running `run.sh`:
89
+
90
+
91
+ | Parameters | Description | Example |
92
+ | --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
93
+ | `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `ckpts/tts/[YourExptName]` |
94
+ | `--infer_output_dir` | The output directory to save inferred audios. | `ckpts/tts/[YourExptName]/result` |
95
+ | `--infer_mode` | The inference mode, e.g., "`single`", "`batch`". | "`single`" to generate a clip of speech, "`batch`" to generate a batch of speech at a time. |
96
+ | `--infer_dataset` | The dataset used for inference. | For LJSpeech dataset, the inference dataset would be `LJSpeech`. |
97
+ | `--infer_testing_set` | The subset of the inference dataset used for inference, e.g., train, test, golden_test | For LJSpeech dataset, the testing set would be  "`test`" split from LJSpeech at the feature extraction, or "`golden_test`" cherry-picked from test set as template testing set. |
98
+ | `--infer_text` | The text to be synthesized. | "`This is a clip of generated speech with the given text from a TTS model.`" |
99
+
100
+ ### Run
101
+ For example, if you want to generate speech of all testing set split from LJSpeech, just run:
102
+
103
+ ```bash
104
+ sh egs/tts/FastSpeech2/run.sh --stage 3 \
105
+ --infer_expt_dir ckpts/tts/[YourExptName] \
106
+ --infer_output_dir ckpts/tts/[YourExptName]/result \
107
+ --infer_mode "batch" \
108
+ --infer_dataset "LJSpeech" \
109
+ --infer_testing_set "test"
110
+ ```
111
+
112
+ Or, if you want to generate a single clip of speech from a given text, just run:
113
+
114
+ ```bash
115
+ sh egs/tts/FastSpeech2/run.sh --stage 3 \
116
+ --infer_expt_dir ckpts/tts/[YourExptName] \
117
+ --infer_output_dir ckpts/tts/[YourExptName]/result \
118
+ --infer_mode "single" \
119
+ --infer_text "This is a clip of generated speech with the given text from a TTS model."
120
+ ```
121
+
122
+ We will release a pre-trained FastSpeech2 model trained on LJSpeech. So you can download the pre-trained model and generate speech following the above inference instruction.
123
+
124
+
125
+ ```bibtex
126
+ @inproceedings{ren2020fastspeech,
127
+ title={FastSpeech 2: Fast and High-Quality End-to-End Text to Speech},
128
+ author={Ren, Yi and Hu, Chenxu and Tan, Xu and Qin, Tao and Zhao, Sheng and Zhao, Zhou and Liu, Tie-Yan},
129
+ booktitle={International Conference on Learning Representations},
130
+ year={2020}
131
+ }
132
+ ```
egs/tts/FastSpeech2/exp_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "config/fs2.json",
3
+ "model_type": "FastSpeech2",
4
+ "dataset": [
5
+ "LJSpeech"
6
+ ],
7
+ "dataset_path": {
8
+ // TODO: Fill in your dataset path
9
+ "LJSpeech": "[LJSpeech dataset path]"
10
+ },
11
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/tts"
12
+ "log_dir": "ckpts/tts",
13
+ "preprocess": {
14
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
15
+ "processed_dir": "data",
16
+ "sample_rate": 22050,
17
+ },
18
+ "train": {
19
+ "batch_size": 16,
20
+ }
21
+ }
egs/tts/FastSpeech2/prepare_mfa.sh ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ #!/bin/bash
7
+ mkdir mfa
8
+ cd mfa
9
+ wget https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.1.0-beta.2/montreal-forced-aligner_linux.tar.gz
10
+ tar -zxvf montreal-forced-aligner_linux.tar.gz
11
+ cd mfa
12
+ mkdir lexicon
13
+ cd lexicon
14
+ wget http://www.openslr.org/resources/11/librispeech-lexicon.txt
egs/tts/FastSpeech2/run.sh ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $exp_dir)))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ cd $work_dir/modules/monotonic_align
15
+ mkdir -p monotonic_align
16
+ python setup.py build_ext --inplace
17
+ cd $work_dir
18
+
19
+ mfa_dir=$work_dir/mfa
20
+ echo $mfa_dir
21
+
22
+ ######## Parse the Given Parameters from the Command ###########
23
+ # options=$(getopt -o c:n:s --long gpu:,config:,infer_expt_dir:,infer_output_dir:,infer_source_file:,infer_source_audio_dir:,infer_target_speaker:,infer_key_shift:,infer_vocoder_dir:,name:,stage: -- "$@")
24
+ options=$(getopt -o c:n:s: --long gpu:,config:,infer_expt_dir:,infer_output_dir:,infer_mode:,infer_dataset:,infer_testing_set:,infer_text:,name:,stage: -- "$@")
25
+ eval set -- "$options"
26
+
27
+ while true; do
28
+ case $1 in
29
+ # Experimental Configuration File
30
+ -c | --config) shift; exp_config=$1 ; shift ;;
31
+ # Experimental Name
32
+ -n | --name) shift; exp_name=$1 ; shift ;;
33
+ # Running Stage
34
+ -s | --stage) shift; running_stage=$1 ; shift ;;
35
+ # Visible GPU machines. The default value is "0".
36
+ --gpu) shift; gpu=$1 ; shift ;;
37
+
38
+ # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
39
+ --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
40
+ # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
41
+ --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
42
+ # [Only for Inference] The inference mode. It can be "batch" to generate speech by batch, or "single" to generate a single clip of speech.
43
+ --infer_mode) shift; infer_mode=$1 ; shift ;;
44
+ # [Only for Inference] The inference dataset. It is only used when the inference mode is "batch".
45
+ --infer_dataset) shift; infer_dataset=$1 ; shift ;;
46
+ # [Only for Inference] The inference testing set. It is only used when the inference mode is "batch". It can be "test" set split from the dataset, or "golden_test" carefully selected from the testing set.
47
+ --infer_testing_set) shift; infer_testing_set=$1 ; shift ;;
48
+ # [Only for Inference] The text to be synthesized from. It is only used when the inference mode is "single".
49
+ --infer_text) shift; infer_text=$1 ; shift ;;
50
+
51
+ --) shift ; break ;;
52
+ *) echo "Invalid option: $1"; exit 1 ;;
53
+ esac
54
+ done
55
+
56
+
57
+ ### Value check ###
58
+ if [ -z "$running_stage" ]; then
59
+ echo "[Error] Please specify the running stage"
60
+ exit 1
61
+ fi
62
+
63
+ if [ -z "$exp_config" ]; then
64
+ exp_config="${exp_dir}"/exp_config.json
65
+ fi
66
+ echo "Exprimental Configuration File: $exp_config"
67
+
68
+ if [ -z "$gpu" ]; then
69
+ gpu="0"
70
+ fi
71
+
72
+ ######## Features Extraction ###########
73
+ if [ $running_stage -eq 1 ]; then
74
+ if [ ! -d "$mfa_dir" ]; then
75
+ bash ${exp_dir}/prepare_mfa.sh
76
+ fi
77
+ CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/tts/preprocess.py \
78
+ --config=$exp_config \
79
+ --num_workers=4 \
80
+ --prepare_alignment=true
81
+ fi
82
+
83
+ ######## Training ###########
84
+ if [ $running_stage -eq 2 ]; then
85
+ if [ -z "$exp_name" ]; then
86
+ echo "[Error] Please specify the experiments name"
87
+ exit 1
88
+ fi
89
+ echo "Exprimental Name: $exp_name"
90
+
91
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/tts/train.py \
92
+ --config $exp_config \
93
+ --exp_name $exp_name \
94
+ --log_level debug
95
+ fi
96
+
97
+ ######## Inference ###########
98
+ if [ $running_stage -eq 3 ]; then
99
+ if [ -z "$infer_expt_dir" ]; then
100
+ echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
101
+ exit 1
102
+ fi
103
+
104
+ if [ -z "$infer_output_dir" ]; then
105
+ infer_output_dir="$infer_expt_dir/result"
106
+ fi
107
+
108
+ if [ -z "$infer_mode" ]; then
109
+ echo "[Error] Please specify the inference mode, e.g., \"batch\", \"single\""
110
+ exit 1
111
+ fi
112
+
113
+ if [ "$infer_mode" = "batch" ] && [ -z "$infer_dataset" ]; then
114
+ echo "[Error] Please specify the dataset used in inference when the inference mode is batch"
115
+ exit 1
116
+ fi
117
+
118
+ if [ "$infer_mode" = "batch" ] && [ -z "$infer_testing_set" ]; then
119
+ echo "[Error] Please specify the testing set used in inference when the inference mode is batch"
120
+ exit 1
121
+ fi
122
+
123
+ if [ "$infer_mode" = "single" ] && [ -z "$infer_text" ]; then
124
+ echo "[Error] Please specify the text to be synthesized when the inference mode is single"
125
+ exit 1
126
+ fi
127
+
128
+ if [ "$infer_mode" = "single" ]; then
129
+ echo 'Text: ' ${infer_text}
130
+ infer_dataset=None
131
+ infer_testing_set=None
132
+ elif [ "$infer_mode" = "batch" ]; then
133
+ infer_text=''
134
+ fi
135
+
136
+
137
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/tts/inference.py \
138
+ --config $exp_config \
139
+ --acoustics_dir $infer_expt_dir \
140
+ --output_dir $infer_output_dir \
141
+ --mode $infer_mode \
142
+ --dataset $infer_dataset \
143
+ --testing_set $infer_testing_set \
144
+ --text "$infer_text" \
145
+ --log_level debug \
146
+ --vocoder_dir /mntnfs/lee_data1/chenxi/processed_data/ljspeech/model_ckpt/hifigan/checkpoints
147
+
148
+
149
+
150
+ fi
egs/tts/NaturalSpeech2/exp_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "egs/tts/NaturalSpeech2/exp_config_base.json",
3
+ "dataset": [
4
+ "LibriTTS"
5
+ ],
6
+ "preprocess": {
7
+ // Specify the output root path to save the processed data
8
+ "processed_dir": "[LibriTTS dataset path]",
9
+ "train_file": "train.json",
10
+ "valid_file": "test.json",
11
+ "read_metadata": true,
12
+ "metadata_dir": "metadata"
13
+ },
14
+ // Specify the output root path to save model ckpts and logs
15
+ "log_dir": "ckpts/tts",
16
+ "train": {
17
+ // New trainer and Accelerator
18
+ "gradient_accumulation_step": 1,
19
+ "tracker": ["tensorboard"],
20
+ "max_epoch": 5000,
21
+ "save_checkpoint_stride": [1],
22
+ "keep_last": [1000],
23
+ "run_eval": [true],
24
+ "dataloader": {
25
+ "num_worker": 16,
26
+ "pin_memory": true
27
+ },
28
+ "adam": {
29
+ "lr": 1.0e-4
30
+ },
31
+ "use_dynamic_batchsize": true,
32
+ "batch_size": 8,
33
+ "max_tokens": 7500,
34
+ "max_sentences": 32,
35
+ "lr_warmup_steps": 5000,
36
+ "lr_scheduler": "cosine",
37
+ "num_train_steps": 800000
38
+ }
39
+ }