Spaces:
Runtime error
Runtime error
Hecheng0625
committed on
Upload 167 files
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +1 -0
- Dockerfile +64 -0
- LICENSE +21 -0
- bins/calc_metrics.py +268 -0
- bins/codec/inference.py +99 -0
- bins/codec/train.py +79 -0
- bins/svc/inference.py +265 -0
- bins/svc/preprocess.py +183 -0
- bins/svc/train.py +111 -0
- bins/tta/inference.py +94 -0
- bins/tta/preprocess.py +195 -0
- bins/tta/train_tta.py +77 -0
- bins/tts/inference.py +169 -0
- bins/tts/preprocess.py +244 -0
- bins/tts/train.py +152 -0
- bins/vocoder/inference.py +115 -0
- bins/vocoder/preprocess.py +151 -0
- bins/vocoder/train.py +93 -0
- config/audioldm.json +92 -0
- config/autoencoderkl.json +69 -0
- config/base.json +185 -0
- config/comosvc.json +215 -0
- config/facodec.json +67 -0
- config/fs2.json +120 -0
- config/jets.json +120 -0
- config/ns2.json +88 -0
- config/svc/base.json +119 -0
- config/svc/diffusion.json +142 -0
- config/transformer.json +179 -0
- config/tts.json +25 -0
- config/valle.json +55 -0
- config/vits.json +101 -0
- config/vitssvc.json +306 -0
- config/vocoder.json +84 -0
- egs/codec/FAcodec/README.md +51 -0
- egs/codec/FAcodec/exp_custom_data.json +80 -0
- egs/codec/FAcodec/train.sh +27 -0
- egs/datasets/README.md +458 -0
- egs/datasets/docker.md +19 -0
- egs/metrics/README.md +174 -0
- egs/metrics/run.sh +132 -0
- egs/svc/DiffComoSVC/README.md +234 -0
- egs/svc/DiffComoSVC/exp_config.json +143 -0
- egs/svc/MultipleContentsSVC/README.md +248 -0
- egs/svc/MultipleContentsSVC/exp_config.json +127 -0
- egs/svc/README.md +34 -0
- egs/svc/TransformerSVC/README.md +164 -0
- egs/svc/TransformerSVC/exp_config.json +108 -0
- egs/svc/VitsSVC/README.md +125 -0
- egs/svc/VitsSVC/exp_config.json +106 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
imgs/vocoder/gan/MSSBCQTD.png filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
# Other version: https://hub.docker.com/r/nvidia/cuda/tags
|
7 |
+
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu18.04

# Build-time arguments. DEBIAN_FRONTEND is an ARG (not ENV) so that it is not
# baked into the runtime environment of the final image.
ARG DEBIAN_FRONTEND=noninteractive
ARG PYTORCH='2.0.0'
ARG CUDA='cu118'
ARG SHELL='/bin/bash'
ARG MINICONDA='Miniconda3-py39_23.3.1-0-Linux-x86_64.sh'

ENV LANG=en_US.UTF-8 PYTHONIOENCODING=utf-8 PYTHONDONTWRITEBYTECODE=1 CUDA_HOME=/usr/local/cuda CONDA_HOME=/opt/conda SHELL=${SHELL}
# Kept in a second ENV instruction on purpose: variables set in the ENV above
# ($CONDA_HOME, $CUDA_HOME) are only expandable in later instructions.
ENV PATH=$CONDA_HOME/bin:$CUDA_HOME/bin:$PATH \
    LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH \
    LIBRARY_PATH=$CUDA_HOME/lib64:$LIBRARY_PATH \
    CONDA_PREFIX=$CONDA_HOME \
    NCCL_HOME=$CUDA_HOME

# Install ubuntu packages.
# - The apt mirrors are rewritten to the Tencent Cloud mirror.
# - `rm -f` so the build does not fail if the base image ships no cuda.list.
# - The apt index is removed in the same layer so it never lands in the image.
RUN sed -i 's/archive.ubuntu.com/mirrors.cloud.tencent.com/g' /etc/apt/sources.list \
    && sed -i 's/security.ubuntu.com/mirrors.cloud.tencent.com/g' /etc/apt/sources.list \
    && rm -f /etc/apt/sources.list.d/cuda.list \
    && apt-get update \
    && apt-get -y install \
        python3-pip ffmpeg git less wget libsm6 libxext6 libxrender-dev \
        build-essential cmake pkg-config libx11-dev libatlas-base-dev \
        libgtk-3-dev libboost-python-dev vim libgl1-mesa-glx \
        libaio-dev software-properties-common tmux \
        espeak-ng \
    && rm -rf /var/lib/apt/lists/*

# Install miniconda with python 3.9
USER root
# COPY Miniconda3-py39_23.3.1-0-Linux-x86_64.sh /root/anaconda.sh
RUN wget -t 0 -c -O /tmp/anaconda.sh https://repo.anaconda.com/miniconda/${MINICONDA} \
    && mv /tmp/anaconda.sh /root/anaconda.sh \
    && ${SHELL} /root/anaconda.sh -b -p $CONDA_HOME \
    && rm /root/anaconda.sh \
    && conda clean -afy

# Create the project environment and drop the package cache in the same layer.
RUN conda create -y --name amphion python=3.9.15 \
    && conda clean -afy

WORKDIR /app
COPY env.sh env.sh
RUN chmod +x ./env.sh

# Run the dependency install script inside the "amphion" environment.
RUN ["conda", "run", "-n", "amphion", "-vvv", "--no-capture-output", "./env.sh"]

# Activate the environment automatically in interactive shells.
# printf is used instead of echo because escape handling in echo is shell-dependent.
RUN conda init \
    && printf '\nconda activate amphion\n' >> ~/.bashrc

CMD ["/bin/bash"]
|
54 |
+
|
55 |
+
# *** Build ***
|
56 |
+
# docker build -t realamphion/amphion .
|
57 |
+
|
58 |
+
# *** Run ***
|
59 |
+
# cd Amphion
|
60 |
+
# docker run --runtime=nvidia --gpus all -it -v .:/app -v /mnt:/mnt_host realamphion/amphion
|
61 |
+
|
62 |
+
# *** Push and release ***
|
63 |
+
# docker login
|
64 |
+
# docker push realamphion/amphion
|
LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2023 Amphion
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
bins/calc_metrics.py
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
import os
|
7 |
+
import sys
|
8 |
+
import numpy as np
|
9 |
+
import json
|
10 |
+
import argparse
|
11 |
+
import whisper
|
12 |
+
import torch
|
13 |
+
|
14 |
+
from glob import glob
|
15 |
+
from tqdm import tqdm
|
16 |
+
from collections import defaultdict
|
17 |
+
|
18 |
+
|
19 |
+
from evaluation.metrics.energy.energy_rmse import extract_energy_rmse
|
20 |
+
from evaluation.metrics.energy.energy_pearson_coefficients import (
|
21 |
+
extract_energy_pearson_coeffcients,
|
22 |
+
)
|
23 |
+
from evaluation.metrics.f0.f0_pearson_coefficients import extract_fpc
|
24 |
+
from evaluation.metrics.f0.f0_periodicity_rmse import extract_f0_periodicity_rmse
|
25 |
+
from evaluation.metrics.f0.f0_rmse import extract_f0rmse
|
26 |
+
from evaluation.metrics.f0.v_uv_f1 import extract_f1_v_uv
|
27 |
+
from evaluation.metrics.intelligibility.character_error_rate import extract_cer
|
28 |
+
from evaluation.metrics.intelligibility.word_error_rate import extract_wer
|
29 |
+
from evaluation.metrics.similarity.speaker_similarity import extract_similarity
|
30 |
+
from evaluation.metrics.spectrogram.frechet_distance import extract_fad
|
31 |
+
from evaluation.metrics.spectrogram.mel_cepstral_distortion import extract_mcd
|
32 |
+
from evaluation.metrics.spectrogram.multi_resolution_stft_distance import extract_mstft
|
33 |
+
from evaluation.metrics.spectrogram.pesq import extract_pesq
|
34 |
+
from evaluation.metrics.spectrogram.scale_invariant_signal_to_distortion_ratio import (
|
35 |
+
extract_si_sdr,
|
36 |
+
)
|
37 |
+
from evaluation.metrics.spectrogram.scale_invariant_signal_to_noise_ratio import (
|
38 |
+
extract_si_snr,
|
39 |
+
)
|
40 |
+
from evaluation.metrics.spectrogram.short_time_objective_intelligibility import (
|
41 |
+
extract_stoi,
|
42 |
+
)
|
43 |
+
|
44 |
+
# Registry mapping each metric name accepted by ``--metrics`` to the function
# that computes it. Insertion order matches the original table.
METRIC_FUNC = dict(
    # energy
    energy_rmse=extract_energy_rmse,
    energy_pc=extract_energy_pearson_coeffcients,
    # f0
    fpc=extract_fpc,
    f0_periodicity_rmse=extract_f0_periodicity_rmse,
    f0rmse=extract_f0rmse,
    v_uv_f1=extract_f1_v_uv,
    # intelligibility
    cer=extract_cer,
    wer=extract_wer,
    # speaker similarity
    similarity=extract_similarity,
    # spectrogram
    fad=extract_fad,
    mcd=extract_mcd,
    mstft=extract_mstft,
    pesq=extract_pesq,
    si_sdr=extract_si_sdr,
    si_snr=extract_si_snr,
    stoi=extract_stoi,
)
|
62 |
+
|
63 |
+
|
64 |
+
def calc_metric(
    ref_dir,
    deg_dir,
    dump_dir,
    metrics,
    **kwargs,
):
    """Compute the requested metrics and write them to ``<dump_dir>/result.json``.

    Args:
        ref_dir: Folder with the reference ``.wav`` files.
        deg_dir: Folder with the ``.wav`` files under test. Each file is paired
            with the file of the same name in ``ref_dir``.
        dump_dir: Folder that receives ``result.json`` (created if missing).
        metrics: Iterable of metric names; each must be a key of ``METRIC_FUNC``.
        **kwargs: Forwarded to every metric function as ``kwargs=kwargs``.
            ``intelligibility_mode`` (and ``ltr_path`` when the mode is
            ``"gt_content"``) are read here for the ``wer``/``cer`` metrics.

    Raises:
        ValueError: If ``intelligibility_mode`` is neither ``"gt_audio"`` nor
            ``"gt_content"`` while computing ``wer``/``cer``.
    """
    result = {}

    # The ASR model is expensive to load, so it is created at most once per
    # call (on first use) instead of once per audio file.
    asr_model = None

    for metric in tqdm(metrics):
        # These metrics operate on whole folders rather than on file pairs.
        if metric in ["fad", "similarity"]:
            result[metric] = str(METRIC_FUNC[metric](ref_dir, deg_dir, kwargs=kwargs))
            continue

        # Pair every degraded file with the reference file of the same name.
        audios_ref = []
        audios_deg = []
        for file in glob(deg_dir + "/*.wav"):
            audios_deg.append(file)
            uid = file.split("/")[-1].split(".wav")[0]
            audios_ref.append(ref_dir + "/{}.wav".format(uid))

        if metric in ["wer", "cer"] and kwargs["intelligibility_mode"] == "gt_content":
            # Transcription file format: "file name|transcription".
            tmpltrs = {}
            with open(kwargs["ltr_path"], "r") as f:
                for line in f:
                    if not line.strip():
                        continue  # tolerate blank lines
                    paras = line.replace("\n", "").split("|")
                    text = paras[1]
                    for ch in (" ", ".", "'", "-", ",", "!"):
                        text = text.replace(ch, "")
                    tmpltrs[paras[0]] = text.lower()
            # Index transcripts in the same order as the audio pairs so that
            # ltrs[i] always belongs to audios_deg[i].
            ltrs = [tmpltrs[os.path.basename(file)] for file in audios_ref]

        if metric in ["v_uv_f1"]:
            tp_total = 0
            fp_total = 0
            fn_total = 0

            for audio_ref, audio_deg in tqdm(list(zip(audios_ref, audios_deg))):
                tp, fp, fn = METRIC_FUNC[metric](audio_ref, audio_deg, kwargs=kwargs)
                tp_total += tp
                fp_total += fp
                fn_total += fn

            denominator = tp_total + (fp_total + fn_total) / 2
            # No voiced/unvoiced decisions at all (e.g. empty folder): report
            # NaN instead of crashing with ZeroDivisionError.
            f1 = tp_total / denominator if denominator != 0 else float("nan")
            result[metric] = str(f1)
        else:
            scores = []
            for i in tqdm(range(len(audios_ref))):
                audio_ref = audios_ref[i]
                audio_deg = audios_deg[i]

                if metric in ["wer", "cer"]:
                    if asr_model is None:
                        asr_model = whisper.load_model("large")
                        if torch.cuda.is_available():
                            asr_model = asr_model.to(torch.device("cuda"))

                    mode = kwargs["intelligibility_mode"]
                    if mode == "gt_audio":
                        kwargs["audio_ref"] = audio_ref
                        kwargs["audio_deg"] = audio_deg
                    elif mode == "gt_content":
                        kwargs["content_gt"] = ltrs[i]
                        kwargs["audio_deg"] = audio_deg
                    else:
                        raise ValueError(
                            "Unsupported intelligibility_mode: {}".format(mode)
                        )
                    score = METRIC_FUNC[metric](
                        asr_model,
                        kwargs=kwargs,
                    )
                else:
                    score = METRIC_FUNC[metric](
                        audio_ref,
                        audio_deg,
                        kwargs=kwargs,
                    )
                # NaN scores are excluded from the mean.
                if not np.isnan(score):
                    scores.append(score)

            result["{}".format(metric)] = str(np.mean(np.array(scores)))

    data = json.dumps(result, indent=4)

    os.makedirs(dump_dir, exist_ok=True)
    with open(os.path.join(dump_dir, "result.json"), "w", newline="\n") as f:
        f.write(data)
|
165 |
+
|
166 |
+
|
167 |
+
if __name__ == "__main__":
    # Command-line entry point: parse the options, convert the string-typed
    # switches to Python values and forward everything to calc_metric().
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--ref_dir",
        type=str,
        help="Path to the reference audio folder.",
    )
    parser.add_argument(
        "--deg_dir",
        type=str,
        help="Path to the test audio folder.",
    )
    parser.add_argument(
        "--dump_dir",
        type=str,
        help="Path to dump the results.",
    )
    parser.add_argument(
        "--metrics",
        nargs="+",
        help="Metrics used to evaluate.",
    )
    parser.add_argument(
        "--fs",
        type=str,
        default="None",
        help="(Optional) Sampling rate",
    )
    parser.add_argument(
        "--align_method",
        type=str,
        default="dtw",
        help="(Optional) Method for aligning feature length. ['cut', 'dtw']",
    )

    parser.add_argument(
        "--db_scale",
        type=str,
        default="True",
        help="(Optional) Whether or not computing energy related metrics in db scale.",
    )
    parser.add_argument(
        "--f0_subtract_mean",
        type=str,
        default="True",
        help="(Optional) Whether or not computing f0 related metrics with mean value subtracted.",
    )

    parser.add_argument(
        "--similarity_model",
        type=str,
        default="wavlm",
        help="(Optional) The model for computing speaker similarity. "
        "['rawnet', 'wavlm', 'resemblyzer']",
    )
    parser.add_argument(
        "--similarity_mode",
        type=str,
        default="pairwith",
        help="(Optional) The method of calculating similarity, where set to overall "
        "means computing the speaker similarity between two folder of audios "
        "content freely, and set to pairwith means computing the speaker "
        "similarity between a series of paired gt/pred audios",
    )

    parser.add_argument(
        "--ltr_path",
        type=str,
        default="None",
        help="(Optional) Path to the transcription file. Note that the format in "
        "the transcription file is 'file name|transcription'",
    )
    parser.add_argument(
        "--intelligibility_mode",
        type=str,
        default="gt_audio",
        help="(Optional) The method of calculating WER and CER, where set to "
        "gt_audio means selecting the recognition content of the reference "
        "audio as the target, and set to gt_content means using transcription "
        "as the target",
    )
    parser.add_argument(
        "--language",
        type=str,
        default="english",
        help="(Optional) ['english','chinese']",
    )

    args = parser.parse_args()

    calc_metric(
        args.ref_dir,
        args.deg_dir,
        args.dump_dir,
        args.metrics,
        # "--fs None" (the default) means "no explicit sampling rate".
        fs=int(args.fs) if args.fs != "None" else None,
        method=args.align_method,
        # Boolean switches are passed as the literal strings "True"/"False".
        db_scale=args.db_scale == "True",
        need_mean=args.f0_subtract_mean == "True",
        model_name=args.similarity_model,
        similarity_mode=args.similarity_mode,
        ltr_path=args.ltr_path,
        intelligibility_mode=args.intelligibility_mode,
        language=args.language,
    )
|
bins/codec/inference.py
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
import argparse
|
7 |
+
from argparse import ArgumentParser
|
8 |
+
import os
|
9 |
+
|
10 |
+
from models.codec.facodec.facodec_inference import FAcodecInference
|
11 |
+
from utils.util import load_config
|
12 |
+
import torch
|
13 |
+
|
14 |
+
|
15 |
+
def build_inference(args, cfg):
    """Create the inference object registered for ``cfg.model_type``.

    A ``KeyError`` propagates when the model type is not registered.
    """
    registry = {"FAcodec": FAcodecInference}
    return registry[cfg.model_type](args, cfg)
|
23 |
+
|
24 |
+
|
25 |
+
def cuda_relevant(deterministic=False):
    """Set the global torch CUDA/cuDNN flags used by this script.

    Args:
        deterministic: When true, request deterministic algorithms and turn the
            cuDNN benchmark mode off; when false, do the opposite.
    """
    torch.cuda.empty_cache()

    # TF32 on Ampere and above
    torch.backends.cuda.matmul.allow_tf32 = True

    cudnn_flags = (
        ("enabled", True),
        ("allow_tf32", True),
        # Deterministic
        ("deterministic", deterministic),
        ("benchmark", not deterministic),
    )
    for flag_name, flag_value in cudnn_flags:
        setattr(torch.backends.cudnn, flag_name, flag_value)

    torch.use_deterministic_algorithms(deterministic)
|
35 |
+
|
36 |
+
|
37 |
+
def build_parser():
    """Build the command-line parser for codec inference.

    Returns:
        argparse.ArgumentParser: Parser with ``--config`` and ``--source``
        required, and ``--checkpoint_path``, ``--reference`` and
        ``--output_dir`` optional (all defaulting to ``None``).
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--config",
        type=str,
        required=True,
        help="JSON/YAML file for configurations.",
    )
    parser.add_argument(
        "--checkpoint_path",
        type=str,
        default=None,
        help="Acoustic model checkpoint directory. If a directory is given, "
        "search for the latest checkpoint dir in the directory. If a specific "
        "checkpoint dir is given, directly load the checkpoint.",
    )
    parser.add_argument(
        "--source",
        type=str,
        required=True,
        help="Path to the source audio file",
    )
    parser.add_argument(
        "--reference",
        type=str,
        default=None,
        # The original help text was cut off mid-sentence ("passing an").
        help="Path to the reference audio file. Passing a reference "
        "additionally runs voice conversion.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default=None,
        help="Output dir for saving generated results",
    )
    return parser
|
73 |
+
|
74 |
+
|
75 |
+
def main():
    """Script entry point: parse arguments, load the config and run inference.

    Voice conversion is run in addition when ``--reference`` is given.
    """
    # Command-line arguments
    args = build_parser().parse_args()
    print(args)

    # Configuration file
    cfg = load_config(args.config)

    # Global CUDA flags
    cuda_relevant()

    # Inference object for the configured model type
    inferencer = build_inference(args, cfg)

    # Plain inference on the source audio (return value is not used)
    inferencer.inference(args.source, args.output_dir)

    # Optional voice conversion towards the reference audio
    if args.reference is not None:
        inferencer.voice_conversion(args.source, args.reference, args.output_dir)


if __name__ == "__main__":
    main()
|
bins/codec/train.py
ADDED
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
import argparse
|
7 |
+
|
8 |
+
import torch
|
9 |
+
|
10 |
+
from models.codec.facodec.facodec_trainer import FAcodecTrainer
|
11 |
+
|
12 |
+
from utils.util import load_config
|
13 |
+
|
14 |
+
|
15 |
+
def build_trainer(args, cfg):
    """Create the trainer object registered for ``cfg.model_type``.

    A ``KeyError`` propagates when the model type is not registered.
    """
    registry = {"FAcodec": FAcodecTrainer}
    return registry[cfg.model_type](args, cfg)
|
23 |
+
|
24 |
+
|
25 |
+
def cuda_relevant(deterministic=False):
    """Set the global torch CUDA/cuDNN flags used for training.

    Args:
        deterministic: When true, request deterministic algorithms and turn the
            cuDNN benchmark mode off; when false, do the opposite.
    """
    torch.cuda.empty_cache()
    # TF32 on Ampere and above
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.allow_tf32 = True
    # Deterministic
    # (The earlier unconditional `cudnn.benchmark = False` was a dead store:
    # it was always overwritten by the assignment below.)
    torch.backends.cudnn.deterministic = deterministic
    torch.backends.cudnn.benchmark = not deterministic
    torch.use_deterministic_algorithms(deterministic)
|
36 |
+
|
37 |
+
|
38 |
+
def main():
    """Script entry point: parse arguments, load the config and run training."""
    parser = argparse.ArgumentParser()
    # --config and --exp_name are required, so they carry no default value:
    # a default combined with required=True can never take effect.
    parser.add_argument(
        "--config",
        help="json files for configurations.",
        required=True,
    )
    parser.add_argument(
        "--exp_name",
        type=str,
        help="A specific name to note the experiment",
        required=True,
    )
    parser.add_argument(
        "--resume_type",
        type=str,
        help="resume for continue to train, finetune for finetuning",
    )
    parser.add_argument(
        "--checkpoint",
        type=str,
        help="checkpoint to resume",
    )
    parser.add_argument(
        "--log_level", default="warning", help="logging level (debug, info, warning)"
    )
    args = parser.parse_args()
    cfg = load_config(args.config)

    # CUDA settings
    cuda_relevant()

    # Build trainer
    trainer = build_trainer(args, cfg)

    trainer.train_loop()


if __name__ == "__main__":
    main()
|
bins/svc/inference.py
ADDED
@@ -0,0 +1,265 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
import argparse
|
7 |
+
import os
|
8 |
+
import glob
|
9 |
+
from tqdm import tqdm
|
10 |
+
import json
|
11 |
+
import torch
|
12 |
+
import time
|
13 |
+
|
14 |
+
from models.svc.diffusion.diffusion_inference import DiffusionInference
|
15 |
+
from models.svc.comosvc.comosvc_inference import ComoSVCInference
|
16 |
+
from models.svc.transformer.transformer_inference import TransformerInference
|
17 |
+
from models.svc.vits.vits_inference import VitsInference
|
18 |
+
from utils.util import load_config
|
19 |
+
from utils.audio_slicer import split_audio, merge_segments_encodec
|
20 |
+
from processors import acoustic_extractor, content_extractor
|
21 |
+
|
22 |
+
|
23 |
+
def build_inference(args, cfg, infer_type="from_dataset"):
    """Create the SVC inference object registered for ``cfg.model_type``.

    ``infer_type`` is passed through unchanged to the selected class. A
    ``KeyError`` propagates when the model type is not registered.
    """
    registry = {
        "DiffWaveNetSVC": DiffusionInference,
        "DiffComoSVC": ComoSVCInference,
        "TransformerSVC": TransformerInference,
        "VitsSVC": VitsInference,
    }
    chosen_cls = registry[cfg.model_type]
    return chosen_cls(args, cfg, infer_type)
|
33 |
+
|
34 |
+
|
35 |
+
def prepare_for_audio_file(args, cfg, num_workers=1):
    """Prepare metadata and features for the audio file named in ``cfg.inference``.

    Writes the eval metadata file, points ``args.source`` at it, then extracts
    acoustic and content features. Each stage prints its elapsed time.

    Returns:
        tuple: ``(args, cfg, temp_audio_dir)`` where ``temp_audio_dir`` is
        ``<cfg.preprocess.processed_dir>/<source audio name>``.
    """
    processed_root = cfg.preprocess.processed_dir
    source_name = cfg.inference.source_audio_name
    temp_audio_dir = os.path.join(processed_root, source_name)

    # --- Stage 1: eval metadata file ---
    started = time.time()
    eval_file = prepare_source_eval_file(cfg, temp_audio_dir, source_name)
    args.source = eval_file
    with open(eval_file, "r") as fp:
        metadata = json.load(fp)
    print("Prepare for meta eval data: {:.1f}s".format(time.time() - started))

    # --- Stage 2: acoustic features ---
    started = time.time()
    acoustic_extractor.extract_utt_acoustic_features_serial(
        metadata, temp_audio_dir, cfg
    )
    stats_kwargs = dict(
        dataset=source_name, output_path=processed_root, cfg=cfg, metadata=metadata
    )
    if cfg.preprocess.use_min_max_norm_mel == True:
        acoustic_extractor.cal_mel_min_max(**stats_kwargs)
    acoustic_extractor.cal_pitch_statistics_svc(**stats_kwargs)
    print("Prepare for acoustic features: {:.1f}s".format(time.time() - started))

    # --- Stage 3: content features ---
    started = time.time()
    content_extractor.extract_utt_content_features_dataloader(
        cfg, metadata, num_workers
    )
    print("Prepare for content features: {:.1f}s".format(time.time() - started))

    return args, cfg, temp_audio_dir
|
69 |
+
|
70 |
+
|
71 |
+
def merge_for_audio_segments(audio_files, args, cfg):
    """Merge converted segments into one wav file and delete the segment files.

    The merged file is written to
    ``<args.output_dir>/<source audio name>_<target singer>.wav``.
    """
    merged_name = "{}_{}.wav".format(
        cfg.inference.source_audio_name, args.target_singer
    )
    merged_path = os.path.join(args.output_dir, merged_name)

    merge_segments_encodec(
        wav_files=audio_files,
        fs=cfg.preprocess.sample_rate,
        output_path=merged_path,
        overlap_duration=cfg.inference.segments_overlap_duration,
    )

    # The individual segment files are no longer needed after merging.
    for segment_path in audio_files:
        os.remove(segment_path)
|
86 |
+
|
87 |
+
|
88 |
+
def prepare_source_eval_file(cfg, temp_audio_dir, audio_name):
    """
    Split the source audio into segments and write their metadata to
    ``<temp_audio_dir>/eval.json``; return the path of that JSON file.
    """
    inference_cfg = cfg.inference
    segments = split_audio(
        wav_file=inference_cfg.source_audio_path,
        target_sr=cfg.preprocess.sample_rate,
        output_dir=os.path.join(temp_audio_dir, "wavs"),
        max_duration_of_segment=inference_cfg.segments_max_duration,
        overlap_duration=inference_cfg.segments_overlap_duration,
    )

    # Tag every segment with its position and with the audio name, which is
    # used as dataset name, singer name and Uid prefix alike.
    metadata = []
    for position, segment in enumerate(segments):
        segment["index"] = position
        segment["Dataset"] = audio_name
        segment["Singer"] = audio_name
        segment["Uid"] = "{}_{}".format(audio_name, segment["Uid"])
        metadata.append(segment)

    eval_file = os.path.join(temp_audio_dir, "eval.json")
    with open(eval_file, "w") as fp:
        json.dump(metadata, fp, indent=4, ensure_ascii=False, sort_keys=True)

    return eval_file
|
114 |
+
|
115 |
+
|
116 |
+
def cuda_relevant(deterministic=False):
    """Set the global torch CUDA/cuDNN flags used for SVC inference.

    Args:
        deterministic: When true, request deterministic algorithms and turn the
            cuDNN benchmark mode off; when false, do the opposite.
    """
    torch.cuda.empty_cache()

    # TF32 on Ampere and above
    torch.backends.cuda.matmul.allow_tf32 = True

    for flag_name, flag_value in {
        "enabled": True,
        "allow_tf32": True,
        # Deterministic
        "deterministic": deterministic,
        "benchmark": not deterministic,
    }.items():
        setattr(torch.backends.cudnn, flag_name, flag_value)

    torch.use_deterministic_algorithms(deterministic)
|
126 |
+
|
127 |
+
|
128 |
+
def infer(args, cfg, infer_type):
    """Build the inference object, run it, and return what ``inference()`` returns.

    The time spent on model construction and on inference is printed.
    """
    # Model construction
    init_started = time.time()
    inferencer = build_inference(args, cfg, infer_type)
    print("Model Init: {:.1f}s".format(time.time() - init_started))

    # Inference
    run_started = time.time()
    output_audio_files = inferencer.inference()
    print("Model inference: {:.1f}s".format(time.time() - run_started))

    return output_audio_files
|
139 |
+
|
140 |
+
|
141 |
+
def build_parser():
    r"""Build argument parser for inference.py.
    Anything else should be put in an extra config YAML file.

    Returns:
        argparse.ArgumentParser: Parser with ``--config``, ``--vocoder_dir`` and
        ``--target_singer`` required. ``keep_cache`` defaults to ``True`` and
        can be switched off with ``--no_keep_cache``.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config",
        type=str,
        required=True,
        help="JSON/YAML file for configurations.",
    )
    parser.add_argument(
        "--acoustics_dir",
        type=str,
        help="Acoustics model checkpoint directory. If a directory is given, "
        "search for the latest checkpoint dir in the directory. If a specific "
        "checkpoint dir is given, directly load the checkpoint.",
    )
    parser.add_argument(
        "--vocoder_dir",
        type=str,
        required=True,
        help="Vocoder checkpoint directory. Searching behavior is the same as "
        "the acoustics one.",
    )
    parser.add_argument(
        "--target_singer",
        type=str,
        required=True,
        # The help text used to name a non-existent "--target_singers" flag.
        help="convert to a specific singer (e.g. --target_singer singer_id).",
    )
    parser.add_argument(
        "--trans_key",
        default=0,
        help="0: no pitch shift; autoshift: pitch shift; int: key shift.",
    )
    parser.add_argument(
        "--source",
        type=str,
        default="source_audio",
        help="Source audio file or directory. If a JSON file is given, "
        "inference from dataset is applied. If a directory is given, "
        "inference from all wav/flac/mp3 audio files in the directory is applied. "
        "Default: inference from all wav/flac/mp3 audio files in ./source_audio",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="conversion_results",
        help="Output directory. Default: ./conversion_results",
    )
    parser.add_argument(
        "--log_level",
        type=str,
        default="warning",
        help="Logging level. Default: warning",
    )
    parser.add_argument(
        "--keep_cache",
        action="store_true",
        default=True,
        help="Keep cache files. Only applicable to inference from files.",
    )
    # --keep_cache is store_true with default True, so on its own it can never
    # yield False. This companion flag makes cache removal selectable while
    # leaving the existing flag and default untouched.
    parser.add_argument(
        "--no_keep_cache",
        dest="keep_cache",
        action="store_false",
        default=True,
        help="Remove cache files after conversion. Only applicable to inference "
        "from files.",
    )
    parser.add_argument(
        "--diffusion_inference_steps",
        type=int,
        default=1000,
        help="Number of inference steps. Only applicable to diffusion inference.",
    )
    return parser
|
212 |
+
|
213 |
+
|
214 |
+
def main():
    """Run singing voice conversion from audio files or from a dataset.

    Parses the command line, loads the configuration and applies the CUDA
    settings. When ``--source`` is an existing directory, every wav/flac/mp3
    file found (recursively) below it is converted on its own: metadata and
    features are prepared, inference runs in "from_file" mode, and the
    resulting segments are merged. Otherwise inference runs in
    "from_dataset" mode.
    """
    # Local import: only needed for removing cache directories below.
    import shutil

    ### Parse arguments and config
    args = build_parser().parse_args()
    cfg = load_config(args.config)

    # CUDA settings
    cuda_relevant()

    if os.path.isdir(args.source):
        ### Infer from file

        # Get all the source audio files (.wav, .flac, .mp3)
        source_audio_dir = args.source
        audio_list = []
        for suffix in ["wav", "flac", "mp3"]:
            audio_list += glob.glob(
                os.path.join(source_audio_dir, "**/*.{}".format(suffix)), recursive=True
            )
        print("There are {} source audios: ".format(len(audio_list)))

        # Infer for every file as dataset
        output_root_path = args.output_dir
        for audio_path in tqdm(audio_list):
            # Use os.path so the name is derived correctly on every platform,
            # and strip only the final extension so that "a.v1.wav" and
            # "a.v2.wav" do not collapse into the same output directory.
            audio_name = os.path.splitext(os.path.basename(audio_path))[0]
            args.output_dir = os.path.join(output_root_path, audio_name)
            print("\n{}\nConversion for {}...\n".format("*" * 10, audio_name))

            cfg.inference.source_audio_path = audio_path
            cfg.inference.source_audio_name = audio_name
            cfg.inference.segments_max_duration = 10.0
            cfg.inference.segments_overlap_duration = 1.0

            # Prepare metadata and features
            args, cfg, cache_dir = prepare_for_audio_file(args, cfg)

            # Infer from file
            output_audio_files = infer(args, cfg, infer_type="from_file")

            # Merge the split segments
            merge_for_audio_segments(output_audio_files, args, cfg)

            # Keep or remove caches
            if not args.keep_cache:
                # os.removedirs() only removes *empty* directories (and then
                # walks up removing empty parents); rmtree deletes the cache
                # directory together with its contents and nothing else.
                shutil.rmtree(cache_dir, ignore_errors=True)

    else:
        ### Infer from dataset
        infer(args, cfg, infer_type="from_dataset")
|
262 |
+
|
263 |
+
|
264 |
+
if __name__ == "__main__":
|
265 |
+
main()
|
bins/svc/preprocess.py
ADDED
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
import faulthandler
|
7 |
+
|
8 |
+
faulthandler.enable()
|
9 |
+
|
10 |
+
import os
|
11 |
+
import argparse
|
12 |
+
import json
|
13 |
+
from multiprocessing import cpu_count
|
14 |
+
|
15 |
+
|
16 |
+
from utils.util import load_config
|
17 |
+
from preprocessors.processor import preprocess_dataset
|
18 |
+
from preprocessors.metadata import cal_metadata
|
19 |
+
from processors import acoustic_extractor, content_extractor, data_augment
|
20 |
+
|
21 |
+
|
22 |
+
def extract_acoustic_features(dataset, output_path, cfg, n_workers=1):
    """Load a dataset's split metadata and extract its acoustic features.

    The utterance lists are read from ``<output_path>/<dataset>/<split>.json``
    and passed to the serial acoustic feature extractor (the parallel variant
    is currently disabled).

    Args:
        dataset (str): name of dataset, e.g. opencpop
        output_path (str): directory that stores train, test and feature files of datasets
        cfg (dict): dictionary that stores configurations
        n_workers (int, optional): num of processes to extract features in parallel.
            Not used by the serial extractor. Defaults to 1.
    """
    dataset_output = os.path.join(output_path, dataset)

    # Datasets whose name contains "eval" only provide a test split.
    splits = ["test"] if "eval" in dataset else ["train", "test"]

    metadata = []
    for split in splits:
        split_file = os.path.join(dataset_output, "{}.json".format(split))
        with open(split_file, "r") as f:
            metadata += json.load(f)

    acoustic_extractor.extract_utt_acoustic_features_serial(
        metadata, dataset_output, cfg
    )
|
46 |
+
|
47 |
+
|
48 |
+
def extract_content_features(dataset, output_path, cfg, num_workers=1):
    """Load a dataset's split metadata and extract its content features.

    Args:
        dataset (str): name of dataset, e.g. opencpop
        output_path (str): directory that stores train, test and feature files of datasets
        cfg (dict): dictionary that stores configurations
        num_workers (int, optional): forwarded to the dataloader-based
            content extractor. Defaults to 1.
    """
    dataset_output = os.path.join(output_path, dataset)

    # Datasets whose name contains "eval" only provide a test split.
    splits = ["test"] if "eval" in dataset else ["train", "test"]

    metadata = []
    for split in splits:
        split_file = os.path.join(dataset_output, "{}.json".format(split))
        with open(split_file, "r") as f:
            metadata += json.load(f)

    content_extractor.extract_utt_content_features_dataloader(
        cfg, metadata, num_workers
    )
|
67 |
+
|
68 |
+
|
69 |
+
def preprocess(cfg, args):
    """Preprocess raw data of single or multiple datasets (in cfg.dataset)

    Steps, in order: split every dataset into train/test sets, optionally
    create augmented datasets, dump the dataset metadata, extract acoustic
    features and their statistics, copy acoustic features to the augmented
    datasets, and finally extract content features.

    Args:
        cfg (dict): dictionary that stores configurations
        args (ArgumentParser): specify the configuration file and num_workers
    """
    # Dataset-name markers of augmented datasets whose acoustic features are
    # copied from the original dataset instead of being extracted again.
    copied_augment_tags = ("pitch_shift", "formant_shift", "equalizer")

    # Specify the output root path to save the processed data
    output_path = cfg.preprocess.processed_dir
    os.makedirs(output_path, exist_ok=True)

    ## Split train and test sets
    for dataset in cfg.dataset:
        print("Preprocess {}...".format(dataset))
        preprocess_dataset(
            dataset,
            cfg.dataset_path[dataset],
            output_path,
            cfg.preprocess,
            cfg.task_type,
            is_custom_dataset=dataset in cfg.use_custom_dataset,
        )

    # Data augmentation: create new wav files with pitch shift, formant shift, equalizer, time stretch
    try:
        augment_targets = cfg.preprocess.data_augment
        if not isinstance(augment_targets, list):
            raise TypeError("Please provide a list of datasets need to be augmented.")
        if len(augment_targets) > 0:
            new_datasets_list = []
            for dataset in augment_targets:
                new_datasets = data_augment.augment_dataset(cfg, dataset)
                new_datasets_list.extend(new_datasets)
            cfg.dataset.extend(new_datasets_list)
            print("Augmentation datasets: ", cfg.dataset)
    except Exception as err:
        # Augmentation stays best-effort, but the reason is reported instead
        # of being silently dropped, and KeyboardInterrupt/SystemExit are no
        # longer swallowed (the original used a bare ``except:``).
        print("No Data Augmentation.")
        print("Reason: {}: {}".format(type(err).__name__, err))

    # Dump metadata of datasets (singers, train/test durations, etc.)
    cal_metadata(cfg)

    ## Prepare the acoustic features
    for dataset in cfg.dataset:
        # Skip augmented datasets which do not need to extract acoustic features
        # We will copy acoustic features from the original dataset later
        if any(tag in dataset for tag in copied_augment_tags):
            continue
        print(
            "Extracting acoustic features for {} using {} workers ...".format(
                dataset, args.num_workers
            )
        )
        extract_acoustic_features(dataset, output_path, cfg, args.num_workers)
        # Calculate the statistics of acoustic features
        if cfg.preprocess.mel_min_max_norm:
            acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)

        if cfg.preprocess.extract_pitch:
            acoustic_extractor.cal_pitch_statistics_svc(dataset, output_path, cfg)

    # Copy acoustic features for augmented datasets by creating soft-links
    for dataset in cfg.dataset:
        # First matching marker wins, in the same order as the original
        # if/elif chain.
        matched_tag = next(
            (tag for tag in copied_augment_tags if tag in dataset), None
        )
        if matched_tag is None:
            continue
        src_dataset = dataset.replace("_" + matched_tag, "")
        src_dataset_dir = os.path.join(output_path, src_dataset)

        dataset_dir = os.path.join(output_path, dataset)
        metadata = []
        for split in ["train", "test"] if "eval" not in dataset else ["test"]:
            metadata_file_path = os.path.join(src_dataset_dir, "{}.json".format(split))
            with open(metadata_file_path, "r") as f:
                metadata.extend(json.load(f))
        print("Copying acoustic features for {}...".format(dataset))
        acoustic_extractor.copy_acoustic_features(
            metadata, dataset_dir, src_dataset_dir, cfg
        )
        if cfg.preprocess.mel_min_max_norm:
            acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)

        if cfg.preprocess.extract_pitch:
            # NOTE(review): the extraction loop above calls
            # cal_pitch_statistics_svc while this one calls
            # cal_pitch_statistics -- confirm the difference is intended.
            acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)

    # Prepare the content features
    for dataset in cfg.dataset:
        print("Extracting content features for {}...".format(dataset))
        extract_content_features(dataset, output_path, cfg, args.num_workers)
|
166 |
+
|
167 |
+
|
168 |
+
def main():
    """Parse the command line, load the configuration and run preprocessing.

    Command-line options:
        --config: path of the JSON configuration file (default "config.json").
        --num_workers: number of workers (default: number of CPUs).
        --prepare_alignment: boolean flag value, e.g. "True"/"False"
            (default False).
    """

    def _str2bool(value):
        """Convert a command-line string to a bool.

        ``type=bool`` would treat every non-empty string -- including
        "False" -- as True, so the text is parsed explicitly instead.
        """
        text = str(value).strip().lower()
        if text in ("true", "t", "yes", "y", "1", "on"):
            return True
        if text in ("false", "f", "no", "n", "0", "off", ""):
            return False
        raise argparse.ArgumentTypeError(
            "Expected a boolean value, got {!r}".format(value)
        )

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config", default="config.json", help="json files for configurations."
    )
    parser.add_argument("--num_workers", type=int, default=int(cpu_count()))
    parser.add_argument("--prepare_alignment", type=_str2bool, default=False)

    args = parser.parse_args()
    cfg = load_config(args.config)

    preprocess(cfg, args)
|
180 |
+
|
181 |
+
|
182 |
+
if __name__ == "__main__":
|
183 |
+
main()
|
bins/svc/train.py
ADDED
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
import argparse
|
7 |
+
|
8 |
+
import torch
|
9 |
+
|
10 |
+
from models.svc.diffusion.diffusion_trainer import DiffusionTrainer
|
11 |
+
from models.svc.comosvc.comosvc_trainer import ComoSVCTrainer
|
12 |
+
from models.svc.transformer.transformer_trainer import TransformerTrainer
|
13 |
+
from models.svc.vits.vits_trainer import VitsSVCTrainer
|
14 |
+
from utils.util import load_config
|
15 |
+
|
16 |
+
|
17 |
+
def build_trainer(args, cfg):
    """Create the SVC trainer selected by ``cfg.model_type``.

    Args:
        args: parsed command-line arguments, forwarded to the trainer.
        cfg: loaded configuration; ``cfg.model_type`` picks the trainer class.

    Returns:
        The trainer instance constructed with ``(args, cfg)``.

    Raises:
        KeyError: if ``cfg.model_type`` is not one of the supported names.
    """
    trainer_registry = {
        "DiffWaveNetSVC": DiffusionTrainer,
        "DiffComoSVC": ComoSVCTrainer,
        "TransformerSVC": TransformerTrainer,
        "VitsSVC": VitsSVCTrainer,
    }
    return trainer_registry[cfg.model_type](args, cfg)
|
28 |
+
|
29 |
+
|
30 |
+
def cuda_relevant(deterministic=False):
    """Apply the process-wide CUDA/cuDNN settings used by the scripts.

    Empties the CUDA cache, enables cuDNN and TF32, and switches between
    deterministic and benchmark mode.

    Args:
        deterministic (bool, optional): when True, request deterministic
            algorithms and disable cuDNN benchmarking; when False, do the
            opposite. Defaults to False.
    """
    cudnn = torch.backends.cudnn

    torch.cuda.empty_cache()

    # TF32 on Ampere and above
    torch.backends.cuda.matmul.allow_tf32 = True
    cudnn.enabled = True
    cudnn.allow_tf32 = True

    # Deterministic
    cudnn.deterministic = deterministic
    cudnn.benchmark = not deterministic
    torch.use_deterministic_algorithms(deterministic)
|
40 |
+
|
41 |
+
|
42 |
+
def main():
    """Parse the command line, load the configuration and train an SVC model.

    After loading the configuration, the names of the augmented datasets
    (one per enabled augmentation and per dataset listed in
    ``cfg.preprocess.data_augment``) are appended to ``cfg.dataset``. Then
    the CUDA settings are applied, the trainer is built and its training
    loop is started.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config",
        required=True,
        default="config.json",
        help="json files for configurations.",
    )
    parser.add_argument(
        "--exp_name",
        required=True,
        type=str,
        default="exp_name",
        help="A specific name to note the experiment",
    )
    parser.add_argument(
        "--resume",
        action="store_true",
        help="If specified, to resume from the existing checkpoint.",
    )
    parser.add_argument(
        "--resume_from_ckpt_path",
        type=str,
        default="",
        help="The specific checkpoint path that you want to resume from.",
    )
    parser.add_argument(
        "--resume_type",
        type=str,
        default="",
        help="`resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights",
    )
    parser.add_argument(
        "--log_level", default="warning", help="logging level (debug, info, warning)"
    )

    args = parser.parse_args()
    cfg = load_config(args.config)

    # Data Augmentation
    augment_targets = cfg.preprocess.data_augment
    if type(augment_targets) is list and len(augment_targets) > 0:
        # (dataset-name suffix, enabled?) in the order the names are appended
        suffix_switches = (
            ("pitch_shift", cfg.preprocess.use_pitch_shift),
            ("formant_shift", cfg.preprocess.use_formant_shift),
            ("equalizer", cfg.preprocess.use_equalizer),
            ("time_stretch", cfg.preprocess.use_time_stretch),
        )
        augmented_names = [
            f"{dataset}_{suffix}"
            for dataset in augment_targets
            for suffix, enabled in suffix_switches
            if enabled
        ]
        cfg.dataset.extend(augmented_names)

    # CUDA settings
    cuda_relevant()

    # Build trainer and start training
    build_trainer(args, cfg).train_loop()
|
108 |
+
|
109 |
+
|
110 |
+
if __name__ == "__main__":
|
111 |
+
main()
|
bins/tta/inference.py
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
import argparse
|
7 |
+
from argparse import ArgumentParser
|
8 |
+
import os
|
9 |
+
|
10 |
+
from models.tta.ldm.audioldm_inference import AudioLDMInference
|
11 |
+
from utils.util import save_config, load_model_config, load_config
|
12 |
+
import numpy as np
|
13 |
+
import torch
|
14 |
+
|
15 |
+
|
16 |
+
def build_inference(args, cfg):
    """Create the text-to-audio inference object selected by ``cfg.model_type``.

    Args:
        args: parsed command-line arguments, forwarded to the inference class.
        cfg: loaded configuration; ``cfg.model_type`` picks the class.

    Returns:
        The inference instance constructed with ``(args, cfg)``.

    Raises:
        KeyError: if ``cfg.model_type`` is not a supported name.
    """
    inference_registry = {
        "AudioLDM": AudioLDMInference,
    }
    return inference_registry[cfg.model_type](args, cfg)
|
24 |
+
|
25 |
+
|
26 |
+
def build_parser():
    """Build the command-line parser of the text-to-audio inference script.

    Returns:
        argparse.ArgumentParser: parser with the ``--config`` (required),
        ``--text``, ``--checkpoint_path``, ``--vocoder_path``,
        ``--vocoder_config_path``, ``--output_dir``, ``--num_steps``,
        ``--guidance_scale`` and ``--local_rank`` options.
    """
    # (flag, add_argument keyword arguments), in registration order
    option_table = [
        (
            "--config",
            dict(type=str, required=True, help="JSON/YAML file for configurations."),
        ),
        (
            "--text",
            dict(
                type=str,
                default="Text to be synthesized.",
                help="Text to be synthesized",
            ),
        ),
        ("--checkpoint_path", dict(type=str)),
        ("--vocoder_path", dict(type=str, help="Checkpoint path of the vocoder")),
        ("--vocoder_config_path", dict(type=str, help="Config path of the vocoder")),
        (
            "--output_dir",
            dict(
                type=str,
                default=None,
                help="Output dir for saving generated results",
            ),
        ),
        (
            "--num_steps",
            dict(type=int, default=200, help="The total number of denosing steps"),
        ),
        (
            "--guidance_scale",
            dict(
                type=float,
                default=4.0,
                help="The scale of classifer free guidance",
            ),
        ),
        ("--local_rank", dict(type=int, default=-1)),
    ]

    parser = argparse.ArgumentParser()
    for flag, options in option_table:
        parser.add_argument(flag, **options)
    return parser
|
71 |
+
|
72 |
+
|
73 |
+
def main():
    """Parse the command line, load the configuration and run inference.

    ``args.local_rank`` is overwritten with the torch device to use: CUDA
    when it is available, otherwise the CPU.
    """
    # Parse arguments
    args = build_parser().parse_args()

    # Parse config
    cfg = load_config(args.config)

    # Select the device
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    args.local_rank = torch.device(device_name)
    print("args: ", args)

    # Build inference and run it
    build_inference(args, cfg).inference()
|
91 |
+
|
92 |
+
|
93 |
+
if __name__ == "__main__":
|
94 |
+
main()
|
bins/tta/preprocess.py
ADDED
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
import faulthandler
|
7 |
+
|
8 |
+
faulthandler.enable()
|
9 |
+
|
10 |
+
import os
|
11 |
+
import argparse
|
12 |
+
import json
|
13 |
+
import pyworld as pw
|
14 |
+
from multiprocessing import cpu_count
|
15 |
+
|
16 |
+
|
17 |
+
from utils.util import load_config
|
18 |
+
from preprocessors.processor import preprocess_dataset, prepare_align
|
19 |
+
from preprocessors.metadata import cal_metadata
|
20 |
+
from processors import acoustic_extractor, content_extractor, data_augment
|
21 |
+
|
22 |
+
|
23 |
+
def extract_acoustic_features(dataset, output_path, cfg, n_workers=1):
    """Load a dataset's split metadata and extract its acoustic features.

    The utterance lists are read from ``<output_path>/<dataset>/<split>.json``
    and passed to the serial acoustic feature extractor (the parallel variant
    is currently disabled).

    Args:
        dataset (str): name of dataset, e.g. opencpop
        output_path (str): directory that stores train, test and feature files of datasets
        cfg (dict): dictionary that stores configurations
        n_workers (int, optional): num of processes to extract features in parallel.
            Not used by the serial extractor. Defaults to 1.
    """
    dataset_output = os.path.join(output_path, dataset)

    # Datasets whose name contains "eval" only provide a test split.
    splits = ["test"] if "eval" in dataset else ["train", "test"]

    metadata = []
    for split in splits:
        split_file = os.path.join(dataset_output, "{}.json".format(split))
        with open(split_file, "r") as f:
            metadata += json.load(f)

    acoustic_extractor.extract_utt_acoustic_features_serial(
        metadata, dataset_output, cfg
    )
|
46 |
+
|
47 |
+
|
48 |
+
def extract_content_features(dataset, output_path, cfg, num_workers=1):
    """Load a dataset's split metadata and extract its content features.

    Args:
        dataset (str): name of dataset, e.g. opencpop
        output_path (str): directory that stores train, test and feature files of datasets
        cfg (dict): dictionary that stores configurations
        num_workers (int, optional): forwarded to the dataloader-based
            content extractor. Defaults to 1.
    """
    dataset_output = os.path.join(output_path, dataset)

    # Datasets whose name contains "eval" only provide a test split.
    splits = ["test"] if "eval" in dataset else ["train", "test"]

    metadata = []
    for split in splits:
        split_file = os.path.join(dataset_output, "{}.json".format(split))
        with open(split_file, "r") as f:
            metadata += json.load(f)

    content_extractor.extract_utt_content_features_dataloader(
        cfg, metadata, num_workers
    )
|
67 |
+
|
68 |
+
|
69 |
+
def preprocess(cfg, args):
    """Preprocess raw data of single or multiple datasets (in cfg.dataset)

    Steps, in order: optionally prepare alignments and split every dataset
    into train/test sets, optionally create augmented datasets, dump the
    dataset metadata, extract acoustic features and their statistics, copy
    acoustic features to the augmented datasets, and finally extract content
    features.

    Args:
        cfg (dict): dictionary that stores configurations
        args (ArgumentParser): specify the configuration file and num_workers
    """
    # Dataset-name markers of augmented datasets whose acoustic features are
    # copied from the original dataset instead of being extracted again.
    copied_augment_tags = ("pitch_shift", "formant_shift", "equalizer")

    # Specify the output root path to save the processed data
    output_path = cfg.preprocess.processed_dir
    os.makedirs(output_path, exist_ok=True)

    ## Split train and test sets
    for dataset in cfg.dataset:
        print("Preprocess {}...".format(dataset))

        if args.prepare_alignment:
            ## Prepare alignment with MFA
            print("Prepare alignment {}...".format(dataset))
            prepare_align(
                dataset, cfg.dataset_path[dataset], cfg.preprocess, output_path
            )
        preprocess_dataset(
            dataset,
            cfg.dataset_path[dataset],
            output_path,
            cfg.preprocess,
            cfg.task_type,
            is_custom_dataset=dataset in cfg.use_custom_dataset,
        )

    # Data augmentation: create new wav files with pitch shift, formant shift, equalizer, time stretch
    try:
        augment_targets = cfg.preprocess.data_augment
        if not isinstance(augment_targets, list):
            raise TypeError("Please provide a list of datasets need to be augmented.")
        if len(augment_targets) > 0:
            new_datasets_list = []
            for dataset in augment_targets:
                new_datasets = data_augment.augment_dataset(cfg, dataset)
                new_datasets_list.extend(new_datasets)
            cfg.dataset.extend(new_datasets_list)
            print("Augmentation datasets: ", cfg.dataset)
    except Exception as err:
        # Augmentation stays best-effort, but the reason is reported instead
        # of being silently dropped, and KeyboardInterrupt/SystemExit are no
        # longer swallowed (the original used a bare ``except:``).
        print("No Data Augmentation.")
        print("Reason: {}: {}".format(type(err).__name__, err))

    # Dump metadata of datasets (singers, train/test durations, etc.)
    cal_metadata(cfg)

    ## Prepare the acoustic features
    for dataset in cfg.dataset:
        # Skip augmented datasets which do not need to extract acoustic features
        # We will copy acoustic features from the original dataset later
        if any(tag in dataset for tag in copied_augment_tags):
            continue
        print(
            "Extracting acoustic features for {} using {} workers ...".format(
                dataset, args.num_workers
            )
        )
        extract_acoustic_features(dataset, output_path, cfg, args.num_workers)
        # Calculate the statistics of acoustic features
        if cfg.preprocess.mel_min_max_norm:
            acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)

        if cfg.preprocess.extract_pitch:
            acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)
        if cfg.preprocess.extract_energy:
            acoustic_extractor.cal_energy_statistics(dataset, output_path, cfg)

        if cfg.preprocess.align_mel_duration:
            acoustic_extractor.align_duration_mel(dataset, output_path, cfg)

    # Copy acoustic features for augmented datasets by creating soft-links
    for dataset in cfg.dataset:
        # First matching marker wins, in the same order as the original
        # if/elif chain.
        matched_tag = next(
            (tag for tag in copied_augment_tags if tag in dataset), None
        )
        if matched_tag is None:
            continue
        src_dataset = dataset.replace("_" + matched_tag, "")
        src_dataset_dir = os.path.join(output_path, src_dataset)

        dataset_dir = os.path.join(output_path, dataset)
        metadata = []
        for split in ["train", "test"] if "eval" not in dataset else ["test"]:
            metadata_file_path = os.path.join(src_dataset_dir, "{}.json".format(split))
            with open(metadata_file_path, "r") as f:
                metadata.extend(json.load(f))
        print("Copying acoustic features for {}...".format(dataset))
        acoustic_extractor.copy_acoustic_features(
            metadata, dataset_dir, src_dataset_dir, cfg
        )
        if cfg.preprocess.mel_min_max_norm:
            acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)

        if cfg.preprocess.extract_pitch:
            acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)

    # Prepare the content features
    for dataset in cfg.dataset:
        print("Extracting content features for {}...".format(dataset))
        extract_content_features(dataset, output_path, cfg, args.num_workers)
|
178 |
+
|
179 |
+
|
180 |
+
def main():
    """Parse the command line, load the configuration and run preprocessing.

    Command-line options:
        --config: path of the JSON configuration file (default "config.json").
        --num_workers: number of workers (default: number of CPUs).
        --prepare_alignment: boolean flag value, e.g. "True"/"False"
            (default False).
    """

    def _str2bool(value):
        """Convert a command-line string to a bool.

        ``type=bool`` would treat every non-empty string -- including
        "False" -- as True, so the text is parsed explicitly instead.
        """
        text = str(value).strip().lower()
        if text in ("true", "t", "yes", "y", "1", "on"):
            return True
        if text in ("false", "f", "no", "n", "0", "off", ""):
            return False
        raise argparse.ArgumentTypeError(
            "Expected a boolean value, got {!r}".format(value)
        )

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config", default="config.json", help="json files for configurations."
    )
    parser.add_argument("--num_workers", type=int, default=int(cpu_count()))
    parser.add_argument("--prepare_alignment", type=_str2bool, default=False)

    args = parser.parse_args()
    cfg = load_config(args.config)

    preprocess(cfg, args)
|
192 |
+
|
193 |
+
|
194 |
+
if __name__ == "__main__":
|
195 |
+
main()
|
bins/tta/train_tta.py
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
import argparse
|
7 |
+
import os
|
8 |
+
import torch
|
9 |
+
|
10 |
+
from models.tta.autoencoder.autoencoder_trainer import AutoencoderKLTrainer
|
11 |
+
from models.tta.ldm.audioldm_trainer import AudioLDMTrainer
|
12 |
+
from utils.util import load_config
|
13 |
+
|
14 |
+
|
15 |
+
def build_trainer(args, cfg):
    """Create the text-to-audio trainer selected by ``cfg.model_type``.

    Args:
        args: parsed command-line arguments, forwarded to the trainer.
        cfg: loaded configuration; ``cfg.model_type`` picks the trainer class.

    Returns:
        The trainer instance constructed with ``(args, cfg)``.

    Raises:
        KeyError: if ``cfg.model_type`` is not one of the supported names.
    """
    trainer_registry = {
        "AutoencoderKL": AutoencoderKLTrainer,
        "AudioLDM": AudioLDMTrainer,
    }
    return trainer_registry[cfg.model_type](args, cfg)
|
24 |
+
|
25 |
+
|
26 |
+
def main():
    """Parse the command line, load the configuration and train a TTA model.

    The experiment name is stored in ``cfg.exp_name`` and the log directory
    ``<cfg.log_dir>/<exp_name>`` is created. When DDP is disabled,
    ``args.local_rank`` is overwritten with the torch device to use. The
    trainer is then built, optionally restored (``--resume``) and trained.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config",
        default="config.json",
        help="json files for configurations.",
        required=True,
    )
    parser.add_argument(
        "--num_workers", type=int, default=6, help="Number of dataloader workers."
    )
    parser.add_argument(
        "--exp_name",
        type=str,
        default="exp_name",
        help="A specific name to note the experiment",
        required=True,
    )
    parser.add_argument(
        "--resume",
        type=str,
        default=None,
        # action="store_true",
        help="The model name to restore",
    )
    parser.add_argument(
        "--log_level", default="info", help="logging level (info, debug, warning)"
    )
    parser.add_argument("--stdout_interval", default=5, type=int)
    parser.add_argument("--local_rank", default=-1, type=int)
    args = parser.parse_args()
    cfg = load_config(args.config)
    cfg.exp_name = args.exp_name

    # Model saving dir
    args.log_dir = os.path.join(cfg.log_dir, args.exp_name)
    os.makedirs(args.log_dir, exist_ok=True)

    if not cfg.train.ddp:
        # Fall back to the CPU when CUDA is unavailable, matching the device
        # selection of the companion inference script, instead of always
        # requesting a CUDA device.
        args.local_rank = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu"
        )

    # Build trainer
    trainer = build_trainer(args, cfg)

    # Restore models
    if args.resume:
        trainer.restore()
    trainer.train()
|
74 |
+
|
75 |
+
|
76 |
+
if __name__ == "__main__":
|
77 |
+
main()
|
bins/tts/inference.py
ADDED
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
import argparse
|
7 |
+
from argparse import ArgumentParser
|
8 |
+
import os
|
9 |
+
|
10 |
+
from models.tts.fastspeech2.fs2_inference import FastSpeech2Inference
|
11 |
+
from models.tts.vits.vits_inference import VitsInference
|
12 |
+
from models.tts.valle.valle_inference import VALLEInference
|
13 |
+
from models.tts.naturalspeech2.ns2_inference import NS2Inference
|
14 |
+
from models.tts.jets.jets_inference import JetsInference
|
15 |
+
from utils.util import load_config
|
16 |
+
import torch
|
17 |
+
|
18 |
+
|
19 |
+
def build_inference(args, cfg):
    """Create the TTS inference object selected by ``cfg.model_type``.

    Args:
        args: parsed command-line arguments, forwarded to the inference class.
        cfg: loaded configuration; ``cfg.model_type`` picks the class.

    Returns:
        The inference instance constructed with ``(args, cfg)``.

    Raises:
        KeyError: if ``cfg.model_type`` is not one of the supported names.
    """
    inference_registry = {
        "FastSpeech2": FastSpeech2Inference,
        "VITS": VitsInference,
        "VALLE": VALLEInference,
        "NaturalSpeech2": NS2Inference,
        "Jets": JetsInference,
    }
    return inference_registry[cfg.model_type](args, cfg)
|
31 |
+
|
32 |
+
|
33 |
+
def cuda_relevant(deterministic=False):
    """Apply the process-wide CUDA/cuDNN settings used by the scripts.

    Empties the CUDA cache, enables cuDNN and TF32, and switches between
    deterministic and benchmark mode.

    Args:
        deterministic (bool, optional): when True, request deterministic
            algorithms and disable cuDNN benchmarking; when False, do the
            opposite. Defaults to False.
    """
    cudnn = torch.backends.cudnn

    torch.cuda.empty_cache()

    # TF32 on Ampere and above
    torch.backends.cuda.matmul.allow_tf32 = True
    cudnn.enabled = True
    cudnn.allow_tf32 = True

    # Deterministic
    cudnn.deterministic = deterministic
    cudnn.benchmark = not deterministic
    torch.use_deterministic_algorithms(deterministic)
|
43 |
+
|
44 |
+
|
45 |
+
def build_parser():
    """Create the command-line parser shared by all TTS inference models.

    Only ``--config`` and ``--mode`` are required; every other option has a
    default. Model-specific options are added by the caller afterwards.

    Returns:
        argparse.ArgumentParser: parser with the common inference options.
    """
    # One (flag, keyword-arguments) entry per option, in registration order.
    option_table = [
        (
            "--config",
            dict(type=str, required=True, help="JSON/YAML file for configurations."),
        ),
        (
            "--dataset",
            dict(type=str, help="convert from the source data", default=None),
        ),
        (
            "--testing_set",
            dict(type=str, help="train, test, golden_test", default="test"),
        ),
        (
            "--test_list_file",
            dict(type=str, help="convert from the test list file", default=None),
        ),
        (
            "--speaker_name",
            dict(
                type=str,
                default=None,
                help="speaker name for multi-speaker synthesis, for single-sentence mode only",
            ),
        ),
        (
            "--text",
            dict(help="Text to be synthesized.", type=str, default=""),
        ),
        (
            "--vocoder_dir",
            dict(
                type=str,
                default=None,
                help="Vocoder checkpoint directory. Searching behavior is the same as "
                "the acoustics one.",
            ),
        ),
        (
            "--acoustics_dir",
            dict(
                type=str,
                default=None,
                help="Acoustic model checkpoint directory. If a directory is given, "
                "search for the latest checkpoint dir in the directory. If a specific "
                "checkpoint dir is given, directly load the checkpoint.",
            ),
        ),
        (
            "--checkpoint_path",
            dict(
                type=str,
                default=None,
                help="Acoustic model checkpoint directory. If a directory is given, "
                "search for the latest checkpoint dir in the directory. If a specific "
                "checkpoint dir is given, directly load the checkpoint.",
            ),
        ),
        (
            "--mode",
            dict(
                type=str,
                choices=["batch", "single"],
                required=True,
                help="Synthesize a whole dataset or a single sentence",
            ),
        ),
        (
            "--log_level",
            dict(type=str, default="warning", help="Logging level. Default: warning"),
        ),
        (
            "--pitch_control",
            dict(
                type=float,
                default=1.0,
                help="control the pitch of the whole utterance, larger value for higher pitch",
            ),
        ),
        (
            "--energy_control",
            dict(
                type=float,
                default=1.0,
                help="control the energy of the whole utterance, larger value for larger volume",
            ),
        ),
        (
            "--duration_control",
            dict(
                type=float,
                default=1.0,
                help="control the speed of the whole utterance, larger value for slower speaking rate",
            ),
        ),
        (
            "--output_dir",
            dict(
                type=str,
                default=None,
                help="Output dir for saving generated results",
            ),
        ),
    ]

    parser = argparse.ArgumentParser()
    for flag, options in option_table:
        parser.add_argument(flag, **options)
    return parser
|
145 |
+
|
146 |
+
|
147 |
+
def main():
    """Entry point: parse options, load the config and run TTS inference."""
    # Shared options first, then the options contributed by individual models.
    parser = build_parser()
    for inference_cls in (VALLEInference, NS2Inference):
        inference_cls.add_arguments(parser)
    args = parser.parse_args()
    print(args)

    # Configuration file named on the command line
    cfg = load_config(args.config)

    # Global CUDA/cuDNN switches (non-deterministic defaults)
    cuda_relevant()

    # Build the model-specific inference object and run it
    build_inference(args, cfg).inference()
|
166 |
+
|
167 |
+
|
168 |
+
if __name__ == "__main__":
|
169 |
+
main()
|
bins/tts/preprocess.py
ADDED
@@ -0,0 +1,244 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
import faulthandler
|
7 |
+
|
8 |
+
faulthandler.enable()
|
9 |
+
|
10 |
+
import os
|
11 |
+
import argparse
|
12 |
+
import json
|
13 |
+
import pyworld as pw
|
14 |
+
from multiprocessing import cpu_count
|
15 |
+
|
16 |
+
|
17 |
+
from utils.util import load_config
|
18 |
+
from preprocessors.processor import preprocess_dataset, prepare_align
|
19 |
+
from preprocessors.metadata import cal_metadata
|
20 |
+
from processors import (
|
21 |
+
acoustic_extractor,
|
22 |
+
content_extractor,
|
23 |
+
data_augment,
|
24 |
+
phone_extractor,
|
25 |
+
)
|
26 |
+
|
27 |
+
|
28 |
+
def extract_acoustic_features(dataset, output_path, cfg, dataset_types, n_workers=1):
    """Extract acoustic features of utterances in the dataset

    Args:
        dataset (str): name of dataset, e.g. opencpop
        output_path (str): directory that stores train, test and feature files of datasets
        cfg (dict): dictionary that stores configurations
        dataset_types (list of str): names of the metadata splits to load; each
            one is read from ``<output_path>/<dataset>/<name>.json``
        n_workers (int, optional): num of processes to extract features in parallel. Defaults to 1.
            Currently unused because the serial extractor is called.
    """
    # Computed once, outside the loop, so that it is defined even when
    # dataset_types is empty (previously that case raised NameError below).
    dataset_output = os.path.join(output_path, dataset)

    metadata = []
    for dataset_type in dataset_types:
        dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
        with open(dataset_file, "r") as f:
            metadata.extend(json.load(f))

    # acoustic_extractor.extract_utt_acoustic_features_parallel(
    #     metadata, dataset_output, cfg, n_workers=n_workers
    # )
    acoustic_extractor.extract_utt_acoustic_features_serial(
        metadata, dataset_output, cfg
    )
|
51 |
+
|
52 |
+
|
53 |
+
def extract_content_features(dataset, output_path, cfg, dataset_types, num_workers=1):
    """Load the metadata of the requested splits and extract content features.

    Args:
        dataset (str): name of dataset, e.g. opencpop
        output_path (str): directory that stores train, test and feature files of datasets
        cfg (dict): dictionary that stores configurations
        dataset_types (list of str): split names; each is read from
            ``<output_path>/<dataset>/<name>.json``
        num_workers (int, optional): forwarded to the content extractor. Defaults to 1.
    """
    utterances = []
    for split_name in dataset_types:
        split_file = os.path.join(output_path, dataset, "{}.json".format(split_name))
        with open(split_file, "r") as fp:
            utterances.extend(json.load(fp))

    content_extractor.extract_utt_content_features_dataloader(
        cfg, utterances, num_workers
    )
|
73 |
+
|
74 |
+
|
75 |
+
def extract_phonme_sequences(dataset, output_path, cfg, dataset_types):
    """Load the metadata of the requested splits and extract phoneme sequences.

    Args:
        dataset (str): name of dataset, e.g. opencpop
        output_path (str): directory that stores train, test and feature files of datasets
        cfg (dict): dictionary that stores configurations
        dataset_types (list of str): split names; each is read from
            ``<output_path>/<dataset>/<name>.json``
    """
    utterances = []
    for split_name in dataset_types:
        split_file = os.path.join(output_path, dataset, "{}.json".format(split_name))
        with open(split_file, "r") as fp:
            utterances.extend(json.load(fp))

    phone_extractor.extract_utt_phone_sequence(dataset, cfg, utterances)
|
92 |
+
|
93 |
+
|
94 |
+
def preprocess(cfg, args):
    """Preprocess raw data of single or multiple datasets (in cfg.dataset)

    Steps, in order: optional alignment preparation and dataset splitting,
    optional data augmentation, metadata dump, acoustic feature extraction
    and statistics, copying of acoustic features for augmented datasets,
    content feature extraction and (optionally) phoneme sequence extraction.

    Args:
        cfg (dict): dictionary that stores configurations
        args (ArgumentParser): specify the configuration file and num_workers;
            ``args.prepare_alignment`` and ``args.num_workers`` are read here
    """
    # Specify the output root path to save the processed data
    output_path = cfg.preprocess.processed_dir
    os.makedirs(output_path, exist_ok=True)

    # Split train and test sets
    for dataset in cfg.dataset:
        print("Preprocess {}...".format(dataset))

        if args.prepare_alignment:
            # Prepare alignment with MFA
            print("Prepare alignment {}...".format(dataset))
            prepare_align(
                dataset, cfg.dataset_path[dataset], cfg.preprocess, output_path
            )

        preprocess_dataset(
            dataset,
            cfg.dataset_path[dataset],
            output_path,
            cfg.preprocess,
            cfg.task_type,
            is_custom_dataset=dataset in cfg.use_custom_dataset,
        )

    # Data augmentation: create new wav files with pitch shift, formant shift, equalizer, time stretch
    try:
        assert isinstance(
            cfg.preprocess.data_augment, list
        ), "Please provide a list of datasets need to be augmented."
        if len(cfg.preprocess.data_augment) > 0:
            new_datasets_list = []
            for dataset in cfg.preprocess.data_augment:
                new_datasets = data_augment.augment_dataset(cfg, dataset)
                new_datasets_list.extend(new_datasets)
            cfg.dataset.extend(new_datasets_list)
            print("Augmentation datasets: ", cfg.dataset)
    except Exception:
        # Best effort: a missing/invalid setting or a failing augmentation step
        # is reported and skipped. KeyboardInterrupt/SystemExit are no longer
        # swallowed, so the run can still be interrupted here.
        print("No Data Augmentation.")

    # json files
    dataset_types = list()
    dataset_types.append((cfg.preprocess.train_file).split(".")[0])
    dataset_types.append((cfg.preprocess.valid_file).split(".")[0])
    if "test" not in dataset_types:
        dataset_types.append("test")
    # NOTE(review): `dataset` here is whatever the loops above bound last, so
    # only that one name is checked for "eval" -- confirm this is intended.
    if "eval" in dataset:
        dataset_types = ["test"]

    # Dump metadata of datasets (singers, train/test durations, etc.)
    cal_metadata(cfg, dataset_types)

    # Prepare the acoustic features
    for dataset in cfg.dataset:
        # Skip augmented datasets which do not need to extract acoustic features
        # We will copy acoustic features from the original dataset later
        if (
            "pitch_shift" in dataset
            or "formant_shift" in dataset
            or "equalizer" in dataset
        ):
            continue
        print(
            "Extracting acoustic features for {} using {} workers ...".format(
                dataset, args.num_workers
            )
        )
        extract_acoustic_features(
            dataset, output_path, cfg, dataset_types, args.num_workers
        )
        # Calculate the statistics of acoustic features
        if cfg.preprocess.mel_min_max_norm:
            acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)

        if cfg.preprocess.extract_pitch:
            acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)

        if cfg.preprocess.extract_energy:
            acoustic_extractor.cal_energy_statistics(dataset, output_path, cfg)

        if cfg.preprocess.pitch_norm:
            acoustic_extractor.normalize(dataset, cfg.preprocess.pitch_dir, cfg)

        if cfg.preprocess.energy_norm:
            acoustic_extractor.normalize(dataset, cfg.preprocess.energy_dir, cfg)

    # Copy acoustic features for augmented datasets by creating soft-links
    for dataset in cfg.dataset:
        if "pitch_shift" in dataset:
            src_dataset = dataset.replace("_pitch_shift", "")
            src_dataset_dir = os.path.join(output_path, src_dataset)
        elif "formant_shift" in dataset:
            src_dataset = dataset.replace("_formant_shift", "")
            src_dataset_dir = os.path.join(output_path, src_dataset)
        elif "equalizer" in dataset:
            src_dataset = dataset.replace("_equalizer", "")
            src_dataset_dir = os.path.join(output_path, src_dataset)
        else:
            continue
        dataset_dir = os.path.join(output_path, dataset)
        metadata = []
        for split in ["train", "test"] if not "eval" in dataset else ["test"]:
            metadata_file_path = os.path.join(src_dataset_dir, "{}.json".format(split))
            with open(metadata_file_path, "r") as f:
                metadata.extend(json.load(f))
        print("Copying acoustic features for {}...".format(dataset))
        acoustic_extractor.copy_acoustic_features(
            metadata, dataset_dir, src_dataset_dir, cfg
        )
        if cfg.preprocess.mel_min_max_norm:
            acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)

        if cfg.preprocess.extract_pitch:
            acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)

    # Prepare the content features
    for dataset in cfg.dataset:
        print("Extracting content features for {}...".format(dataset))
        extract_content_features(
            dataset, output_path, cfg, dataset_types, args.num_workers
        )

    # Prepare the phoneme sequences
    if cfg.preprocess.extract_phone:
        for dataset in cfg.dataset:
            print("Extracting phoneme sequence for {}...".format(dataset))
            extract_phonme_sequences(dataset, output_path, cfg, dataset_types)
|
227 |
+
|
228 |
+
|
229 |
+
def main():
    """Parse command-line options, load the config and run ``preprocess``.

    Options:
        --config: configuration file path (default "config.json").
        --num_workers: integer worker count (default: ``cpu_count()``).
        --prepare_alignment: boolean switch given as text, e.g. ``True`` or
            ``False`` (default False).
    """

    def _str2bool(value):
        """Convert a command-line token to a bool.

        ``type=bool`` would turn every non-empty string, including "False",
        into True; this converter interprets the text instead.
        """
        token = str(value).strip().lower()
        if token in ("true", "t", "yes", "y", "on", "1"):
            return True
        if token in ("false", "f", "no", "n", "off", "0", ""):
            return False
        raise argparse.ArgumentTypeError(
            "expected a boolean value, got {!r}".format(value)
        )

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config", default="config.json", help="json files for configurations."
    )
    parser.add_argument("--num_workers", type=int, default=int(cpu_count()))
    parser.add_argument("--prepare_alignment", type=_str2bool, default=False)

    args = parser.parse_args()
    cfg = load_config(args.config)

    preprocess(cfg, args)
|
241 |
+
|
242 |
+
|
243 |
+
if __name__ == "__main__":
|
244 |
+
main()
|
bins/tts/train.py
ADDED
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
import argparse
|
7 |
+
|
8 |
+
import torch
|
9 |
+
|
10 |
+
from models.tts.fastspeech2.fs2_trainer import FastSpeech2Trainer
|
11 |
+
from models.tts.vits.vits_trainer import VITSTrainer
|
12 |
+
from models.tts.valle.valle_trainer import VALLETrainer
|
13 |
+
from models.tts.naturalspeech2.ns2_trainer import NS2Trainer
|
14 |
+
from models.tts.valle_v2.valle_ar_trainer import ValleARTrainer as VALLE_V2_AR
|
15 |
+
from models.tts.valle_v2.valle_nar_trainer import ValleNARTrainer as VALLE_V2_NAR
|
16 |
+
from models.tts.jets.jets_trainer import JetsTrainer
|
17 |
+
|
18 |
+
from utils.util import load_config
|
19 |
+
|
20 |
+
|
21 |
+
def build_trainer(args, cfg):
    """Instantiate the TTS trainer class selected by ``cfg.model_type``.

    Args:
        args: parsed command-line arguments, forwarded to the trainer class.
        cfg: loaded configuration; ``cfg.model_type`` must be one of
            "FastSpeech2", "VITS", "VALLE", "NaturalSpeech2", "VALLE_V2_AR",
            "VALLE_V2_NAR" or "Jets".

    Returns:
        The object produced by calling the selected class with ``(args, cfg)``.

    Raises:
        KeyError: if ``cfg.model_type`` is not one of the supported names.
    """
    registry = dict(
        FastSpeech2=FastSpeech2Trainer,
        VITS=VITSTrainer,
        VALLE=VALLETrainer,
        NaturalSpeech2=NS2Trainer,
        VALLE_V2_AR=VALLE_V2_AR,
        VALLE_V2_NAR=VALLE_V2_NAR,
        Jets=JetsTrainer,
    )
    return registry[cfg.model_type](args, cfg)
|
35 |
+
|
36 |
+
|
37 |
+
def cuda_relevant(deterministic=False):
    """Configure the global PyTorch CUDA/cuDNN switches used for training.

    Args:
        deterministic (bool, optional): when True, cuDNN determinism and
            ``torch.use_deterministic_algorithms`` are switched on and cuDNN
            benchmarking is switched off; when False (default) the opposite
            settings are applied.
    """
    torch.cuda.empty_cache()
    # TF32 on Ampere and above
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.allow_tf32 = True
    # Deterministic
    # (The earlier unconditional `benchmark = False` was dropped: it was always
    # overwritten by the assignment below and contradicted it by default.)
    torch.backends.cudnn.deterministic = deterministic
    torch.backends.cudnn.benchmark = not deterministic
    torch.use_deterministic_algorithms(deterministic)
|
48 |
+
|
49 |
+
|
50 |
+
def main():
|
51 |
+
parser = argparse.ArgumentParser()
|
52 |
+
parser.add_argument(
|
53 |
+
"--config",
|
54 |
+
default="config.json",
|
55 |
+
help="json files for configurations.",
|
56 |
+
required=True,
|
57 |
+
)
|
58 |
+
parser.add_argument(
|
59 |
+
"--seed",
|
60 |
+
type=int,
|
61 |
+
default=1234,
|
62 |
+
help="random seed",
|
63 |
+
required=False,
|
64 |
+
)
|
65 |
+
parser.add_argument(
|
66 |
+
"--exp_name",
|
67 |
+
type=str,
|
68 |
+
default="exp_name",
|
69 |
+
help="A specific name to note the experiment",
|
70 |
+
required=True,
|
71 |
+
)
|
72 |
+
parser.add_argument(
|
73 |
+
"--resume", action="store_true", help="The model name to restore"
|
74 |
+
)
|
75 |
+
parser.add_argument(
|
76 |
+
"--test", action="store_true", default=False, help="Test the model"
|
77 |
+
)
|
78 |
+
parser.add_argument(
|
79 |
+
"--log_level", default="warning", help="logging level (debug, info, warning)"
|
80 |
+
)
|
81 |
+
parser.add_argument(
|
82 |
+
"--resume_type",
|
83 |
+
type=str,
|
84 |
+
default="resume",
|
85 |
+
help="Resume training or finetuning.",
|
86 |
+
)
|
87 |
+
parser.add_argument(
|
88 |
+
"--checkpoint_path",
|
89 |
+
type=str,
|
90 |
+
default=None,
|
91 |
+
help="Checkpoint for resume training or finetuning.",
|
92 |
+
)
|
93 |
+
parser.add_argument(
|
94 |
+
"--resume_from_ckpt_path",
|
95 |
+
type=str,
|
96 |
+
default="",
|
97 |
+
help="Checkpoint for resume training or finetuning.",
|
98 |
+
)
|
99 |
+
# VALLETrainer.add_arguments(parser)
|
100 |
+
args = parser.parse_args()
|
101 |
+
cfg = load_config(args.config)
|
102 |
+
|
103 |
+
# Data Augmentation
|
104 |
+
if hasattr(cfg, "preprocess"):
|
105 |
+
if hasattr(cfg.preprocess, "data_augment"):
|
106 |
+
if (
|
107 |
+
type(cfg.preprocess.data_augment) == list
|
108 |
+
and len(cfg.preprocess.data_augment) > 0
|
109 |
+
):
|
110 |
+
new_datasets_list = []
|
111 |
+
for dataset in cfg.preprocess.data_augment:
|
112 |
+
new_datasets = [
|
113 |
+
(
|
114 |
+
f"{dataset}_pitch_shift"
|
115 |
+
if cfg.preprocess.use_pitch_shift
|
116 |
+
else None
|
117 |
+
),
|
118 |
+
(
|
119 |
+
f"{dataset}_formant_shift"
|
120 |
+
if cfg.preprocess.use_formant_shift
|
121 |
+
else None
|
122 |
+
),
|
123 |
+
(
|
124 |
+
f"{dataset}_equalizer"
|
125 |
+
if cfg.preprocess.use_equalizer
|
126 |
+
else None
|
127 |
+
),
|
128 |
+
(
|
129 |
+
f"{dataset}_time_stretch"
|
130 |
+
if cfg.preprocess.use_time_stretch
|
131 |
+
else None
|
132 |
+
),
|
133 |
+
]
|
134 |
+
new_datasets_list.extend(filter(None, new_datasets))
|
135 |
+
cfg.dataset.extend(new_datasets_list)
|
136 |
+
|
137 |
+
print("experiment name: ", args.exp_name)
|
138 |
+
# # CUDA settings
|
139 |
+
cuda_relevant()
|
140 |
+
|
141 |
+
# Build trainer
|
142 |
+
print(f"Building {cfg.model_type} trainer")
|
143 |
+
trainer = build_trainer(args, cfg)
|
144 |
+
print(f"Start training {cfg.model_type} model")
|
145 |
+
if args.test:
|
146 |
+
trainer.test_loop()
|
147 |
+
else:
|
148 |
+
trainer.train_loop()
|
149 |
+
|
150 |
+
|
151 |
+
if __name__ == "__main__":
|
152 |
+
main()
|
bins/vocoder/inference.py
ADDED
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
import argparse
|
7 |
+
import os
|
8 |
+
|
9 |
+
import torch
|
10 |
+
|
11 |
+
from models.vocoders.vocoder_inference import VocoderInference
|
12 |
+
from utils.util import load_config
|
13 |
+
|
14 |
+
|
15 |
+
def build_inference(args, cfg, infer_type="infer_from_dataset"):
    """Instantiate the vocoder inference class selected by ``cfg.model_type``.

    Args:
        args: parsed command-line arguments, forwarded to the inference class.
        cfg: loaded configuration; ``cfg.model_type`` must be "GANVocoder"
            or "DiffusionVocoder" (both map to ``VocoderInference``).
        infer_type (str, optional): forwarded as the third constructor
            argument. Defaults to "infer_from_dataset".

    Returns:
        The object produced by calling the selected class with
        ``(args, cfg, infer_type)``.

    Raises:
        KeyError: if ``cfg.model_type`` is not one of the supported names.
    """
    registry = dict(
        GANVocoder=VocoderInference,
        DiffusionVocoder=VocoderInference,
    )
    chosen = registry[cfg.model_type]
    return chosen(args, cfg, infer_type)
|
23 |
+
|
24 |
+
|
25 |
+
def cuda_relevant(deterministic=False):
    """Configure the global PyTorch CUDA/cuDNN switches for vocoder inference.

    Args:
        deterministic (bool, optional): when True, cuDNN determinism and
            ``torch.use_deterministic_algorithms`` are switched on and cuDNN
            benchmarking is switched off; when False (default) the opposite
            settings are applied.
    """
    torch.cuda.empty_cache()

    # TF32 on Ampere and above (matmul side)
    torch.backends.cuda.matmul.allow_tf32 = True

    # cuDNN flags: enable cuDNN and TF32, then apply the determinism choice
    cudnn_flags = (
        ("enabled", True),
        ("allow_tf32", True),
        ("deterministic", deterministic),
        ("benchmark", not deterministic),
    )
    for flag_name, flag_value in cudnn_flags:
        setattr(torch.backends.cudnn, flag_name, flag_value)

    torch.use_deterministic_algorithms(deterministic)
|
35 |
+
|
36 |
+
|
37 |
+
def build_parser():
    r"""Build argument parser for inference.py.
    Anything else should be put in an extra config YAML file.

    ``--config`` and ``--vocoder_dir`` are required; every other option is
    optional.

    Returns:
        argparse.ArgumentParser: parser with the vocoder inference options.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config",
        type=str,
        required=True,
        help="JSON/YAML file for configurations.",
    )
    parser.add_argument(
        "--infer_mode",
        type=str,
        # Previously written as `required=None`; `required` is a boolean flag,
        # and what was meant (and what effectively happened) is an optional
        # argument whose value is None when omitted.
        default=None,
    )
    parser.add_argument(
        "--infer_datasets",
        nargs="+",
        default=None,
    )
    parser.add_argument(
        "--feature_folder",
        type=str,
        default=None,
    )
    parser.add_argument(
        "--audio_folder",
        type=str,
        default=None,
    )
    parser.add_argument(
        "--vocoder_dir",
        type=str,
        required=True,
        help="Vocoder checkpoint directory. Searching behavior is the same as "
        "the acoustics one.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="result",
        help="Output directory. Default: ./result",
    )
    parser.add_argument(
        "--log_level",
        type=str,
        default="warning",
        help="Logging level. Default: warning",
    )
    parser.add_argument(
        "--keep_cache",
        action="store_true",
        default=False,
        help="Keep cache files. Only applicable to inference from files.",
    )
    return parser
|
95 |
+
|
96 |
+
|
97 |
+
def main():
    """Entry point: parse options, load the config and run vocoder inference."""
    # Command-line options
    options = build_parser().parse_args()

    # Configuration file named on the command line
    config = load_config(options.config)

    # Global CUDA/cuDNN switches (non-deterministic defaults)
    cuda_relevant()

    # Build the inference object for the requested mode, then run it
    inferencer = build_inference(options, config, options.infer_mode)
    inferencer.inference()
|
112 |
+
|
113 |
+
|
114 |
+
if __name__ == "__main__":
|
115 |
+
main()
|
bins/vocoder/preprocess.py
ADDED
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
import faulthandler
|
7 |
+
|
8 |
+
faulthandler.enable()
|
9 |
+
|
10 |
+
import os
|
11 |
+
import argparse
|
12 |
+
import json
|
13 |
+
import pyworld as pw
|
14 |
+
from multiprocessing import cpu_count
|
15 |
+
|
16 |
+
|
17 |
+
from utils.util import load_config
|
18 |
+
from preprocessors.processor import preprocess_dataset, prepare_align
|
19 |
+
from preprocessors.metadata import cal_metadata
|
20 |
+
from processors import acoustic_extractor, content_extractor, data_augment
|
21 |
+
|
22 |
+
|
23 |
+
def extract_acoustic_features(dataset, output_path, cfg, n_workers=1):
    """Load a dataset's split metadata and extract its acoustic features.

    The "train" and "test" splits are read, unless the dataset name contains
    "eval", in which case only "test" is read.

    Args:
        dataset (str): name of dataset, e.g. opencpop
        output_path (str): directory that stores train, test and feature files of datasets
        cfg (dict): dictionary that stores configurations
        n_workers (int, optional): accepted for interface compatibility; it is
            not used here because the serial extractor is called. Defaults to 1.
    """
    split_names = ["test"] if "eval" in dataset else ["train", "test"]
    feature_root = os.path.join(output_path, dataset)

    utterances = []
    for split_name in split_names:
        split_file = os.path.join(feature_root, "{}.json".format(split_name))
        with open(split_file, "r") as fp:
            utterances.extend(json.load(fp))

    acoustic_extractor.extract_utt_acoustic_features_serial(
        utterances, feature_root, cfg
    )
|
43 |
+
|
44 |
+
|
45 |
+
def preprocess(cfg, args):
    """Preprocess raw data of single or multiple datasets (in cfg.dataset)

    Steps, in order: dataset splitting, optional data augmentation, metadata
    dump, acoustic feature extraction (plus mel min/max statistics when
    enabled) and copying of acoustic features for augmented datasets.

    Args:
        cfg (dict): dictionary that stores configurations
        args (ArgumentParser): specify the configuration file and num_workers;
            ``args.num_workers`` is read here
    """
    # Specify the output root path to save the processed data
    output_path = cfg.preprocess.processed_dir
    os.makedirs(output_path, exist_ok=True)

    ## Split train and test sets
    for dataset in cfg.dataset:
        print("Preprocess {}...".format(dataset))

        preprocess_dataset(
            dataset,
            cfg.dataset_path[dataset],
            output_path,
            cfg.preprocess,
            cfg.task_type,
            is_custom_dataset=dataset in cfg.use_custom_dataset,
        )

    # Data augmentation: create new wav files with pitch shift, formant shift, equalizer, time stretch
    try:
        assert isinstance(
            cfg.preprocess.data_augment, list
        ), "Please provide a list of datasets need to be augmented."
        if len(cfg.preprocess.data_augment) > 0:
            new_datasets_list = []
            for dataset in cfg.preprocess.data_augment:
                new_datasets = data_augment.augment_dataset(cfg, dataset)
                new_datasets_list.extend(new_datasets)
            cfg.dataset.extend(new_datasets_list)
            print("Augmentation datasets: ", cfg.dataset)
    except Exception:
        # Best effort: a missing/invalid setting or a failing augmentation step
        # is reported and skipped. KeyboardInterrupt/SystemExit are no longer
        # swallowed, so the run can still be interrupted here.
        print("No Data Augmentation.")

    # Dump metadata of datasets (singers, train/test durations, etc.)
    cal_metadata(cfg)

    ## Prepare the acoustic features
    for dataset in cfg.dataset:
        # Skip augmented datasets which do not need to extract acoustic features
        # We will copy acoustic features from the original dataset later
        if (
            "pitch_shift" in dataset
            or "formant_shift" in dataset
            or "equalizer" in dataset
        ):
            continue
        print(
            "Extracting acoustic features for {} using {} workers ...".format(
                dataset, args.num_workers
            )
        )
        extract_acoustic_features(dataset, output_path, cfg, args.num_workers)
        # Calculate the statistics of acoustic features
        if cfg.preprocess.mel_min_max_norm:
            acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)

    # Copy acoustic features for augmented datasets by creating soft-links
    for dataset in cfg.dataset:
        if "pitch_shift" in dataset:
            src_dataset = dataset.replace("_pitch_shift", "")
            src_dataset_dir = os.path.join(output_path, src_dataset)
        elif "formant_shift" in dataset:
            src_dataset = dataset.replace("_formant_shift", "")
            src_dataset_dir = os.path.join(output_path, src_dataset)
        elif "equalizer" in dataset:
            src_dataset = dataset.replace("_equalizer", "")
            src_dataset_dir = os.path.join(output_path, src_dataset)
        else:
            continue
        dataset_dir = os.path.join(output_path, dataset)
        metadata = []
        for split in ["train", "test"] if not "eval" in dataset else ["test"]:
            metadata_file_path = os.path.join(src_dataset_dir, "{}.json".format(split))
            with open(metadata_file_path, "r") as f:
                metadata.extend(json.load(f))
        print("Copying acoustic features for {}...".format(dataset))
        acoustic_extractor.copy_acoustic_features(
            metadata, dataset_dir, src_dataset_dir, cfg
        )
        if cfg.preprocess.mel_min_max_norm:
            acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)

        if cfg.preprocess.extract_pitch:
            acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)
|
135 |
+
|
136 |
+
|
137 |
+
def main():
    """Entry point: parse options, load the config and run ``preprocess``.

    Options:
        --config: configuration file path (default "config.json").
        --num_workers: integer worker count (default: ``cpu_count()``).
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--config",
        help="json files for configurations.",
        default="config.json",
    )
    cli.add_argument(
        "--num_workers",
        default=int(cpu_count()),
        type=int,
    )
    options = cli.parse_args()

    preprocess(load_config(options.config), options)
|
148 |
+
|
149 |
+
|
150 |
+
if __name__ == "__main__":
|
151 |
+
main()
|
bins/vocoder/train.py
ADDED
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
import argparse
|
7 |
+
|
8 |
+
import torch
|
9 |
+
|
10 |
+
from models.vocoders.gan.gan_vocoder_trainer import GANVocoderTrainer
|
11 |
+
from models.vocoders.diffusion.diffusion_vocoder_trainer import DiffusionVocoderTrainer
|
12 |
+
|
13 |
+
from utils.util import load_config
|
14 |
+
|
15 |
+
|
16 |
+
def build_trainer(args, cfg):
    """Instantiate the vocoder trainer class selected by ``cfg.model_type``.

    Args:
        args: parsed command-line arguments, forwarded to the trainer class.
        cfg: loaded configuration; ``cfg.model_type`` must be "GANVocoder"
            or "DiffusionVocoder".

    Returns:
        The object produced by calling the selected class with ``(args, cfg)``.

    Raises:
        KeyError: if ``cfg.model_type`` is not one of the supported names.
    """
    registry = dict(
        GANVocoder=GANVocoderTrainer,
        DiffusionVocoder=DiffusionVocoderTrainer,
    )
    return registry[cfg.model_type](args, cfg)
|
25 |
+
|
26 |
+
|
27 |
+
def cuda_relevant(deterministic=False):
    """Apply the process-wide CUDA / cuDNN settings used for training.

    Args:
        deterministic: when True, request deterministic cuDNN kernels and
            deterministic algorithms and turn cuDNN benchmarking off; when
            False (default), do the opposite.
    """
    torch.cuda.empty_cache()

    cudnn = torch.backends.cudnn

    # TF32 on Ampere and above
    torch.backends.cuda.matmul.allow_tf32 = True
    cudnn.enabled = True
    cudnn.allow_tf32 = True

    # Deterministic: benchmark mode is always the inverse of the flag.
    cudnn.deterministic = deterministic
    cudnn.benchmark = not deterministic
    torch.use_deterministic_algorithms(deterministic)
|
37 |
+
|
38 |
+
|
39 |
+
def main():
    """Command-line entry point for vocoder training.

    Parses the CLI flags, loads the configuration, appends augmented dataset
    names to ``cfg.dataset`` when ``cfg.preprocess.data_augment`` is truthy,
    applies the CUDA settings, then builds the trainer and runs its training
    loop.
    """
    # Each entry is (flag, keyword arguments for ``add_argument``).
    # NOTE(review): argparse never falls back to ``default`` for an option
    # declared ``required=True``, so the defaults on --config and --exp_name
    # are effectively unused; they are kept to leave the CLI untouched.
    option_table = (
        (
            "--config",
            dict(
                default="config.json",
                help="json files for configurations.",
                required=True,
            ),
        ),
        (
            "--exp_name",
            dict(
                type=str,
                default="exp_name",
                help="A specific name to note the experiment",
                required=True,
            ),
        ),
        (
            "--resume_type",
            dict(
                type=str,
                help="resume for continue to train, finetune for finetuning",
            ),
        ),
        (
            "--checkpoint",
            dict(
                type=str,
                help="checkpoint to resume",
            ),
        ),
        (
            "--log_level",
            dict(
                default="warning",
                help="logging level (debug, info, warning)",
            ),
        ),
    )

    parser = argparse.ArgumentParser()
    for flag, options in option_table:
        parser.add_argument(flag, **options)
    args = parser.parse_args()
    cfg = load_config(args.config)

    # Data Augmentation
    if cfg.preprocess.data_augment:
        augmented_names = []
        for dataset in cfg.preprocess.data_augment:
            # The pitch_shift and formant_shift variants are disabled here:
            # f"{dataset}_pitch_shift",
            # f"{dataset}_formant_shift",
            augmented_names.append(f"{dataset}_equalizer")
            augmented_names.append(f"{dataset}_time_stretch")
        cfg.dataset.extend(augmented_names)

    # CUDA settings
    cuda_relevant()

    # Build trainer and start training
    build_trainer(args, cfg).train_loop()
|
90 |
+
|
91 |
+
|
92 |
+
if __name__ == "__main__":
|
93 |
+
main()
|
config/audioldm.json
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/base.json",
|
3 |
+
"model_type": "AudioLDM",
|
4 |
+
"task_type": "tta",
|
5 |
+
"dataset": [
|
6 |
+
"AudioCaps"
|
7 |
+
],
|
8 |
+
"preprocess": {
|
9 |
+
// feature used for model training
|
10 |
+
"use_spkid": false,
|
11 |
+
"use_uv": false,
|
12 |
+
"use_frame_pitch": false,
|
13 |
+
"use_phone_pitch": false,
|
14 |
+
"use_frame_energy": false,
|
15 |
+
"use_phone_energy": false,
|
16 |
+
"use_mel": false,
|
17 |
+
"use_audio": false,
|
18 |
+
"use_label": false,
|
19 |
+
"use_one_hot": false,
|
20 |
+
"cond_mask_prob": 0.1
|
21 |
+
},
|
22 |
+
// model
|
23 |
+
"model": {
|
24 |
+
"audioldm": {
|
25 |
+
"image_size": 32,
|
26 |
+
"in_channels": 4,
|
27 |
+
"out_channels": 4,
|
28 |
+
"model_channels": 256,
|
29 |
+
"attention_resolutions": [
|
30 |
+
4,
|
31 |
+
2,
|
32 |
+
1
|
33 |
+
],
|
34 |
+
"num_res_blocks": 2,
|
35 |
+
"channel_mult": [
|
36 |
+
1,
|
37 |
+
2,
|
38 |
+
4
|
39 |
+
],
|
40 |
+
"num_heads": 8,
|
41 |
+
"use_spatial_transformer": true,
|
42 |
+
"transformer_depth": 1,
|
43 |
+
"context_dim": 768,
|
44 |
+
"use_checkpoint": true,
|
45 |
+
"legacy": false
|
46 |
+
},
|
47 |
+
"autoencoderkl": {
|
48 |
+
"ch": 128,
|
49 |
+
"ch_mult": [
|
50 |
+
1,
|
51 |
+
1,
|
52 |
+
2,
|
53 |
+
2,
|
54 |
+
4
|
55 |
+
],
|
56 |
+
"num_res_blocks": 2,
|
57 |
+
"in_channels": 1,
|
58 |
+
"z_channels": 4,
|
59 |
+
"out_ch": 1,
|
60 |
+
"double_z": true
|
61 |
+
},
|
62 |
+
"noise_scheduler": {
|
63 |
+
"num_train_timesteps": 1000,
|
64 |
+
"beta_start": 0.00085,
|
65 |
+
"beta_end": 0.012,
|
66 |
+
"beta_schedule": "scaled_linear",
|
67 |
+
"clip_sample": false,
|
68 |
+
"steps_offset": 1,
|
69 |
+
"set_alpha_to_one": false,
|
70 |
+
"skip_prk_steps": true,
|
71 |
+
"prediction_type": "epsilon"
|
72 |
+
}
|
73 |
+
},
|
74 |
+
// train
|
75 |
+
"train": {
|
76 |
+
"lronPlateau": {
|
77 |
+
"factor": 0.9,
|
78 |
+
"patience": 100,
|
79 |
+
"min_lr": 4.0e-5,
|
80 |
+
"verbose": true
|
81 |
+
},
|
82 |
+
"adam": {
|
83 |
+
"lr": 5.0e-5,
|
84 |
+
"betas": [
|
85 |
+
0.9,
|
86 |
+
0.999
|
87 |
+
],
|
88 |
+
"weight_decay": 1.0e-2,
|
89 |
+
"eps": 1.0e-8
|
90 |
+
}
|
91 |
+
}
|
92 |
+
}
|
config/autoencoderkl.json
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/base.json",
|
3 |
+
"model_type": "AutoencoderKL",
|
4 |
+
"task_type": "tta",
|
5 |
+
"dataset": [
|
6 |
+
"AudioCaps"
|
7 |
+
],
|
8 |
+
"preprocess": {
|
9 |
+
// feature used for model training
|
10 |
+
"use_spkid": false,
|
11 |
+
"use_uv": false,
|
12 |
+
"use_frame_pitch": false,
|
13 |
+
"use_phone_pitch": false,
|
14 |
+
"use_frame_energy": false,
|
15 |
+
"use_phone_energy": false,
|
16 |
+
"use_mel": false,
|
17 |
+
"use_audio": false,
|
18 |
+
"use_label": false,
|
19 |
+
"use_one_hot": false
|
20 |
+
},
|
21 |
+
// model
|
22 |
+
"model": {
|
23 |
+
"autoencoderkl": {
|
24 |
+
"ch": 128,
|
25 |
+
"ch_mult": [
|
26 |
+
1,
|
27 |
+
1,
|
28 |
+
2,
|
29 |
+
2,
|
30 |
+
4
|
31 |
+
],
|
32 |
+
"num_res_blocks": 2,
|
33 |
+
"in_channels": 1,
|
34 |
+
"z_channels": 4,
|
35 |
+
"out_ch": 1,
|
36 |
+
"double_z": true
|
37 |
+
},
|
38 |
+
"loss": {
|
39 |
+
"kl_weight": 1e-8,
|
40 |
+
"disc_weight": 0.5,
|
41 |
+
"disc_factor": 1.0,
|
42 |
+
"logvar_init": 0.0,
|
43 |
+
"min_adapt_d_weight": 0.0,
|
44 |
+
"max_adapt_d_weight": 10.0,
|
45 |
+
"disc_start": 50001,
|
46 |
+
"disc_in_channels": 1,
|
47 |
+
"disc_num_layers": 3,
|
48 |
+
"use_actnorm": false
|
49 |
+
}
|
50 |
+
},
|
51 |
+
// train
|
52 |
+
"train": {
|
53 |
+
"lronPlateau": {
|
54 |
+
"factor": 0.9,
|
55 |
+
"patience": 100,
|
56 |
+
"min_lr": 4.0e-5,
|
57 |
+
"verbose": true
|
58 |
+
},
|
59 |
+
"adam": {
|
60 |
+
"lr": 4.0e-4,
|
61 |
+
"betas": [
|
62 |
+
0.9,
|
63 |
+
0.999
|
64 |
+
],
|
65 |
+
"weight_decay": 1.0e-2,
|
66 |
+
"eps": 1.0e-8
|
67 |
+
}
|
68 |
+
}
|
69 |
+
}
|
config/base.json
ADDED
@@ -0,0 +1,185 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"supported_model_type": [
|
3 |
+
"GANVocoder",
|
4 |
+
"Fastspeech2",
|
5 |
+
"DiffSVC",
|
6 |
+
"Transformer",
|
7 |
+
"EDM",
|
8 |
+
"CD"
|
9 |
+
],
|
10 |
+
"task_type": "",
|
11 |
+
"dataset": [],
|
12 |
+
"use_custom_dataset": [],
|
13 |
+
"preprocess": {
|
14 |
+
"phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon"
|
15 |
+
// trim audio silence
|
16 |
+
"data_augment": false,
|
17 |
+
"trim_silence": false,
|
18 |
+
"num_silent_frames": 8,
|
19 |
+
"trim_fft_size": 512, // fft size used in trimming
|
20 |
+
"trim_hop_size": 128, // hop size used in trimming
|
21 |
+
"trim_top_db": 30, // top db used in trimming sensitive to each dataset
|
22 |
+
// acoustic features
|
23 |
+
"extract_mel": false,
|
24 |
+
"mel_extract_mode": "",
|
25 |
+
"extract_linear_spec": false,
|
26 |
+
"extract_mcep": false,
|
27 |
+
"extract_pitch": false,
|
28 |
+
"extract_acoustic_token": false,
|
29 |
+
"pitch_remove_outlier": false,
|
30 |
+
"extract_uv": false,
|
31 |
+
"pitch_norm": false,
|
32 |
+
"extract_audio": false,
|
33 |
+
"extract_label": false,
|
34 |
+
"pitch_extractor": "parselmouth", // pyin, dio, pyworld, pyreaper, parselmouth, CWT (Continuous Wavelet Transform)
|
35 |
+
"extract_energy": false,
|
36 |
+
"energy_remove_outlier": false,
|
37 |
+
"energy_norm": false,
|
38 |
+
"energy_extract_mode": "from_mel",
|
39 |
+
"extract_duration": false,
|
40 |
+
"extract_amplitude_phase": false,
|
41 |
+
"mel_min_max_norm": false,
|
42 |
+
// linguistic features
|
43 |
+
"extract_phone": false,
|
44 |
+
"lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
|
45 |
+
// content features
|
46 |
+
"extract_whisper_feature": false,
|
47 |
+
"extract_contentvec_feature": false,
|
48 |
+
"extract_mert_feature": false,
|
49 |
+
"extract_wenet_feature": false,
|
50 |
+
// Settings for data preprocessing
|
51 |
+
"n_mel": 80,
|
52 |
+
"win_size": 480,
|
53 |
+
"hop_size": 120,
|
54 |
+
"sample_rate": 24000,
|
55 |
+
"n_fft": 1024,
|
56 |
+
"fmin": 0,
|
57 |
+
"fmax": 12000,
|
58 |
+
"min_level_db": -115,
|
59 |
+
"ref_level_db": 20,
|
60 |
+
"bits": 8,
|
61 |
+
// Directory names of processed data or extracted features
|
62 |
+
"processed_dir": "processed_data",
|
63 |
+
"trimmed_wav_dir": "trimmed_wavs", // directory name of silence-trimmed wav
|
64 |
+
"raw_data": "raw_data",
|
65 |
+
"phone_dir": "phones",
|
66 |
+
"wav_dir": "wavs", // directory name of processed wav (such as downsampled waveform)
|
67 |
+
"audio_dir": "audios",
|
68 |
+
"log_amplitude_dir": "log_amplitudes",
|
69 |
+
"phase_dir": "phases",
|
70 |
+
"real_dir": "reals",
|
71 |
+
"imaginary_dir": "imaginarys",
|
72 |
+
"label_dir": "labels",
|
73 |
+
"linear_dir": "linears",
|
74 |
+
"mel_dir": "mels", // directory name of extracted mel features
|
75 |
+
"mcep_dir": "mcep", // directory name of extracted mcep features
|
76 |
+
"dur_dir": "durs",
|
77 |
+
"symbols_dict": "symbols.dict",
|
78 |
+
"lab_dir": "labs", // directory name of extracted label features
|
79 |
+
"wenet_dir": "wenet", // directory name of extracted wenet features
|
80 |
+
"contentvec_dir": "contentvec", // directory name of extracted contentvec features
|
81 |
+
"pitch_dir": "pitches", // directory name of extracted pitch features
|
82 |
+
"energy_dir": "energys", // directory name of extracted energy features
|
83 |
+
"phone_pitch_dir": "phone_pitches", // directory name of extracted pitch features
|
84 |
+
"phone_energy_dir": "phone_energys", // directory name of extracted energy features
|
85 |
+
"uv_dir": "uvs", // directory name of extracted unvoiced features
|
86 |
+
"duration_dir": "duration", // ground-truth duration file
|
87 |
+
"phone_seq_file": "phone_seq_file", // phoneme sequence file
|
88 |
+
"file_lst": "file.lst",
|
89 |
+
"train_file": "train.json", // training set, the json file contains detailed information about the dataset, including dataset name, utterance id, duration of the utterance
|
90 |
+
"valid_file": "valid.json", // validation set
|
91 |
+
"spk2id": "spk2id.json", // used for multi-speaker dataset
|
92 |
+
"utt2spk": "utt2spk", // used for multi-speaker dataset
|
93 |
+
"emo2id": "emo2id.json", // used for multi-emotion dataset
|
94 |
+
"utt2emo": "utt2emo", // used for multi-emotion dataset
|
95 |
+
// Features used for model training
|
96 |
+
"use_text": false,
|
97 |
+
"use_phone": false,
|
98 |
+
"use_phn_seq": false,
|
99 |
+
"use_lab": false,
|
100 |
+
"use_linear": false,
|
101 |
+
"use_mel": false,
|
102 |
+
"use_min_max_norm_mel": false,
|
103 |
+
"use_wav": false,
|
104 |
+
"use_phone_pitch": false,
|
105 |
+
"use_log_scale_pitch": false,
|
106 |
+
"use_phone_energy": false,
|
107 |
+
"use_phone_duration": false,
|
108 |
+
"use_log_scale_energy": false,
|
109 |
+
"use_wenet": false,
|
110 |
+
"use_dur": false,
|
111 |
+
"use_spkid": false, // True: use speaker id for multi-speaker dataset
|
112 |
+
"use_emoid": false, // True: use emotion id for multi-emotion dataset
|
113 |
+
"use_frame_pitch": false,
|
114 |
+
"use_uv": false,
|
115 |
+
"use_frame_energy": false,
|
116 |
+
"use_frame_duration": false,
|
117 |
+
"use_audio": false,
|
118 |
+
"use_label": false,
|
119 |
+
"use_one_hot": false,
|
120 |
+
"use_amplitude_phase": false,
|
121 |
+
"align_mel_duration": false
|
122 |
+
},
|
123 |
+
"train": {
|
124 |
+
"ddp": true,
|
125 |
+
"batch_size": 16,
|
126 |
+
"max_steps": 1000000,
|
127 |
+
// Trackers
|
128 |
+
"tracker": [
|
129 |
+
"tensorboard"
|
130 |
+
// "wandb",
|
131 |
+
// "cometml",
|
132 |
+
// "mlflow",
|
133 |
+
],
|
134 |
+
"max_epoch": -1,
|
135 |
+
// -1 means no limit
|
136 |
+
"save_checkpoint_stride": [
|
137 |
+
5,
|
138 |
+
20
|
139 |
+
],
|
140 |
+
// unit is epoch
|
141 |
+
"keep_last": [
|
142 |
+
3,
|
143 |
+
-1
|
144 |
+
],
|
145 |
+
// -1 means infinite, if one number will broadcast
|
146 |
+
"run_eval": [
|
147 |
+
false,
|
148 |
+
true
|
149 |
+
],
|
150 |
+
// if one number will broadcast
|
151 |
+
// Fix the random seed
|
152 |
+
"random_seed": 10086,
|
153 |
+
// Optimizer
|
154 |
+
"optimizer": "AdamW",
|
155 |
+
"adamw": {
|
156 |
+
"lr": 4.0e-4
|
157 |
+
// nn model lr
|
158 |
+
},
|
159 |
+
// LR Scheduler
|
160 |
+
"scheduler": "ReduceLROnPlateau",
|
161 |
+
"reducelronplateau": {
|
162 |
+
"factor": 0.8,
|
163 |
+
"patience": 10,
|
164 |
+
// unit is epoch
|
165 |
+
"min_lr": 1.0e-4
|
166 |
+
},
|
167 |
+
// Batchsampler
|
168 |
+
"sampler": {
|
169 |
+
"holistic_shuffle": true,
|
170 |
+
"drop_last": true
|
171 |
+
},
|
172 |
+
// Dataloader
|
173 |
+
"dataloader": {
|
174 |
+
"num_worker": 32,
|
175 |
+
"pin_memory": true
|
176 |
+
},
|
177 |
+
"gradient_accumulation_step": 1,
|
178 |
+
"total_training_steps": 50000,
|
179 |
+
"save_summary_steps": 500,
|
180 |
+
"save_checkpoints_steps": 10000,
|
181 |
+
"valid_interval": 10000,
|
182 |
+
"keep_checkpoint_max": 5,
|
183 |
+
"multi_speaker_training": false // True: train a multi-speaker model; False: train a single-speaker model
|
184 |
+
}
|
185 |
+
}
|
config/comosvc.json
ADDED
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/svc/base.json",
|
3 |
+
"model_type": "DiffComoSVC",
|
4 |
+
"task_type": "svc",
|
5 |
+
"preprocess": {
|
6 |
+
// data augmentations
|
7 |
+
"use_pitch_shift": false,
|
8 |
+
"use_formant_shift": false,
|
9 |
+
"use_time_stretch": false,
|
10 |
+
"use_equalizer": false,
|
11 |
+
// acoustic features
|
12 |
+
"extract_mel": true,
|
13 |
+
"mel_min_max_norm": true,
|
14 |
+
"extract_pitch": true,
|
15 |
+
"pitch_extractor": "parselmouth",
|
16 |
+
"extract_uv": true,
|
17 |
+
"extract_energy": true,
|
18 |
+
// content features
|
19 |
+
"extract_whisper_feature": false,
|
20 |
+
"whisper_sample_rate": 16000,
|
21 |
+
"extract_contentvec_feature": false,
|
22 |
+
"contentvec_sample_rate": 16000,
|
23 |
+
"extract_wenet_feature": false,
|
24 |
+
"wenet_sample_rate": 16000,
|
25 |
+
"extract_mert_feature": false,
|
26 |
+
"mert_sample_rate": 16000,
|
27 |
+
// Default config for whisper
|
28 |
+
"whisper_frameshift": 0.01,
|
29 |
+
"whisper_downsample_rate": 2,
|
30 |
+
// Default config for content vector
|
31 |
+
"contentvec_frameshift": 0.02,
|
32 |
+
// Default config for mert
|
33 |
+
"mert_model": "m-a-p/MERT-v1-330M",
|
34 |
+
"mert_feature_layer": -1,
|
35 |
+
"mert_hop_size": 320,
|
36 |
+
// 24k
|
37 |
+
"mert_frameshit": 0.01333,
|
38 |
+
// 10ms
|
39 |
+
"wenet_frameshift": 0.01,
|
40 |
+
// wenetspeech is 4, gigaspeech is 6
|
41 |
+
"wenet_downsample_rate": 4,
|
42 |
+
// Default config
|
43 |
+
"n_mel": 100,
|
44 |
+
"win_size": 1024,
|
45 |
+
// todo
|
46 |
+
"hop_size": 256,
|
47 |
+
"sample_rate": 24000,
|
48 |
+
"n_fft": 1024,
|
49 |
+
// todo
|
50 |
+
"fmin": 0,
|
51 |
+
"fmax": 12000,
|
52 |
+
// todo
|
53 |
+
"f0_min": 50,
|
54 |
+
// ~C2
|
55 |
+
"f0_max": 1100,
|
56 |
+
//1100, // ~C6(1100), ~G5(800)
|
57 |
+
"pitch_bin": 256,
|
58 |
+
"pitch_max": 1100.0,
|
59 |
+
"pitch_min": 50.0,
|
60 |
+
"is_label": true,
|
61 |
+
"is_mu_law": true,
|
62 |
+
"bits": 8,
|
63 |
+
"mel_min_max_stats_dir": "mel_min_max_stats",
|
64 |
+
"whisper_dir": "whisper",
|
65 |
+
"contentvec_dir": "contentvec",
|
66 |
+
"wenet_dir": "wenet",
|
67 |
+
"mert_dir": "mert",
|
68 |
+
// Extract content features using dataloader
|
69 |
+
"pin_memory": true,
|
70 |
+
"num_workers": 8,
|
71 |
+
"content_feature_batch_size": 16,
|
72 |
+
// Features used for model training
|
73 |
+
"use_mel": true,
|
74 |
+
"use_min_max_norm_mel": true,
|
75 |
+
"use_frame_pitch": true,
|
76 |
+
"use_uv": true,
|
77 |
+
"use_frame_energy": true,
|
78 |
+
"use_log_scale_pitch": false,
|
79 |
+
"use_log_scale_energy": false,
|
80 |
+
"use_spkid": true,
|
81 |
+
// Meta file
|
82 |
+
"train_file": "train.json",
|
83 |
+
"valid_file": "test.json",
|
84 |
+
"spk2id": "singers.json",
|
85 |
+
"utt2spk": "utt2singer"
|
86 |
+
},
|
87 |
+
"model": {
|
88 |
+
"teacher_model_path": "[Your Teacher Model Path].bin",
|
89 |
+
"condition_encoder": {
|
90 |
+
"merge_mode": "add",
|
91 |
+
"input_melody_dim": 1,
|
92 |
+
"use_log_f0": true,
|
93 |
+
"n_bins_melody": 256,
|
94 |
+
//# Quantization (0 for not quantization)
|
95 |
+
"output_melody_dim": 384,
|
96 |
+
"input_loudness_dim": 1,
|
97 |
+
"use_log_loudness": true,
|
98 |
+
"n_bins_loudness": 256,
|
99 |
+
"output_loudness_dim": 384,
|
100 |
+
"use_whisper": false,
|
101 |
+
"use_contentvec": false,
|
102 |
+
"use_wenet": false,
|
103 |
+
"use_mert": false,
|
104 |
+
"whisper_dim": 1024,
|
105 |
+
"contentvec_dim": 256,
|
106 |
+
"mert_dim": 256,
|
107 |
+
"wenet_dim": 512,
|
108 |
+
"content_encoder_dim": 384,
|
109 |
+
"output_singer_dim": 384,
|
110 |
+
"singer_table_size": 512,
|
111 |
+
"output_content_dim": 384,
|
112 |
+
"use_spkid": true
|
113 |
+
},
|
114 |
+
"comosvc": {
|
115 |
+
"distill": false,
|
116 |
+
// conformer encoder
|
117 |
+
"input_dim": 384,
|
118 |
+
"output_dim": 100,
|
119 |
+
"n_heads": 2,
|
120 |
+
"n_layers": 6,
|
121 |
+
"filter_channels": 512,
|
122 |
+
"dropout": 0.1,
|
123 |
+
// karras diffusion
|
124 |
+
"P_mean": -1.2,
|
125 |
+
"P_std": 1.2,
|
126 |
+
"sigma_data": 0.5,
|
127 |
+
"sigma_min": 0.002,
|
128 |
+
"sigma_max": 80,
|
129 |
+
"rho": 7,
|
130 |
+
"n_timesteps": 18,
|
131 |
+
},
|
132 |
+
"diffusion": {
|
133 |
+
// Diffusion steps encoder
|
134 |
+
"step_encoder": {
|
135 |
+
"dim_raw_embedding": 128,
|
136 |
+
"dim_hidden_layer": 512,
|
137 |
+
"activation": "SiLU",
|
138 |
+
"num_layer": 2,
|
139 |
+
"max_period": 10000
|
140 |
+
},
|
141 |
+
// Diffusion decoder
|
142 |
+
"model_type": "bidilconv",
|
143 |
+
// bidilconv, unet2d, TODO: unet1d
|
144 |
+
"bidilconv": {
|
145 |
+
"base_channel": 384,
|
146 |
+
"n_res_block": 20,
|
147 |
+
"conv_kernel_size": 3,
|
148 |
+
"dilation_cycle_length": 4,
|
149 |
+
// specially, 1 means no dilation
|
150 |
+
"conditioner_size": 100
|
151 |
+
}
|
152 |
+
},
|
153 |
+
},
|
154 |
+
"train": {
|
155 |
+
// Basic settings
|
156 |
+
"fast_steps": 0,
|
157 |
+
"batch_size": 64,
|
158 |
+
"gradient_accumulation_step": 1,
|
159 |
+
"max_epoch": -1,
|
160 |
+
// -1 means no limit
|
161 |
+
"save_checkpoint_stride": [
|
162 |
+
10,
|
163 |
+
100
|
164 |
+
],
|
165 |
+
// unit is epoch
|
166 |
+
"keep_last": [
|
167 |
+
3,
|
168 |
+
-1
|
169 |
+
],
|
170 |
+
// -1 means infinite, if one number will broadcast
|
171 |
+
"run_eval": [
|
172 |
+
false,
|
173 |
+
true
|
174 |
+
],
|
175 |
+
// if one number will broadcast
|
176 |
+
// Fix the random seed
|
177 |
+
"random_seed": 10086,
|
178 |
+
// Batchsampler
|
179 |
+
"sampler": {
|
180 |
+
"holistic_shuffle": true,
|
181 |
+
"drop_last": true
|
182 |
+
},
|
183 |
+
// Dataloader
|
184 |
+
"dataloader": {
|
185 |
+
"num_worker": 32,
|
186 |
+
"pin_memory": true
|
187 |
+
},
|
188 |
+
// Trackers
|
189 |
+
"tracker": [
|
190 |
+
"tensorboard"
|
191 |
+
// "wandb",
|
192 |
+
// "cometml",
|
193 |
+
// "mlflow",
|
194 |
+
],
|
195 |
+
// Optimizer
|
196 |
+
"optimizer": "AdamW",
|
197 |
+
"adamw": {
|
198 |
+
"lr": 5.0e-5
|
199 |
+
// nn model lr
|
200 |
+
},
|
201 |
+
// LR Scheduler
|
202 |
+
"scheduler": "ReduceLROnPlateau",
|
203 |
+
"reducelronplateau": {
|
204 |
+
"factor": 0.8,
|
205 |
+
"patience": 10,
|
206 |
+
// unit is epoch
|
207 |
+
"min_lr": 5.0e-6
|
208 |
+
}
|
209 |
+
},
|
210 |
+
"inference": {
|
211 |
+
"comosvc": {
|
212 |
+
"inference_steps": 40
|
213 |
+
}
|
214 |
+
}
|
215 |
+
}
|
config/facodec.json
ADDED
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"exp_name": "facodec",
|
3 |
+
"model_type": "FAcodec",
|
4 |
+
"log_dir": "./runs/",
|
5 |
+
"log_interval": 10,
|
6 |
+
"save_interval": 1000,
|
7 |
+
"device": "cuda",
|
8 |
+
"epochs": 1000,
|
9 |
+
"batch_size": 4,
|
10 |
+
"batch_length": 100,
|
11 |
+
"max_len": 80,
|
12 |
+
"pretrained_model": "",
|
13 |
+
"load_only_params": false,
|
14 |
+
"F0_path": "modules/JDC/bst.t7",
|
15 |
+
"dataset": "dummy",
|
16 |
+
"preprocess_params": {
|
17 |
+
"sr": 24000,
|
18 |
+
"frame_rate": 80,
|
19 |
+
"duration_range": [1.0, 25.0],
|
20 |
+
"spect_params": {
|
21 |
+
"n_fft": 2048,
|
22 |
+
"win_length": 1200,
|
23 |
+
"hop_length": 300,
|
24 |
+
"n_mels": 80,
|
25 |
+
},
|
26 |
+
},
|
27 |
+
"train": {
|
28 |
+
"gradient_accumulation_step": 1,
|
29 |
+
"batch_size": 1,
|
30 |
+
"save_checkpoint_stride": [20],
|
31 |
+
"random_seed": 1234,
|
32 |
+
"max_epoch": -1,
|
33 |
+
"max_frame_len": 80,
|
34 |
+
"tracker": ["tensorboard"],
|
35 |
+
"run_eval": [false],
|
36 |
+
"sampler": {"holistic_shuffle": true, "drop_last": true},
|
37 |
+
"dataloader": {"num_worker": 0, "pin_memory": true},
|
38 |
+
},
|
39 |
+
"model_params": {
|
40 |
+
"causal": true,
|
41 |
+
"lstm": 2,
|
42 |
+
"norm_f0": true,
|
43 |
+
"use_gr_content_f0": false,
|
44 |
+
"use_gr_prosody_phone": false,
|
45 |
+
"use_gr_timbre_prosody": false,
|
46 |
+
"separate_prosody_encoder": true,
|
47 |
+
"n_c_codebooks": 2,
|
48 |
+
"timbre_norm": true,
|
49 |
+
"use_gr_content_global_f0": true,
|
50 |
+
"DAC": {
|
51 |
+
"encoder_dim": 64,
|
52 |
+
"encoder_rates": [2, 5, 5, 6],
|
53 |
+
"decoder_dim": 1536,
|
54 |
+
"decoder_rates": [6, 5, 5, 2],
|
55 |
+
"sr": 24000,
|
56 |
+
},
|
57 |
+
},
|
58 |
+
"loss_params": {
|
59 |
+
"base_lr": 0.0001,
|
60 |
+
"warmup_steps": 200,
|
61 |
+
"discriminator_iter_start": 2000,
|
62 |
+
"lambda_spk": 1.0,
|
63 |
+
"lambda_mel": 45,
|
64 |
+
"lambda_f0": 1.0,
|
65 |
+
"lambda_uv": 1.0,
|
66 |
+
},
|
67 |
+
}
|
config/fs2.json
ADDED
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/tts.json",
|
3 |
+
"model_type": "FastSpeech2",
|
4 |
+
"task_type": "tts",
|
5 |
+
"dataset": ["LJSpeech"],
|
6 |
+
"preprocess": {
|
7 |
+
// acoustic features
|
8 |
+
"extract_audio": true,
|
9 |
+
"extract_mel": true,
|
10 |
+
"mel_extract_mode": "taco",
|
11 |
+
"mel_min_max_norm": false,
|
12 |
+
"extract_pitch": true,
|
13 |
+
"extract_uv": false,
|
14 |
+
"pitch_extractor": "dio",
|
15 |
+
"extract_energy": true,
|
16 |
+
"energy_extract_mode": "from_tacotron_stft",
|
17 |
+
"extract_duration": true,
|
18 |
+
"use_phone": false,
|
19 |
+
"pitch_norm": true,
|
20 |
+
"energy_norm": true,
|
21 |
+
"pitch_remove_outlier": true,
|
22 |
+
"energy_remove_outlier": true,
|
23 |
+
|
24 |
+
// Default config
|
25 |
+
"n_mel": 80,
|
26 |
+
"win_size": 1024, // todo
|
27 |
+
"hop_size": 256,
|
28 |
+
"sample_rate": 22050,
|
29 |
+
"n_fft": 1024, // todo
|
30 |
+
"fmin": 0,
|
31 |
+
"fmax": 8000, // todo
|
32 |
+
"raw_data": "raw_data",
|
33 |
+
"text_cleaners": ["english_cleaners"],
|
34 |
+
"f0_min": 71, // ~C2
|
35 |
+
"f0_max": 800, //1100, // ~C6(1100), ~G5(800)
|
36 |
+
"pitch_bin": 256,
|
37 |
+
"pitch_max": 1100.0,
|
38 |
+
"pitch_min": 50.0,
|
39 |
+
"is_label": true,
|
40 |
+
"is_mu_law": true,
|
41 |
+
"bits": 8,
|
42 |
+
|
43 |
+
"mel_min_max_stats_dir": "mel_min_max_stats",
|
44 |
+
"whisper_dir": "whisper",
|
45 |
+
"content_vector_dir": "content_vector",
|
46 |
+
"wenet_dir": "wenet",
|
47 |
+
"mert_dir": "mert",
|
48 |
+
"spk2id":"spk2id.json",
|
49 |
+
"utt2spk":"utt2spk",
|
50 |
+
"valid_file": "test.json",
|
51 |
+
|
52 |
+
// Features used for model training
|
53 |
+
"use_mel": true,
|
54 |
+
"use_min_max_norm_mel": false,
|
55 |
+
"use_frame_pitch": false,
|
56 |
+
"use_frame_energy": false,
|
57 |
+
"use_phone_pitch": true,
|
58 |
+
"use_phone_energy": true,
|
59 |
+
"use_log_scale_pitch": false,
|
60 |
+
"use_log_scale_energy": false,
|
61 |
+
"use_spkid": false,
|
62 |
+
"align_mel_duration": true,
|
63 |
+
"text_cleaners": ["english_cleaners"],
|
64 |
+
"phone_extractor": "lexicon", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)"
|
65 |
+
},
|
66 |
+
"model": {
|
67 |
+
// Settings for transformer
|
68 |
+
"transformer": {
|
69 |
+
"encoder_layer": 4,
|
70 |
+
"encoder_head": 2,
|
71 |
+
"encoder_hidden": 256,
|
72 |
+
"decoder_layer": 6,
|
73 |
+
"decoder_head": 2,
|
74 |
+
"decoder_hidden": 256,
|
75 |
+
"conv_filter_size": 1024,
|
76 |
+
"conv_kernel_size": [9, 1],
|
77 |
+
"encoder_dropout": 0.2,
|
78 |
+
"decoder_dropout": 0.2
|
79 |
+
},
|
80 |
+
|
81 |
+
// Settings for variance_predictor
|
82 |
+
"variance_predictor":{
|
83 |
+
"filter_size": 256,
|
84 |
+
"kernel_size": 3,
|
85 |
+
"dropout": 0.5
|
86 |
+
},
|
87 |
+
"variance_embedding":{
|
88 |
+
"pitch_quantization": "linear", // support 'linear' or 'log', 'log' is allowed only if the pitch values are not normalized during preprocessing
|
89 |
+
"energy_quantization": "linear", // support 'linear' or 'log', 'log' is allowed only if the energy values are not normalized during preprocessing
|
90 |
+
"n_bins": 256
|
91 |
+
},
|
92 |
+
"max_seq_len": 1000
|
93 |
+
},
|
94 |
+
"train":{
|
95 |
+
"batch_size": 16,
|
96 |
+
"max_epoch": 100,
|
97 |
+
"sort_sample": true,
|
98 |
+
"drop_last": true,
|
99 |
+
"group_size": 4,
|
100 |
+
"grad_clip_thresh": 1.0,
|
101 |
+
"dataloader": {
|
102 |
+
"num_worker": 8,
|
103 |
+
"pin_memory": true
|
104 |
+
},
|
105 |
+
"lr_scheduler":{
|
106 |
+
"num_warmup": 4000
|
107 |
+
},
|
108 |
+
// LR Scheduler
|
109 |
+
"scheduler": "NoamLR",
|
110 |
+
// Optimizer
|
111 |
+
"optimizer": "Adam",
|
112 |
+
"adam": {
|
113 |
+
"lr": 0.0625,
|
114 |
+
"betas": [0.9, 0.98],
|
115 |
+
"eps": 0.000000001,
|
116 |
+
"weight_decay": 0.0
|
117 |
+
},
|
118 |
+
}
|
119 |
+
|
120 |
+
}
|
config/jets.json
ADDED
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/tts.json",
|
3 |
+
"model_type": "Jets",
|
4 |
+
"task_type": "tts",
|
5 |
+
"dataset": ["LJSpeech"],
|
6 |
+
"preprocess": {
|
7 |
+
// acoustic features
|
8 |
+
"extract_audio": true,
|
9 |
+
"extract_mel": true,
|
10 |
+
"mel_extract_mode": "taco",
|
11 |
+
"mel_min_max_norm": false,
|
12 |
+
"extract_pitch": true,
|
13 |
+
"extract_uv": false,
|
14 |
+
"pitch_extractor": "dio",
|
15 |
+
"extract_energy": true,
|
16 |
+
"energy_extract_mode": "from_tacotron_stft",
|
17 |
+
"extract_duration": true,
|
18 |
+
"use_phone": false,
|
19 |
+
"pitch_norm": true,
|
20 |
+
"energy_norm": true,
|
21 |
+
"pitch_remove_outlier": true,
|
22 |
+
"energy_remove_outlier": true,
|
23 |
+
|
24 |
+
// Default config
|
25 |
+
"n_mel": 80,
|
26 |
+
"win_size": 1024, // todo
|
27 |
+
"hop_size": 256,
|
28 |
+
"sample_rate": 22050,
|
29 |
+
"n_fft": 1024, // todo
|
30 |
+
"fmin": 0,
|
31 |
+
"fmax": 8000, // todo
|
32 |
+
"raw_data": "raw_data",
|
33 |
+
"text_cleaners": ["english_cleaners"],
|
34 |
+
"f0_min": 71, // ~C2
|
35 |
+
"f0_max": 800, //1100, // ~C6(1100), ~G5(800)
|
36 |
+
"pitch_bin": 256,
|
37 |
+
"pitch_max": 1100.0,
|
38 |
+
"pitch_min": 50.0,
|
39 |
+
"is_label": true,
|
40 |
+
"is_mu_law": true,
|
41 |
+
"bits": 8,
|
42 |
+
|
43 |
+
"mel_min_max_stats_dir": "mel_min_max_stats",
|
44 |
+
"whisper_dir": "whisper",
|
45 |
+
"content_vector_dir": "content_vector",
|
46 |
+
"wenet_dir": "wenet",
|
47 |
+
"mert_dir": "mert",
|
48 |
+
"spk2id":"spk2id.json",
|
49 |
+
"utt2spk":"utt2spk",
|
50 |
+
"valid_file": "test.json",
|
51 |
+
|
52 |
+
// Features used for model training
|
53 |
+
"use_mel": true,
|
54 |
+
"use_min_max_norm_mel": false,
|
55 |
+
"use_frame_pitch": true,
|
56 |
+
"use_frame_energy": true,
|
57 |
+
"use_phone_pitch": false,
|
58 |
+
"use_phone_energy": false,
|
59 |
+
"use_log_scale_pitch": false,
|
60 |
+
"use_log_scale_energy": false,
|
61 |
+
"use_spkid": false,
|
62 |
+
"align_mel_duration": true,
|
63 |
+
"text_cleaners": ["english_cleaners"],
|
64 |
+
"phone_extractor": "lexicon", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)"
|
65 |
+
},
|
66 |
+
"model": {
|
67 |
+
// Settings for transformer
|
68 |
+
"transformer": {
|
69 |
+
"encoder_layer": 4,
|
70 |
+
"encoder_head": 2,
|
71 |
+
"encoder_hidden": 256,
|
72 |
+
"decoder_layer": 6,
|
73 |
+
"decoder_head": 2,
|
74 |
+
"decoder_hidden": 256,
|
75 |
+
"conv_filter_size": 1024,
|
76 |
+
"conv_kernel_size": [9, 1],
|
77 |
+
"encoder_dropout": 0.2,
|
78 |
+
"decoder_dropout": 0.2
|
79 |
+
},
|
80 |
+
|
81 |
+
// Settings for variance_predictor
|
82 |
+
"variance_predictor":{
|
83 |
+
"filter_size": 256,
|
84 |
+
"kernel_size": 3,
|
85 |
+
"dropout": 0.5
|
86 |
+
},
|
87 |
+
"variance_embedding":{
|
88 |
+
"pitch_quantization": "linear", // support 'linear' or 'log', 'log' is allowed only if the pitch values are not normalized during preprocessing
|
89 |
+
"energy_quantization": "linear", // support 'linear' or 'log', 'log' is allowed only if the energy values are not normalized during preprocessing
|
90 |
+
"n_bins": 256
|
91 |
+
},
|
92 |
+
"max_seq_len": 1000
|
93 |
+
},
|
94 |
+
"train":{
|
95 |
+
"batch_size": 16,
|
96 |
+
"max_epoch": 100,
|
97 |
+
"sort_sample": true,
|
98 |
+
"drop_last": true,
|
99 |
+
"group_size": 4,
|
100 |
+
"grad_clip_thresh": 1.0,
|
101 |
+
"dataloader": {
|
102 |
+
"num_worker": 8,
|
103 |
+
"pin_memory": true
|
104 |
+
},
|
105 |
+
"lr_scheduler":{
|
106 |
+
"num_warmup": 4000
|
107 |
+
},
|
108 |
+
// LR Scheduler
|
109 |
+
"scheduler": "NoamLR",
|
110 |
+
// Optimizer
|
111 |
+
"optimizer": "Adam",
|
112 |
+
"adam": {
|
113 |
+
"lr": 0.0625,
|
114 |
+
"betas": [0.9, 0.98],
|
115 |
+
"eps": 0.000000001,
|
116 |
+
"weight_decay": 0.0
|
117 |
+
},
|
118 |
+
}
|
119 |
+
|
120 |
+
}
|
config/ns2.json
ADDED
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/base.json",
|
3 |
+
"model_type": "NaturalSpeech2",
|
4 |
+
"dataset": ["libritts"],
|
5 |
+
"preprocess": {
|
6 |
+
"use_mel": false,
|
7 |
+
"use_code": true,
|
8 |
+
"use_spkid": true,
|
9 |
+
"use_pitch": true,
|
10 |
+
"use_duration": true,
|
11 |
+
"use_phone": true,
|
12 |
+
"use_len": true,
|
13 |
+
"use_cross_reference": true,
|
14 |
+
"train_file": "train.json",
|
15 |
+
"melspec_dir": "mel",
|
16 |
+
"code_dir": "code",
|
17 |
+
"pitch_dir": "pitch",
|
18 |
+
"duration_dir": "duration",
|
19 |
+
"clip_mode": "start"
|
20 |
+
},
|
21 |
+
"model": {
|
22 |
+
"latent_dim": 128,
|
23 |
+
"prior_encoder": {
|
24 |
+
"vocab_size": 100,
|
25 |
+
"pitch_min": 50,
|
26 |
+
"pitch_max": 1100,
|
27 |
+
"pitch_bins_num": 512,
|
28 |
+
"encoder": {
|
29 |
+
"encoder_layer": 6,
|
30 |
+
"encoder_hidden": 512,
|
31 |
+
"encoder_head": 8,
|
32 |
+
"conv_filter_size": 2048,
|
33 |
+
"conv_kernel_size": 9,
|
34 |
+
"encoder_dropout": 0.2,
|
35 |
+
"use_cln": true
|
36 |
+
},
|
37 |
+
"duration_predictor": {
|
38 |
+
"input_size": 512,
|
39 |
+
"filter_size": 512,
|
40 |
+
"kernel_size": 3,
|
41 |
+
"conv_layers": 30,
|
42 |
+
"cross_attn_per_layer": 3,
|
43 |
+
"attn_head": 8,
|
44 |
+
"drop_out": 0.5
|
45 |
+
},
|
46 |
+
"pitch_predictor": {
|
47 |
+
"input_size": 512,
|
48 |
+
"filter_size": 512,
|
49 |
+
"kernel_size": 5,
|
50 |
+
"conv_layers": 30,
|
51 |
+
"cross_attn_per_layer": 3,
|
52 |
+
"attn_head": 8,
|
53 |
+
"drop_out": 0.5
|
54 |
+
}
|
55 |
+
},
|
56 |
+
"diffusion": {
|
57 |
+
"wavenet": {
|
58 |
+
"input_size": 128,
|
59 |
+
"hidden_size": 512,
|
60 |
+
"out_size": 128,
|
61 |
+
"num_layers": 40,
|
62 |
+
"cross_attn_per_layer": 3,
|
63 |
+
"dilation_cycle": 2,
|
64 |
+
"attn_head": 8,
|
65 |
+
"drop_out": 0.2
|
66 |
+
},
|
67 |
+
"beta_min": 0.05,
|
68 |
+
"beta_max": 20,
|
69 |
+
"sigma": 1.0,
|
70 |
+
"noise_factor": 1.0,
|
71 |
+
"ode_solver": "euler"
|
72 |
+
},
|
73 |
+
"prompt_encoder": {
|
74 |
+
"encoder_layer": 6,
|
75 |
+
"encoder_hidden": 512,
|
76 |
+
"encoder_head": 8,
|
77 |
+
"conv_filter_size": 2048,
|
78 |
+
"conv_kernel_size": 9,
|
79 |
+
"encoder_dropout": 0.2,
|
80 |
+
"use_cln": false
|
81 |
+
},
|
82 |
+
"query_emb": {
|
83 |
+
"query_token_num": 32,
|
84 |
+
"hidden_size": 512,
|
85 |
+
"head_num": 8
|
86 |
+
}
|
87 |
+
}
|
88 |
+
}
|
config/svc/base.json
ADDED
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/base.json",
|
3 |
+
"task_type": "svc",
|
4 |
+
"preprocess": {
|
5 |
+
// data augmentations
|
6 |
+
"use_pitch_shift": false,
|
7 |
+
"use_formant_shift": false,
|
8 |
+
"use_time_stretch": false,
|
9 |
+
"use_equalizer": false,
|
10 |
+
// Online or offline features extraction ("offline" or "online")
|
11 |
+
"features_extraction_mode": "offline",
|
12 |
+
// acoustic features
|
13 |
+
"extract_mel": true,
|
14 |
+
"mel_min_max_norm": true,
|
15 |
+
"extract_pitch": true,
|
16 |
+
"pitch_extractor": "parselmouth",
|
17 |
+
"extract_uv": true,
|
18 |
+
"extract_energy": true,
|
19 |
+
// content features
|
20 |
+
"extract_whisper_feature": false,
|
21 |
+
"whisper_sample_rate": 16000,
|
22 |
+
"extract_contentvec_feature": false,
|
23 |
+
"contentvec_sample_rate": 16000,
|
24 |
+
"extract_wenet_feature": false,
|
25 |
+
"wenet_sample_rate": 16000,
|
26 |
+
"extract_mert_feature": false,
|
27 |
+
"mert_sample_rate": 16000,
|
28 |
+
// Default config for whisper
|
29 |
+
"whisper_frameshift": 0.01,
|
30 |
+
"whisper_downsample_rate": 2,
|
31 |
+
// Default config for content vector
|
32 |
+
"contentvec_frameshift": 0.02,
|
33 |
+
// Default config for mert
|
34 |
+
"mert_model": "m-a-p/MERT-v1-330M",
|
35 |
+
"mert_feature_layer": -1,
|
36 |
+
"mert_hop_size": 320,
|
37 |
+
// 24k
|
38 |
+
"mert_frameshit": 0.01333,
|
39 |
+
// 10ms
|
40 |
+
"wenet_frameshift": 0.01,
|
41 |
+
// wenetspeech is 4, gigaspeech is 6
|
42 |
+
"wenet_downsample_rate": 4,
|
43 |
+
// Default config
|
44 |
+
"n_mel": 100,
|
45 |
+
"win_size": 1024,
|
46 |
+
// todo
|
47 |
+
"hop_size": 256,
|
48 |
+
"sample_rate": 24000,
|
49 |
+
"n_fft": 1024,
|
50 |
+
// todo
|
51 |
+
"fmin": 0,
|
52 |
+
"fmax": 12000,
|
53 |
+
// todo
|
54 |
+
"f0_min": 50,
|
55 |
+
// ~C2
|
56 |
+
"f0_max": 1100,
|
57 |
+
//1100, // ~C6(1100), ~G5(800)
|
58 |
+
"pitch_bin": 256,
|
59 |
+
"pitch_max": 1100.0,
|
60 |
+
"pitch_min": 50.0,
|
61 |
+
"is_label": true,
|
62 |
+
"is_mu_law": true,
|
63 |
+
"bits": 8,
|
64 |
+
"mel_min_max_stats_dir": "mel_min_max_stats",
|
65 |
+
"whisper_dir": "whisper",
|
66 |
+
"contentvec_dir": "contentvec",
|
67 |
+
"wenet_dir": "wenet",
|
68 |
+
"mert_dir": "mert",
|
69 |
+
// Extract content features using dataloader
|
70 |
+
"pin_memory": true,
|
71 |
+
"num_workers": 8,
|
72 |
+
"content_feature_batch_size": 16,
|
73 |
+
// Features used for model training
|
74 |
+
"use_mel": true,
|
75 |
+
"use_min_max_norm_mel": true,
|
76 |
+
"use_frame_pitch": true,
|
77 |
+
"use_uv": true,
|
78 |
+
"use_interpolation_for_uv": false,
|
79 |
+
"use_frame_energy": true,
|
80 |
+
"use_log_scale_pitch": false,
|
81 |
+
"use_log_scale_energy": false,
|
82 |
+
"use_spkid": true,
|
83 |
+
// Meta file
|
84 |
+
"train_file": "train.json",
|
85 |
+
"valid_file": "test.json",
|
86 |
+
"spk2id": "singers.json",
|
87 |
+
"utt2spk": "utt2singer"
|
88 |
+
},
|
89 |
+
"model": {
|
90 |
+
"condition_encoder": {
|
91 |
+
"merge_mode": "add",
|
92 |
+
// Prosody Features
|
93 |
+
"use_f0": true,
|
94 |
+
"use_uv": true,
|
95 |
+
"use_energy": true,
|
96 |
+
// Quantization (0 means no quantization)
|
97 |
+
"input_melody_dim": 1,
|
98 |
+
"n_bins_melody": 256,
|
99 |
+
"output_melody_dim": 384,
|
100 |
+
"input_loudness_dim": 1,
|
101 |
+
"n_bins_loudness": 256,
|
102 |
+
"output_loudness_dim": 384,
|
103 |
+
// Semantic Features
|
104 |
+
"use_whisper": false,
|
105 |
+
"use_contentvec": false,
|
106 |
+
"use_wenet": false,
|
107 |
+
"use_mert": false,
|
108 |
+
"whisper_dim": 1024,
|
109 |
+
"contentvec_dim": 256,
|
110 |
+
"mert_dim": 256,
|
111 |
+
"wenet_dim": 512,
|
112 |
+
"content_encoder_dim": 384,
|
113 |
+
// Speaker Features
|
114 |
+
"output_singer_dim": 384,
|
115 |
+
"singer_table_size": 512,
|
116 |
+
"use_spkid": true
|
117 |
+
}
|
118 |
+
},
|
119 |
+
}
|
config/svc/diffusion.json
ADDED
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/svc/base.json",
|
3 |
+
"model": {
|
4 |
+
"condition_encoder": {
|
5 |
+
"merge_mode": "add",
|
6 |
+
// Prosody Features
|
7 |
+
"use_f0": true,
|
8 |
+
"use_uv": true,
|
9 |
+
"use_energy": true,
|
10 |
+
// Quantization (0 means no quantization)
|
11 |
+
"input_melody_dim": 1,
|
12 |
+
"n_bins_melody": 256,
|
13 |
+
"output_melody_dim": 384,
|
14 |
+
"input_loudness_dim": 1,
|
15 |
+
"n_bins_loudness": 256,
|
16 |
+
"output_loudness_dim": 384,
|
17 |
+
// Semantic Features
|
18 |
+
"use_whisper": false,
|
19 |
+
"use_contentvec": false,
|
20 |
+
"use_wenet": false,
|
21 |
+
"use_mert": false,
|
22 |
+
"whisper_dim": 1024,
|
23 |
+
"contentvec_dim": 256,
|
24 |
+
"mert_dim": 256,
|
25 |
+
"wenet_dim": 512,
|
26 |
+
"content_encoder_dim": 384,
|
27 |
+
// Speaker Features
|
28 |
+
"output_singer_dim": 384,
|
29 |
+
"singer_table_size": 512,
|
30 |
+
"use_spkid": true
|
31 |
+
},
|
32 |
+
"diffusion": {
|
33 |
+
"scheduler": "ddpm",
|
34 |
+
"scheduler_settings": {
|
35 |
+
"num_train_timesteps": 1000,
|
36 |
+
"beta_start": 1.0e-4,
|
37 |
+
"beta_end": 0.02,
|
38 |
+
"beta_schedule": "linear"
|
39 |
+
},
|
40 |
+
// Diffusion steps encoder
|
41 |
+
"step_encoder": {
|
42 |
+
"dim_raw_embedding": 128,
|
43 |
+
"dim_hidden_layer": 512,
|
44 |
+
"activation": "SiLU",
|
45 |
+
"num_layer": 2,
|
46 |
+
"max_period": 10000
|
47 |
+
},
|
48 |
+
// Diffusion decoder
|
49 |
+
"model_type": "bidilconv",
|
50 |
+
// bidilconv, unet2d, TODO: unet1d
|
51 |
+
"bidilconv": {
|
52 |
+
"base_channel": 384,
|
53 |
+
"n_res_block": 20,
|
54 |
+
"conv_kernel_size": 3,
|
55 |
+
"dilation_cycle_length": 4,
|
56 |
+
// specially, 1 means no dilation
|
57 |
+
"conditioner_size": 384
|
58 |
+
},
|
59 |
+
"unet2d": {
|
60 |
+
"in_channels": 1,
|
61 |
+
"out_channels": 1,
|
62 |
+
"down_block_types": [
|
63 |
+
"CrossAttnDownBlock2D",
|
64 |
+
"CrossAttnDownBlock2D",
|
65 |
+
"CrossAttnDownBlock2D",
|
66 |
+
"DownBlock2D"
|
67 |
+
],
|
68 |
+
"mid_block_type": "UNetMidBlock2DCrossAttn",
|
69 |
+
"up_block_types": [
|
70 |
+
"UpBlock2D",
|
71 |
+
"CrossAttnUpBlock2D",
|
72 |
+
"CrossAttnUpBlock2D",
|
73 |
+
"CrossAttnUpBlock2D"
|
74 |
+
],
|
75 |
+
"only_cross_attention": false
|
76 |
+
}
|
77 |
+
}
|
78 |
+
},
|
79 |
+
"train": {
|
80 |
+
// Basic settings
|
81 |
+
"batch_size": 64,
|
82 |
+
"gradient_accumulation_step": 1,
|
83 |
+
"max_epoch": -1,
|
84 |
+
// -1 means no limit
|
85 |
+
"save_checkpoint_stride": [
|
86 |
+
5,
|
87 |
+
20
|
88 |
+
],
|
89 |
+
// unit is epoch
|
90 |
+
"keep_last": [
|
91 |
+
3,
|
92 |
+
-1
|
93 |
+
],
|
94 |
+
// -1 means infinite; a single number will be broadcast
|
95 |
+
"run_eval": [
|
96 |
+
false,
|
97 |
+
true
|
98 |
+
],
|
99 |
+
// a single value will be broadcast
|
100 |
+
// Fix the random seed
|
101 |
+
"random_seed": 10086,
|
102 |
+
// Batchsampler
|
103 |
+
"sampler": {
|
104 |
+
"holistic_shuffle": true,
|
105 |
+
"drop_last": true
|
106 |
+
},
|
107 |
+
// Dataloader
|
108 |
+
"dataloader": {
|
109 |
+
"num_worker": 32,
|
110 |
+
"pin_memory": true
|
111 |
+
},
|
112 |
+
// Trackers
|
113 |
+
"tracker": [
|
114 |
+
"tensorboard"
|
115 |
+
// "wandb",
|
116 |
+
// "cometml",
|
117 |
+
// "mlflow",
|
118 |
+
],
|
119 |
+
// Optimizer
|
120 |
+
"optimizer": "AdamW",
|
121 |
+
"adamw": {
|
122 |
+
"lr": 4.0e-4
|
123 |
+
// nn model lr
|
124 |
+
},
|
125 |
+
// LR Scheduler
|
126 |
+
"scheduler": "ReduceLROnPlateau",
|
127 |
+
"reducelronplateau": {
|
128 |
+
"factor": 0.8,
|
129 |
+
"patience": 10,
|
130 |
+
// unit is epoch
|
131 |
+
"min_lr": 1.0e-4
|
132 |
+
}
|
133 |
+
},
|
134 |
+
"inference": {
|
135 |
+
"diffusion": {
|
136 |
+
"scheduler": "pndm",
|
137 |
+
"scheduler_settings": {
|
138 |
+
"num_inference_timesteps": 1000
|
139 |
+
}
|
140 |
+
}
|
141 |
+
}
|
142 |
+
}
|
config/transformer.json
ADDED
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/svc/base.json",
|
3 |
+
"model_type": "Transformer",
|
4 |
+
"task_type": "svc",
|
5 |
+
"preprocess": {
|
6 |
+
// data augmentations
|
7 |
+
"use_pitch_shift": false,
|
8 |
+
"use_formant_shift": false,
|
9 |
+
"use_time_stretch": false,
|
10 |
+
"use_equalizer": false,
|
11 |
+
// acoustic features
|
12 |
+
"extract_mel": true,
|
13 |
+
"mel_min_max_norm": true,
|
14 |
+
"extract_pitch": true,
|
15 |
+
"pitch_extractor": "parselmouth",
|
16 |
+
"extract_uv": true,
|
17 |
+
"extract_energy": true,
|
18 |
+
// content features
|
19 |
+
"extract_whisper_feature": false,
|
20 |
+
"whisper_sample_rate": 16000,
|
21 |
+
"extract_contentvec_feature": false,
|
22 |
+
"contentvec_sample_rate": 16000,
|
23 |
+
"extract_wenet_feature": false,
|
24 |
+
"wenet_sample_rate": 16000,
|
25 |
+
"extract_mert_feature": false,
|
26 |
+
"mert_sample_rate": 16000,
|
27 |
+
// Default config for whisper
|
28 |
+
"whisper_frameshift": 0.01,
|
29 |
+
"whisper_downsample_rate": 2,
|
30 |
+
// Default config for content vector
|
31 |
+
"contentvec_frameshift": 0.02,
|
32 |
+
// Default config for mert
|
33 |
+
"mert_model": "m-a-p/MERT-v1-330M",
|
34 |
+
"mert_feature_layer": -1,
|
35 |
+
"mert_hop_size": 320,
|
36 |
+
// 24k
|
37 |
+
"mert_frameshit": 0.01333,
|
38 |
+
// 10ms
|
39 |
+
"wenet_frameshift": 0.01,
|
40 |
+
// wenetspeech is 4, gigaspeech is 6
|
41 |
+
"wenet_downsample_rate": 4,
|
42 |
+
// Default config
|
43 |
+
"n_mel": 100,
|
44 |
+
"win_size": 1024,
|
45 |
+
// todo
|
46 |
+
"hop_size": 256,
|
47 |
+
"sample_rate": 24000,
|
48 |
+
"n_fft": 1024,
|
49 |
+
// todo
|
50 |
+
"fmin": 0,
|
51 |
+
"fmax": 12000,
|
52 |
+
// todo
|
53 |
+
"f0_min": 50,
|
54 |
+
// ~C2
|
55 |
+
"f0_max": 1100,
|
56 |
+
//1100, // ~C6(1100), ~G5(800)
|
57 |
+
"pitch_bin": 256,
|
58 |
+
"pitch_max": 1100.0,
|
59 |
+
"pitch_min": 50.0,
|
60 |
+
"is_label": true,
|
61 |
+
"is_mu_law": true,
|
62 |
+
"bits": 8,
|
63 |
+
"mel_min_max_stats_dir": "mel_min_max_stats",
|
64 |
+
"whisper_dir": "whisper",
|
65 |
+
"contentvec_dir": "contentvec",
|
66 |
+
"wenet_dir": "wenet",
|
67 |
+
"mert_dir": "mert",
|
68 |
+
// Extract content features using dataloader
|
69 |
+
"pin_memory": true,
|
70 |
+
"num_workers": 8,
|
71 |
+
"content_feature_batch_size": 16,
|
72 |
+
// Features used for model training
|
73 |
+
"use_mel": true,
|
74 |
+
"use_min_max_norm_mel": true,
|
75 |
+
"use_frame_pitch": true,
|
76 |
+
"use_uv": true,
|
77 |
+
"use_frame_energy": true,
|
78 |
+
"use_log_scale_pitch": false,
|
79 |
+
"use_log_scale_energy": false,
|
80 |
+
"use_spkid": true,
|
81 |
+
// Meta file
|
82 |
+
"train_file": "train.json",
|
83 |
+
"valid_file": "test.json",
|
84 |
+
"spk2id": "singers.json",
|
85 |
+
"utt2spk": "utt2singer"
|
86 |
+
},
|
87 |
+
"model": {
|
88 |
+
"condition_encoder": {
|
89 |
+
"merge_mode": "add",
|
90 |
+
"input_melody_dim": 1,
|
91 |
+
"use_log_f0": true,
|
92 |
+
"n_bins_melody": 256,
|
93 |
+
// Quantization (0 means no quantization)
|
94 |
+
"output_melody_dim": 384,
|
95 |
+
"input_loudness_dim": 1,
|
96 |
+
"use_log_loudness": true,
|
97 |
+
"n_bins_loudness": 256,
|
98 |
+
"output_loudness_dim": 384,
|
99 |
+
"use_whisper": false,
|
100 |
+
"use_contentvec": true,
|
101 |
+
"use_wenet": false,
|
102 |
+
"use_mert": false,
|
103 |
+
"whisper_dim": 1024,
|
104 |
+
"contentvec_dim": 256,
|
105 |
+
"mert_dim": 256,
|
106 |
+
"wenet_dim": 512,
|
107 |
+
"content_encoder_dim": 384,
|
108 |
+
"output_singer_dim": 384,
|
109 |
+
"singer_table_size": 512,
|
110 |
+
"output_content_dim": 384,
|
111 |
+
"use_spkid": true
|
112 |
+
},
|
113 |
+
"transformer": {
|
114 |
+
"type": "conformer",
|
115 |
+
// 'conformer' or 'transformer'
|
116 |
+
"input_dim": 384,
|
117 |
+
"output_dim": 100,
|
118 |
+
"n_heads": 2,
|
119 |
+
"n_layers": 6,
|
120 |
+
"filter_channels": 512,
|
121 |
+
"dropout": 0.1,
|
122 |
+
}
|
123 |
+
},
|
124 |
+
"train": {
|
125 |
+
// Basic settings
|
126 |
+
"batch_size": 64,
|
127 |
+
"gradient_accumulation_step": 1,
|
128 |
+
"max_epoch": -1,
|
129 |
+
// -1 means no limit
|
130 |
+
"save_checkpoint_stride": [
|
131 |
+
10,
|
132 |
+
100
|
133 |
+
],
|
134 |
+
// unit is epoch
|
135 |
+
"keep_last": [
|
136 |
+
3,
|
137 |
+
-1
|
138 |
+
],
|
139 |
+
// -1 means infinite; a single number will be broadcast
|
140 |
+
"run_eval": [
|
141 |
+
false,
|
142 |
+
true
|
143 |
+
],
|
144 |
+
// a single value will be broadcast
|
145 |
+
// Fix the random seed
|
146 |
+
"random_seed": 10086,
|
147 |
+
// Batchsampler
|
148 |
+
"sampler": {
|
149 |
+
"holistic_shuffle": true,
|
150 |
+
"drop_last": true
|
151 |
+
},
|
152 |
+
// Dataloader
|
153 |
+
"dataloader": {
|
154 |
+
"num_worker": 32,
|
155 |
+
"pin_memory": true
|
156 |
+
},
|
157 |
+
// Trackers
|
158 |
+
"tracker": [
|
159 |
+
"tensorboard"
|
160 |
+
// "wandb",
|
161 |
+
// "cometml",
|
162 |
+
// "mlflow",
|
163 |
+
],
|
164 |
+
// Optimizer
|
165 |
+
"optimizer": "AdamW",
|
166 |
+
"adamw": {
|
167 |
+
"lr": 4.0e-4
|
168 |
+
// nn model lr
|
169 |
+
},
|
170 |
+
// LR Scheduler
|
171 |
+
"scheduler": "ReduceLROnPlateau",
|
172 |
+
"reducelronplateau": {
|
173 |
+
"factor": 0.8,
|
174 |
+
"patience": 10,
|
175 |
+
// unit is epoch
|
176 |
+
"min_lr": 1.0e-4
|
177 |
+
}
|
178 |
+
}
|
179 |
+
}
|
config/tts.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/base.json",
|
3 |
+
"supported_model_type": [
|
4 |
+
"Fastspeech2",
|
5 |
+
"VITS",
|
6 |
+
"VALLE",
|
7 |
+
"NaturalSpeech2"
|
8 |
+
],
|
9 |
+
"task_type": "tts",
|
10 |
+
"preprocess": {
|
11 |
+
"language": "en-us", // espeak supports 100 languages https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md
|
12 |
+
// linguistic features
|
13 |
+
"extract_phone": true,
|
14 |
+
"phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)"
|
15 |
+
"lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
|
16 |
+
// Directory names of processed data or extracted features
|
17 |
+
"phone_dir": "phones",
|
18 |
+
"use_phone": true,
|
19 |
+
"add_blank": true
|
20 |
+
},
|
21 |
+
"model": {
|
22 |
+
"text_token_num": 512,
|
23 |
+
}
|
24 |
+
|
25 |
+
}
|
config/valle.json
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/tts.json",
|
3 |
+
"model_type": "VALLE",
|
4 |
+
"task_type": "tts",
|
5 |
+
"dataset": [
|
6 |
+
"libritts"
|
7 |
+
],
|
8 |
+
"preprocess": {
|
9 |
+
"extract_phone": true,
|
10 |
+
"phone_extractor": "espeak", // phoneme extractor: espeak, pypinyin, pypinyin_initials_finals or lexicon
|
11 |
+
"extract_acoustic_token": true,
|
12 |
+
"acoustic_token_extractor": "Encodec", // acoustic token extractor: encodec, dac(todo)
|
13 |
+
"acoustic_token_dir": "acoutic_tokens",
|
14 |
+
"use_text": false,
|
15 |
+
"use_phone": true,
|
16 |
+
"use_acoustic_token": true,
|
17 |
+
"symbols_dict": "symbols.dict",
|
18 |
+
"min_duration": 0.5, // lower bound on duration: audio with duration < min_duration is filtered out
|
19 |
+
"max_duration": 14, // upper bound on duration: audio with duration > max_duration is filtered out
|
20 |
+
"sample_rate": 24000,
|
21 |
+
"codec_hop_size": 320
|
22 |
+
},
|
23 |
+
"model": {
|
24 |
+
"text_token_num": 512,
|
25 |
+
"audio_token_num": 1024,
|
26 |
+
"decoder_dim": 1024, // embedding dimension of the decoder model
|
27 |
+
"nhead": 16, // number of attention heads in the decoder layers
|
28 |
+
"num_decoder_layers": 12, // number of decoder layers
|
29 |
+
"norm_first": true, // pre or post Normalization.
|
30 |
+
"add_prenet": false, // whether add PreNet after Inputs
|
31 |
+
"prefix_mode": 0, // mode for how to prefix VALL-E NAR Decoder, 0: no prefix, 1: 0 to random, 2: random to random, 4: chunk of pre or post utterance
|
32 |
+
"share_embedding": true, // share the parameters of the output projection layer with the parameters of the acoustic embedding
|
33 |
+
"nar_scale_factor": 1, // model scale factor which will be assigned different meanings in different models
|
34 |
+
"prepend_bos": false, // whether prepend <BOS> to the acoustic tokens -> AR Decoder inputs
|
35 |
+
"num_quantizers": 8, // number of the audio quantization layers
|
36 |
+
// "scaling_xformers": false, // Apply Reworked Conformer scaling on Transformers
|
37 |
+
},
|
38 |
+
"train": {
|
39 |
+
"use_dynamic_batchsize": false, // If use dynamic batch size
|
40 |
+
"ddp": false,
|
41 |
+
"train_stage": 1, // 0: train all modules, For VALL_E, support 1: AR Decoder 2: NAR Decoder(s)
|
42 |
+
"max_epoch": 20,
|
43 |
+
"optimizer": "AdamW",
|
44 |
+
"scheduler": "cosine",
|
45 |
+
"warmup_steps": 16000, // number of steps that affects how rapidly the learning rate decreases
|
46 |
+
"total_training_steps": 800000,
|
47 |
+
"base_lr": 1e-4, // base learning rate
|
48 |
+
"valid_interval": 1000,
|
49 |
+
"log_epoch_step": 1000,
|
50 |
+
"save_checkpoint_stride": [
|
51 |
+
1,
|
52 |
+
1
|
53 |
+
]
|
54 |
+
}
|
55 |
+
}
|
config/vits.json
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/tts.json",
|
3 |
+
"model_type": "VITS",
|
4 |
+
"task_type": "tts",
|
5 |
+
"preprocess": {
|
6 |
+
"extract_phone": true,
|
7 |
+
"extract_mel": true,
|
8 |
+
"n_mel": 80,
|
9 |
+
"fmin": 0,
|
10 |
+
"fmax": null,
|
11 |
+
"extract_linear_spec": true,
|
12 |
+
"extract_audio": true,
|
13 |
+
"use_linear": true,
|
14 |
+
"use_mel": true,
|
15 |
+
"use_audio": true,
|
16 |
+
"use_text": false,
|
17 |
+
"use_phone": true,
|
18 |
+
"lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
|
19 |
+
"n_fft": 1024,
|
20 |
+
"win_size": 1024,
|
21 |
+
"hop_size": 256,
|
22 |
+
"segment_size": 8192,
|
23 |
+
"text_cleaners": [
|
24 |
+
"english_cleaners"
|
25 |
+
]
|
26 |
+
},
|
27 |
+
"model": {
|
28 |
+
"text_token_num": 512,
|
29 |
+
"inter_channels": 192,
|
30 |
+
"hidden_channels": 192,
|
31 |
+
"filter_channels": 768,
|
32 |
+
"n_heads": 2,
|
33 |
+
"n_layers": 6,
|
34 |
+
"kernel_size": 3,
|
35 |
+
"p_dropout": 0.1,
|
36 |
+
"resblock": "1",
|
37 |
+
"resblock_kernel_sizes": [
|
38 |
+
3,
|
39 |
+
7,
|
40 |
+
11
|
41 |
+
],
|
42 |
+
"resblock_dilation_sizes": [
|
43 |
+
[
|
44 |
+
1,
|
45 |
+
3,
|
46 |
+
5
|
47 |
+
],
|
48 |
+
[
|
49 |
+
1,
|
50 |
+
3,
|
51 |
+
5
|
52 |
+
],
|
53 |
+
[
|
54 |
+
1,
|
55 |
+
3,
|
56 |
+
5
|
57 |
+
]
|
58 |
+
],
|
59 |
+
"upsample_rates": [
|
60 |
+
8,
|
61 |
+
8,
|
62 |
+
2,
|
63 |
+
2
|
64 |
+
],
|
65 |
+
"upsample_initial_channel": 512,
|
66 |
+
"upsample_kernel_sizes": [
|
67 |
+
16,
|
68 |
+
16,
|
69 |
+
4,
|
70 |
+
4
|
71 |
+
],
|
72 |
+
"n_layers_q": 3,
|
73 |
+
"use_spectral_norm": false,
|
74 |
+
"n_speakers": 0, // number of speakers; will be automatically set if n_speakers is 0 and multi_speaker_training is true
|
75 |
+
"gin_channels": 256,
|
76 |
+
"use_sdp": true
|
77 |
+
},
|
78 |
+
"train": {
|
79 |
+
"fp16_run": true,
|
80 |
+
"learning_rate": 2e-4,
|
81 |
+
"betas": [
|
82 |
+
0.8,
|
83 |
+
0.99
|
84 |
+
],
|
85 |
+
"eps": 1e-9,
|
86 |
+
"batch_size": 16,
|
87 |
+
"lr_decay": 0.999875,
|
88 |
+
// "segment_size": 8192,
|
89 |
+
"init_lr_ratio": 1,
|
90 |
+
"warmup_epochs": 0,
|
91 |
+
"c_mel": 45,
|
92 |
+
"c_kl": 1.0,
|
93 |
+
"AdamW": {
|
94 |
+
"betas": [
|
95 |
+
0.8,
|
96 |
+
0.99
|
97 |
+
],
|
98 |
+
"eps": 1e-9,
|
99 |
+
}
|
100 |
+
}
|
101 |
+
}
|
config/vitssvc.json
ADDED
@@ -0,0 +1,306 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/svc/base.json",
|
3 |
+
"model_type": "VITS",
|
4 |
+
"task_type": "svc",
|
5 |
+
"preprocess": {
|
6 |
+
// Config for features extraction
|
7 |
+
"extract_mel": true,
|
8 |
+
"extract_pitch": true,
|
9 |
+
"pitch_extractor": "parselmouth",
|
10 |
+
"extract_energy": true,
|
11 |
+
"extract_uv": true,
|
12 |
+
"extract_linear_spec": true,
|
13 |
+
"extract_audio": true,
|
14 |
+
"mel_min_max_norm": true,
|
15 |
+
// Config for features usage
|
16 |
+
"use_linear": true,
|
17 |
+
"use_mel": true,
|
18 |
+
"use_min_max_norm_mel": false,
|
19 |
+
"use_audio": true,
|
20 |
+
"use_frame_pitch": true,
|
21 |
+
"use_uv": true,
|
22 |
+
"use_spkid": true,
|
23 |
+
"use_contentvec": false,
|
24 |
+
"use_whisper": false,
|
25 |
+
"use_wenet": false,
|
26 |
+
"use_text": false,
|
27 |
+
"use_phone": false,
|
28 |
+
"fmin": 0,
|
29 |
+
"fmax": 12000,
|
30 |
+
"f0_min": 50,
|
31 |
+
"f0_max": 1100,
|
32 |
+
// f0_bin in sovits
|
33 |
+
"pitch_bin": 256,
|
34 |
+
// filter_length in sovits
|
35 |
+
"n_fft": 1024,
|
36 |
+
// hop_length in sovits
|
37 |
+
"hop_size": 256,
|
38 |
+
// win_length in sovits
|
39 |
+
"win_size": 1024,
|
40 |
+
"segment_size": 8192,
|
41 |
+
"n_mel": 100,
|
42 |
+
"sample_rate": 24000,
|
43 |
+
"mel_min_max_stats_dir": "mel_min_max_stats",
|
44 |
+
"whisper_dir": "whisper",
|
45 |
+
"contentvec_dir": "contentvec",
|
46 |
+
"wenet_dir": "wenet",
|
47 |
+
"mert_dir": "mert",
|
48 |
+
// Meta file
|
49 |
+
"train_file": "train.json",
|
50 |
+
"valid_file": "test.json",
|
51 |
+
"spk2id": "singers.json",
|
52 |
+
"utt2spk": "utt2singer"
|
53 |
+
},
|
54 |
+
"model": {
|
55 |
+
"condition_encoder": {
|
56 |
+
"merge_mode": "add",
|
57 |
+
"input_melody_dim": 1,
|
58 |
+
"use_log_f0": true,
|
59 |
+
"n_bins_melody": 256,
|
60 |
+
"output_melody_dim": 384,
|
61 |
+
"input_loudness_dim": 1,
|
62 |
+
"use_log_loudness": true,
|
63 |
+
"n_bins_loudness": 256,
|
64 |
+
"output_loudness_dim": 384,
|
65 |
+
"use_whisper": false,
|
66 |
+
"use_contentvec": false,
|
67 |
+
"use_wenet": false,
|
68 |
+
"use_mert": false,
|
69 |
+
"whisper_dim": 1024,
|
70 |
+
"contentvec_dim": 256,
|
71 |
+
"mert_dim": 256,
|
72 |
+
"wenet_dim": 512,
|
73 |
+
"content_encoder_dim": 384,
|
74 |
+
"singer_table_size": 512,
|
75 |
+
"output_singer_dim": 384,
|
76 |
+
"output_content_dim": 384,
|
77 |
+
"use_spkid": true,
|
78 |
+
"pitch_max": 1100.0,
|
79 |
+
"pitch_min": 50.0,
|
80 |
+
},
|
81 |
+
"vits": {
|
82 |
+
"filter_channels": 256,
|
83 |
+
"gin_channels": 256,
|
84 |
+
"hidden_channels": 384,
|
85 |
+
"inter_channels": 384,
|
86 |
+
"kernel_size": 3,
|
87 |
+
"n_flow_layer": 4,
|
88 |
+
"n_heads": 2,
|
89 |
+
"n_layers": 6,
|
90 |
+
"n_layers_q": 3,
|
91 |
+
"n_speakers": 512,
|
92 |
+
"p_dropout": 0.1,
|
93 |
+
"use_spectral_norm": false,
|
94 |
+
},
|
95 |
+
"generator": "hifigan",
|
96 |
+
"generator_config": {
|
97 |
+
"hifigan": {
|
98 |
+
"resblock": "1",
|
99 |
+
"resblock_kernel_sizes": [
|
100 |
+
3,
|
101 |
+
7,
|
102 |
+
11
|
103 |
+
],
|
104 |
+
"upsample_rates": [
|
105 |
+
8,
|
106 |
+
8,
|
107 |
+
2,
|
108 |
+
2
|
109 |
+
],
|
110 |
+
"upsample_kernel_sizes": [
|
111 |
+
16,
|
112 |
+
16,
|
113 |
+
4,
|
114 |
+
4
|
115 |
+
],
|
116 |
+
"upsample_initial_channel": 512,
|
117 |
+
"resblock_dilation_sizes": [
|
118 |
+
[
|
119 |
+
1,
|
120 |
+
3,
|
121 |
+
5
|
122 |
+
],
|
123 |
+
[
|
124 |
+
1,
|
125 |
+
3,
|
126 |
+
5
|
127 |
+
],
|
128 |
+
[
|
129 |
+
1,
|
130 |
+
3,
|
131 |
+
5
|
132 |
+
]
|
133 |
+
]
|
134 |
+
},
|
135 |
+
"melgan": {
|
136 |
+
"ratios": [
|
137 |
+
8,
|
138 |
+
8,
|
139 |
+
2,
|
140 |
+
2
|
141 |
+
],
|
142 |
+
"ngf": 32,
|
143 |
+
"n_residual_layers": 3,
|
144 |
+
"num_D": 3,
|
145 |
+
"ndf": 16,
|
146 |
+
"n_layers": 4,
|
147 |
+
"downsampling_factor": 4
|
148 |
+
},
|
149 |
+
"bigvgan": {
|
150 |
+
"resblock": "1",
|
151 |
+
"activation": "snakebeta",
|
152 |
+
"snake_logscale": true,
|
153 |
+
"upsample_rates": [
|
154 |
+
8,
|
155 |
+
8,
|
156 |
+
2,
|
157 |
+
2
|
158 |
+
],
|
159 |
+
"upsample_kernel_sizes": [
|
160 |
+
16,
|
161 |
+
16,
|
162 |
+
4,
|
163 |
+
4
|
164 |
+
],
|
165 |
+
"upsample_initial_channel": 512,
|
166 |
+
"resblock_kernel_sizes": [
|
167 |
+
3,
|
168 |
+
7,
|
169 |
+
11
|
170 |
+
],
|
171 |
+
"resblock_dilation_sizes": [
|
172 |
+
[
|
173 |
+
1,
|
174 |
+
3,
|
175 |
+
5
|
176 |
+
],
|
177 |
+
[
|
178 |
+
1,
|
179 |
+
3,
|
180 |
+
5
|
181 |
+
],
|
182 |
+
[
|
183 |
+
1,
|
184 |
+
3,
|
185 |
+
5
|
186 |
+
]
|
187 |
+
]
|
188 |
+
},
|
189 |
+
"nsfhifigan": {
|
190 |
+
"resblock": "1",
|
191 |
+
"harmonic_num": 8,
|
192 |
+
"upsample_rates": [
|
193 |
+
8,
|
194 |
+
8,
|
195 |
+
2,
|
196 |
+
2
|
197 |
+
],
|
198 |
+
"upsample_kernel_sizes": [
|
199 |
+
16,
|
200 |
+
16,
|
201 |
+
4,
|
202 |
+
4
|
203 |
+
],
|
204 |
+
"upsample_initial_channel": 768,
|
205 |
+
"resblock_kernel_sizes": [
|
206 |
+
3,
|
207 |
+
7,
|
208 |
+
11
|
209 |
+
],
|
210 |
+
"resblock_dilation_sizes": [
|
211 |
+
[
|
212 |
+
1,
|
213 |
+
3,
|
214 |
+
5
|
215 |
+
],
|
216 |
+
[
|
217 |
+
1,
|
218 |
+
3,
|
219 |
+
5
|
220 |
+
],
|
221 |
+
[
|
222 |
+
1,
|
223 |
+
3,
|
224 |
+
5
|
225 |
+
]
|
226 |
+
]
|
227 |
+
},
|
228 |
+
"apnet": {
|
229 |
+
"ASP_channel": 512,
|
230 |
+
"ASP_resblock_kernel_sizes": [
|
231 |
+
3,
|
232 |
+
7,
|
233 |
+
11
|
234 |
+
],
|
235 |
+
"ASP_resblock_dilation_sizes": [
|
236 |
+
[
|
237 |
+
1,
|
238 |
+
3,
|
239 |
+
5
|
240 |
+
],
|
241 |
+
[
|
242 |
+
1,
|
243 |
+
3,
|
244 |
+
5
|
245 |
+
],
|
246 |
+
[
|
247 |
+
1,
|
248 |
+
3,
|
249 |
+
5
|
250 |
+
]
|
251 |
+
],
|
252 |
+
"ASP_input_conv_kernel_size": 7,
|
253 |
+
"ASP_output_conv_kernel_size": 7,
|
254 |
+
"PSP_channel": 512,
|
255 |
+
"PSP_resblock_kernel_sizes": [
|
256 |
+
3,
|
257 |
+
7,
|
258 |
+
11
|
259 |
+
],
|
260 |
+
"PSP_resblock_dilation_sizes": [
|
261 |
+
[
|
262 |
+
1,
|
263 |
+
3,
|
264 |
+
5
|
265 |
+
],
|
266 |
+
[
|
267 |
+
1,
|
268 |
+
3,
|
269 |
+
5
|
270 |
+
],
|
271 |
+
[
|
272 |
+
1,
|
273 |
+
3,
|
274 |
+
5
|
275 |
+
]
|
276 |
+
],
|
277 |
+
"PSP_input_conv_kernel_size": 7,
|
278 |
+
"PSP_output_R_conv_kernel_size": 7,
|
279 |
+
"PSP_output_I_conv_kernel_size": 7,
|
280 |
+
}
|
281 |
+
},
|
282 |
+
},
|
283 |
+
"train": {
|
284 |
+
"fp16_run": true,
|
285 |
+
"learning_rate": 2e-4,
|
286 |
+
"betas": [
|
287 |
+
0.8,
|
288 |
+
0.99
|
289 |
+
],
|
290 |
+
"eps": 1e-9,
|
291 |
+
"batch_size": 16,
|
292 |
+
"lr_decay": 0.999875,
|
293 |
+
// "segment_size": 8192,
|
294 |
+
"init_lr_ratio": 1,
|
295 |
+
"warmup_epochs": 0,
|
296 |
+
"c_mel": 45,
|
297 |
+
"c_kl": 1.0,
|
298 |
+
"AdamW": {
|
299 |
+
"betas": [
|
300 |
+
0.8,
|
301 |
+
0.99
|
302 |
+
],
|
303 |
+
"eps": 1e-9,
|
304 |
+
}
|
305 |
+
}
|
306 |
+
}
|
config/vocoder.json
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/base.json",
|
3 |
+
"dataset": [
|
4 |
+
"LJSpeech",
|
5 |
+
"LibriTTS",
|
6 |
+
"opencpop",
|
7 |
+
"m4singer",
|
8 |
+
"svcc",
|
9 |
+
"svcceval",
|
10 |
+
"pjs",
|
11 |
+
"opensinger",
|
12 |
+
"popbutfy",
|
13 |
+
"nus48e",
|
14 |
+
"popcs",
|
15 |
+
"kising",
|
16 |
+
"csd",
|
17 |
+
"opera",
|
18 |
+
"vctk",
|
19 |
+
"lijian",
|
20 |
+
"cdmusiceval"
|
21 |
+
],
|
22 |
+
"task_type": "vocoder",
|
23 |
+
"preprocess": {
|
24 |
+
// acoustic features
|
25 |
+
"extract_mel": true,
|
26 |
+
"extract_pitch": false,
|
27 |
+
"extract_uv": false,
|
28 |
+
"extract_audio": true,
|
29 |
+
"extract_label": false,
|
30 |
+
"extract_one_hot": false,
|
31 |
+
"extract_amplitude_phase": false,
|
32 |
+
"pitch_extractor": "parselmouth",
|
33 |
+
// Settings for data preprocessing
|
34 |
+
"n_mel": 100,
|
35 |
+
"win_size": 1024,
|
36 |
+
"hop_size": 256,
|
37 |
+
"sample_rate": 24000,
|
38 |
+
"n_fft": 1024,
|
39 |
+
"fmin": 0,
|
40 |
+
"fmax": 12000,
|
41 |
+
"f0_min": 50,
|
42 |
+
"f0_max": 1100,
|
43 |
+
"pitch_bin": 256,
|
44 |
+
"pitch_max": 1100.0,
|
45 |
+
"pitch_min": 50.0,
|
46 |
+
"is_mu_law": false,
|
47 |
+
"bits": 8,
|
48 |
+
"cut_mel_frame": 32,
|
49 |
+
// Directory names of processed data or extracted features
|
50 |
+
"spk2id": "singers.json",
|
51 |
+
// Features used for model training
|
52 |
+
"use_mel": true,
|
53 |
+
"use_frame_pitch": false,
|
54 |
+
"use_uv": false,
|
55 |
+
"use_audio": true,
|
56 |
+
"use_label": false,
|
57 |
+
"use_one_hot": false,
|
58 |
+
"train_file": "train.json",
|
59 |
+
"valid_file": "test.json"
|
60 |
+
},
|
61 |
+
"train": {
|
62 |
+
"random_seed": 114514,
|
63 |
+
"batch_size": 64,
|
64 |
+
"gradient_accumulation_step": 1,
|
65 |
+
"max_epoch": 1000000,
|
66 |
+
"save_checkpoint_stride": [
|
67 |
+
20
|
68 |
+
],
|
69 |
+
"run_eval": [
|
70 |
+
true
|
71 |
+
],
|
72 |
+
"sampler": {
|
73 |
+
"holistic_shuffle": true,
|
74 |
+
"drop_last": true
|
75 |
+
},
|
76 |
+
"dataloader": {
|
77 |
+
"num_worker": 16,
|
78 |
+
"pin_memory": true
|
79 |
+
},
|
80 |
+
"tracker": [
|
81 |
+
"tensorboard"
|
82 |
+
],
|
83 |
+
}
|
84 |
+
}
|
egs/codec/FAcodec/README.md
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# FAcodec
|
2 |
+
|
3 |
+
PyTorch implementation for the training of FAcodec, which was proposed in the paper [NaturalSpeech 3: Zero-Shot Speech Synthesis
|
4 |
+
with Factorized Codec and Diffusion Models](https://arxiv.org/pdf/2403.03100)
|
5 |
+
|
6 |
+
A dedicated repository for the FAcodec model can also be found [here](https://github.com/Plachtaa/FAcodec).
|
7 |
+
|
8 |
+
This implementation made some key improvements to the training pipeline, so that the requirements of any form of annotations, including
|
9 |
+
transcripts, phoneme alignments, and speaker labels, are eliminated. All you need are simply raw speech files.
|
10 |
+
With the new training pipeline, it is possible to train the model on more languages with more diverse timbre distributions.
|
11 |
+
We release the code for training and inference, including a pretrained checkpoint on 50k hours speech data with over 1 million speakers.
|
12 |
+
|
13 |
+
## Model storage
|
14 |
+
We provide pretrained checkpoints on 50k hours speech data.
|
15 |
+
|
16 |
+
| Model type | Link |
|
17 |
+
|-------------------|----------------------------------------------------------------------------------------------------------------------------------------|
|
18 |
+
| FAcodec | [![Hugging Face](https://img.shields.io/badge/🤗%20Hugging%20Face-FAcodec-blue)](https://huggingface.co/Plachta/FAcodec) |
|
19 |
+
|
20 |
+
## Demo
|
21 |
+
Try our model on [![Hugging Face](https://img.shields.io/badge/🤗%20Hugging%20Face-Space-blue)](https://huggingface.co/spaces/Plachta/FAcodecV2)!
|
22 |
+
|
23 |
+
## Training
|
24 |
+
Prepare your data and put it under one folder; the internal file structure does not matter.
|
25 |
+
Then, change the `dataset` in `./egs/codec/FAcodec/exp_custom_data.json` to the path of your data folder.
|
26 |
+
Finally, run the following command:
|
27 |
+
```bash
|
28 |
+
sh ./egs/codec/FAcodec/train.sh
|
29 |
+
```
|
30 |
+
|
31 |
+
## Inference
|
32 |
+
To reconstruct a speech file, run:
|
33 |
+
```bash
|
34 |
+
python ./bins/codec/inference.py --source <source_wav> --output_dir <output_dir> --checkpoint_path <checkpoint_path>
|
35 |
+
```
|
36 |
+
To use zero-shot voice conversion, run:
|
37 |
+
```bash
|
38 |
+
python ./bins/codec/inference.py --source <source_wav> --reference <reference_wav> --output_dir <output_dir> --checkpoint_path <checkpoint_path>
|
39 |
+
```
|
40 |
+
|
41 |
+
## Feature extraction
|
42 |
+
When running `./bins/codec/inference.py`, check the returned results of the `FAcodecInference` class: a tuple of `(quantized, codes)`
|
43 |
+
- `quantized` is the quantized representation of the input speech file.
|
44 |
+
- `quantized[0]` is the quantized representation of prosody
|
45 |
+
- `quantized[1]` is the quantized representation of content
|
46 |
+
|
47 |
+
- `codes` is the discrete code representation of the input speech file.
|
48 |
+
- `codes[0]` is the discrete code representation of prosody
|
49 |
+
- `codes[1]` is the discrete code representation of content
|
50 |
+
|
51 |
+
For the cleanest content representation without any timbre, we suggest using `codes[1][:, 0, :]`, which is the first layer of content codebooks.
|
egs/codec/FAcodec/exp_custom_data.json
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"exp_name": "facodec",
|
3 |
+
"model_type": "FAcodec",
|
4 |
+
|
5 |
+
"log_dir": "./runs/",
|
6 |
+
"log_interval": 10,
|
7 |
+
"save_interval": 1000,
|
8 |
+
"device": "cuda",
|
9 |
+
"epochs": 1000,
|
10 |
+
"batch_size": 4,
|
11 |
+
"batch_length": 100,
|
12 |
+
"max_len": 80,
|
13 |
+
"pretrained_model": "",
|
14 |
+
"load_only_params": false,
|
15 |
+
"F0_path": "modules/JDC/bst.t7",
|
16 |
+
"dataset": "/path/to/dataset",
|
17 |
+
"preprocess_params": {
|
18 |
+
"sr": 24000,
|
19 |
+
"frame_rate": 80,
|
20 |
+
"duration_range": [1.0, 25.0],
|
21 |
+
"spect_params": {
|
22 |
+
"n_fft": 2048,
|
23 |
+
"win_length": 1200,
|
24 |
+
"hop_length": 300,
|
25 |
+
"n_mels": 80
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"train": {
|
29 |
+
"gradient_accumulation_step": 1,
|
30 |
+
"batch_size": 1,
|
31 |
+
"save_checkpoint_stride": [
|
32 |
+
20
|
33 |
+
],
|
34 |
+
"random_seed": 1234,
|
35 |
+
"max_epoch": -1,
|
36 |
+
"max_frame_len": 80,
|
37 |
+
"tracker": [
|
38 |
+
"tensorboard"
|
39 |
+
],
|
40 |
+
"run_eval": [
|
41 |
+
false
|
42 |
+
],
|
43 |
+
"sampler": {
|
44 |
+
"holistic_shuffle": true,
|
45 |
+
"drop_last": true
|
46 |
+
},
|
47 |
+
"dataloader": {
|
48 |
+
"num_worker": 0,
|
49 |
+
"pin_memory": true
|
50 |
+
}
|
51 |
+
},
|
52 |
+
"model_params": {
|
53 |
+
"causal": true,
|
54 |
+
"lstm": 2,
|
55 |
+
"norm_f0": true,
|
56 |
+
"use_gr_content_f0": false,
|
57 |
+
"use_gr_prosody_phone": false,
|
58 |
+
"use_gr_timbre_prosody": false,
|
59 |
+
"separate_prosody_encoder": true,
|
60 |
+
"n_c_codebooks": 2,
|
61 |
+
"timbre_norm": true,
|
62 |
+
"use_gr_content_global_f0": true,
|
63 |
+
"DAC": {
|
64 |
+
"encoder_dim": 64,
|
65 |
+
"encoder_rates": [2, 5, 5, 6],
|
66 |
+
"decoder_dim": 1536,
|
67 |
+
"decoder_rates": [6, 5, 5, 2],
|
68 |
+
"sr": 24000
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"loss_params": {
|
72 |
+
"base_lr": 0.0001,
|
73 |
+
"warmup_steps": 200,
|
74 |
+
"discriminator_iter_start": 2000,
|
75 |
+
"lambda_spk": 1.0,
|
76 |
+
"lambda_mel": 45,
|
77 |
+
"lambda_f0": 1.0,
|
78 |
+
"lambda_uv": 1.0
|
79 |
+
}
|
80 |
+
}
|
egs/codec/FAcodec/train.sh
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Training launcher for the FAcodec recipe.
#
# Run from the Amphion root folder (work_dir is "./"), e.g.:
#   sh ./egs/codec/FAcodec/train.sh
#
# Inputs:
#   exp_config (environment variable, optional) - path to the experiment
#       config file; defaults to this recipe's exp_custom_data.json.
#   $1 (optional) - one extra argument forwarded to bins/codec/train.py.
export PYTHONPATH="./"

######## Build Experiment Environment ###########
# Must match the directory this recipe actually lives in (egs/codec, not egs/codecs).
exp_dir="./egs/codec/FAcodec"
echo "exp_dir: $exp_dir"
work_dir="./" # Amphion root folder
echo "work_dir: $work_dir"

export WORK_DIR="$work_dir"
export PYTHONPATH="$work_dir"
export PYTHONIOENCODING=UTF-8

######## Set Config File Dir ##############
# Fall back to the config shipped with this recipe when the caller did not
# provide one through the exp_config environment variable.
if [ -z "$exp_config" ]; then
    exp_config="${exp_dir}/exp_custom_data.json"
fi
echo "Exprimental Configuration File: $exp_config"

######## Set the experiment name ##########
exp_name="facodec"

port=53333 # a random number for port

######## Train Model ###########
echo "Experiment Name: $exp_name"
# $1 is intentionally left unquoted: when no extra argument is given it must
# expand to nothing rather than to an empty-string argument.
accelerate launch --main_process_port "$port" "${work_dir}/bins/codec/train.py" --config "$exp_config" \
    --exp_name "$exp_name" --log_level debug $1
|
egs/datasets/README.md
ADDED
@@ -0,0 +1,458 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Datasets Format
|
2 |
+
|
3 |
+
Amphion supports the following academic datasets (sorted alphabetically):
|
4 |
+
|
5 |
+
- [Datasets Format](#datasets-format)
|
6 |
+
- [AudioCaps](#audiocaps)
|
7 |
+
- [CSD](#csd)
|
8 |
+
- [CustomSVCDataset](#customsvcdataset)
|
9 |
+
- [Hi-Fi TTS](#hi-fi-tts)
|
10 |
+
- [KiSing](#kising)
|
11 |
+
- [LibriLight](#librilight)
|
12 |
+
- [LibriTTS](#libritts)
|
13 |
+
- [LJSpeech](#ljspeech)
|
14 |
+
- [M4Singer](#m4singer)
|
15 |
+
- [NUS-48E](#nus-48e)
|
16 |
+
- [Opencpop](#opencpop)
|
17 |
+
- [OpenSinger](#opensinger)
|
18 |
+
- [Opera](#opera)
|
19 |
+
- [PopBuTFy](#popbutfy)
|
20 |
+
- [PopCS](#popcs)
|
21 |
+
- [PJS](#pjs)
|
22 |
+
- [SVCC](#svcc)
|
23 |
+
- [VCTK](#vctk)
|
24 |
+
|
25 |
+
The download link and the file structure tree of each dataset are shown below.
|
26 |
+
|
27 |
+
> **Note:** When using Docker to run Amphion, you must mount the dataset into the container after downloading it. Check [Mount dataset in Docker container](./docker.md) for more details.
|
28 |
+
|
29 |
+
## AudioCaps
|
30 |
+
|
31 |
+
AudioCaps is a dataset of around 44K audio-caption pairs, where each audio clip corresponds to a caption with rich semantic information.
|
32 |
+
|
33 |
+
Download AudioCaps dataset [here](https://github.com/cdjkim/audiocaps). The file structure looks like below:
|
34 |
+
|
35 |
+
```plaintext
|
36 |
+
[AudioCaps dataset path]
|
37 |
+
┣ AudioCpas
|
38 |
+
┃ ┣ wav
|
39 |
+
┃ ┃ ┣ ---1_cCGK4M_0_10000.wav
|
40 |
+
┃ ┃ ┣ ---lTs1dxhU_30000_40000.wav
|
41 |
+
┃ ┃ ┣ ...
|
42 |
+
```
|
43 |
+
|
44 |
+
## CSD
|
45 |
+
|
46 |
+
Download the official CSD dataset [here](https://zenodo.org/records/4785016). The file structure looks like below:
|
47 |
+
|
48 |
+
```plaintext
|
49 |
+
[CSD dataset path]
|
50 |
+
┣ english
|
51 |
+
┣ korean
|
52 |
+
┣ utterances
|
53 |
+
┃ ┣ en001a
|
54 |
+
┃ ┃ ┣ {UtteranceID}.wav
|
55 |
+
┃ ┣ en001b
|
56 |
+
┃ ┣ en002a
|
57 |
+
┃ ┣ en002b
|
58 |
+
┃ ┣ ...
|
59 |
+
┣ README
|
60 |
+
```
|
61 |
+
|
62 |
+
## CustomSVCDataset
|
63 |
+
|
64 |
+
We support custom dataset for Singing Voice Conversion. Organize your data in the following structure to construct your own dataset:
|
65 |
+
|
66 |
+
```plaintext
|
67 |
+
[Your Custom Dataset Path]
|
68 |
+
┣ singer1
|
69 |
+
┃ ┣ song1
|
70 |
+
┃ ┃ ┣ utterance1.wav
|
71 |
+
┃ ┃ ┣ utterance2.wav
|
72 |
+
┃ ┃ ┣ ...
|
73 |
+
┃ ┣ song2
|
74 |
+
┃ ┣ ...
|
75 |
+
┣ singer2
|
76 |
+
┣ ...
|
77 |
+
```
|
78 |
+
|
79 |
+
|
80 |
+
## Hi-Fi TTS
|
81 |
+
|
82 |
+
Download the official Hi-Fi TTS dataset [here](https://www.openslr.org/109/). The file structure looks like below:
|
83 |
+
|
84 |
+
```plaintext
|
85 |
+
[Hi-Fi TTS dataset path]
|
86 |
+
┣ audio
|
87 |
+
┃ ┣ 11614_other {Speaker_ID}_{SNR_subset}
|
88 |
+
┃ ┃ ┣ 10547 {Book_ID}
|
89 |
+
┃ ┃ ┃ ┣ thousandnights8_04_anonymous_0001.flac
|
90 |
+
┃ ┃ ┃ ┣ thousandnights8_04_anonymous_0003.flac
|
91 |
+
┃ ┃ ┃ ┣ thousandnights8_04_anonymous_0004.flac
|
92 |
+
┃ ┃ ┃ ┣ ...
|
93 |
+
┃ ┃ ┣ ...
|
94 |
+
┃ ┣ ...
|
95 |
+
┣ 92_manifest_clean_dev.json
|
96 |
+
┣ 92_manifest_clean_test.json
|
97 |
+
┣ 92_manifest_clean_train.json
|
98 |
+
┣ ...
|
99 |
+
┣ {Speaker_ID}_manifest_{SNR_subset}_{dataset_split}.json
|
100 |
+
┣ ...
|
101 |
+
┣ books_bandwidth.tsv
|
102 |
+
┣ LICENSE.txt
|
103 |
+
┣ readers_books_clean.txt
|
104 |
+
┣ readers_books_other.txt
|
105 |
+
┣ README.txt
|
106 |
+
|
107 |
+
```
|
108 |
+
|
109 |
+
## KiSing
|
110 |
+
|
111 |
+
Download the official KiSing dataset [here](http://shijt.site/index.php/2021/05/16/kising-the-first-open-source-mandarin-singing-voice-synthesis-corpus/). The file structure looks like below:
|
112 |
+
|
113 |
+
```plaintext
|
114 |
+
[KiSing dataset path]
|
115 |
+
┣ clean
|
116 |
+
┃ ┣ 421
|
117 |
+
┃ ┣ 422
|
118 |
+
┃ ┣ ...
|
119 |
+
```
|
120 |
+
|
121 |
+
## LibriLight
|
122 |
+
|
123 |
+
Download the official LibriLight dataset [here](https://github.com/facebookresearch/libri-light). The file structure looks like below:
|
124 |
+
|
125 |
+
```plaintext
|
126 |
+
[LibriLight dataset path]
|
127 |
+
┣ small (Subset)
|
128 |
+
┃ ┣ 100 {Speaker_ID}
|
129 |
+
┃ ┃ ┣ sea_fairies_0812_librivox_64kb_mp3 {Chapter_ID}
|
130 |
+
┃ ┃ ┃ ┣ 01_baum_sea_fairies_64kb.flac
|
131 |
+
┃ ┃ ┃ ┣ 02_baum_sea_fairies_64kb.flac
|
132 |
+
┃ ┃ ┃ ┣ 03_baum_sea_fairies_64kb.flac
|
133 |
+
┃ ┃ ┃ ┣ 22_baum_sea_fairies_64kb.flac
|
134 |
+
┃ ┃ ┃ ┣ 01_baum_sea_fairies_64kb.json
|
135 |
+
┃ ┃ ┃ ┣ 02_baum_sea_fairies_64kb.json
|
136 |
+
┃ ┃ ┃ ┣ 03_baum_sea_fairies_64kb.json
|
137 |
+
┃ ┃ ┃ ┣ 22_baum_sea_fairies_64kb.json
|
138 |
+
┃ ┃ ┃ ┣ ...
|
139 |
+
┃ ┃ ┣ ...
|
140 |
+
┃ ┣ ...
|
141 |
+
┣ medium (Subset)
|
142 |
+
┣ ...
|
143 |
+
```
|
144 |
+
|
145 |
+
## LibriTTS
|
146 |
+
|
147 |
+
Download the official LibriTTS dataset [here](https://www.openslr.org/60/). The file structure looks like below:
|
148 |
+
|
149 |
+
```plaintext
|
150 |
+
[LibriTTS dataset path]
|
151 |
+
┣ BOOKS.txt
|
152 |
+
┣ CHAPTERS.txt
|
153 |
+
┣ eval_sentences10.tsv
|
154 |
+
┣ LICENSE.txt
|
155 |
+
┣ NOTE.txt
|
156 |
+
┣ reader_book.tsv
|
157 |
+
┣ README_librispeech.txt
|
158 |
+
┣ README_libritts.txt
|
159 |
+
┣ speakers.tsv
|
160 |
+
┣ SPEAKERS.txt
|
161 |
+
┣ dev-clean (Subset)
|
162 |
+
┃ ┣ 1272 {Speaker_ID}
|
163 |
+
┃ ┃ ┣ 128104 {Chapter_ID}
|
164 |
+
┃ ┃ ┃ ┣ 1272_128104_000001_000000.normalized.txt
|
165 |
+
┃ ┃ ┃ ┣ 1272_128104_000001_000000.original.txt
|
166 |
+
┃ ┃ ┃ ┣ 1272_128104_000001_000000.wav
|
167 |
+
┃ ┃ ┃ ┣ ...
|
168 |
+
┃ ┃ ┃ ┣ 1272_128104.book.tsv
|
169 |
+
┃ ┃ ┃ ┣ 1272_128104.trans.tsv
|
170 |
+
┃ ┃ ┣ ...
|
171 |
+
┃ ┣ ...
|
172 |
+
┣ dev-other (Subset)
|
173 |
+
┃ ┣ 116 {Speaker_ID}
|
174 |
+
┃ ┃ ┣ 288045 {Chapter_ID}
|
175 |
+
┃ ┃ ┃ ┣ 116_288045_000003_000000.normalized.txt
|
176 |
+
┃ ┃ ┃ ┣ 116_288045_000003_000000.original.txt
|
177 |
+
┃ ┃ ┃ ┣ 116_288045_000003_000000.wav
|
178 |
+
┃ ┃ ┃ ┣ ...
|
179 |
+
┃ ┃ ┃ ┣ 116_288045.book.tsv
|
180 |
+
┃ ┃ ┃ ┣ 116_288045.trans.tsv
|
181 |
+
┃ ┃ ┣ ...
|
182 |
+
┃ ┣ ...
|
183 |
+
┃ ┣ ...
|
184 |
+
┣ test-clean (Subset)
|
185 |
+
┃ ┣ {Speaker_ID}
|
186 |
+
┃ ┃ ┣ {Chapter_ID}
|
187 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
|
188 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
|
189 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
|
190 |
+
┃ ┃ ┃ ┣ ...
|
191 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
|
192 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
|
193 |
+
┃ ┃ ┣ ...
|
194 |
+
┃ ┣ ...
|
195 |
+
┣ test-other
|
196 |
+
┃ ┣ {Speaker_ID}
|
197 |
+
┃ ┃ ┣ {Chapter_ID}
|
198 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
|
199 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
|
200 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
|
201 |
+
┃ ┃ ┃ ┣ ...
|
202 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
|
203 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
|
204 |
+
┃ ┃ ┣ ...
|
205 |
+
┃ ┣ ...
|
206 |
+
┣ train-clean-100
|
207 |
+
┃ ┣ {Speaker_ID}
|
208 |
+
┃ ┃ ┣ {Chapter_ID}
|
209 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
|
210 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
|
211 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
|
212 |
+
┃ ┃ ┃ ┣ ...
|
213 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
|
214 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
|
215 |
+
┃ ┃ ┣ ...
|
216 |
+
┃ ┣ ...
|
217 |
+
┣ train-clean-360
|
218 |
+
┃ ┣ {Speaker_ID}
|
219 |
+
┃ ┃ ┣ {Chapter_ID}
|
220 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
|
221 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
|
222 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
|
223 |
+
┃ ┃ ┃ ┣ ...
|
224 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
|
225 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
|
226 |
+
┃ ┃ ┣ ...
|
227 |
+
┃ ┣ ...
|
228 |
+
┣ train-other-500
|
229 |
+
┃ ┣ {Speaker_ID}
|
230 |
+
┃ ┃ ┣ {Chapter_ID}
|
231 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
|
232 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
|
233 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
|
234 |
+
┃ ┃ ┃ ┣ ...
|
235 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
|
236 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
|
237 |
+
┃ ┃ ┣ ...
|
238 |
+
┃ ┣ ...
|
239 |
+
```
|
240 |
+
|
241 |
+
## LJSpeech
|
242 |
+
|
243 |
+
Download the official LJSpeech dataset [here](https://keithito.com/LJ-Speech-Dataset/). The file structure looks like below:
|
244 |
+
|
245 |
+
```plaintext
|
246 |
+
[LJSpeech dataset path]
|
247 |
+
┣ metadata.csv
|
248 |
+
┣ wavs
|
249 |
+
┃ ┣ LJ001-0001.wav
|
250 |
+
┃ ┣ LJ001-0002.wav
|
251 |
+
┃ ┣ ...
|
252 |
+
┣ README
|
253 |
+
```
|
254 |
+
|
255 |
+
## M4Singer
|
256 |
+
|
257 |
+
Download the official M4Singer dataset [here](https://drive.google.com/file/d/1xC37E59EWRRFFLdG3aJkVqwtLDgtFNqW/view). The file structure looks like below:
|
258 |
+
|
259 |
+
```plaintext
|
260 |
+
[M4Singer dataset path]
|
261 |
+
┣ {Singer_1}#{Song_1}
|
262 |
+
┃ ┣ 0000.mid
|
263 |
+
┃ ┣ 0000.TextGrid
|
264 |
+
┃ ┣ 0000.wav
|
265 |
+
┃ ┣ ...
|
266 |
+
┣ {Singer_1}#{Song_2}
|
267 |
+
┣ ...
|
268 |
+
┣ {Singer_2}#{Song_1}
|
269 |
+
┣ {Singer_2}#{Song_2}
|
270 |
+
┣ ...
|
271 |
+
┗ meta.json
|
272 |
+
```
|
273 |
+
|
274 |
+
## NUS-48E
|
275 |
+
|
276 |
+
Download the official NUS-48E dataset [here](https://drive.google.com/drive/folders/12pP9uUl0HTVANU3IPLnumTJiRjPtVUMx). The file structure looks like below:
|
277 |
+
|
278 |
+
```plaintext
|
279 |
+
[NUS-48E dataset path]
|
280 |
+
┣ {SpeakerID}
|
281 |
+
┃ ┣ read
|
282 |
+
┃ ┃ ┣ {SongID}.txt
|
283 |
+
┃ ┃ ┣ {SongID}.wav
|
284 |
+
┃ ┃ ┣ ...
|
285 |
+
┃ ┣ sing
|
286 |
+
┃ ┃ ┣ {SongID}.txt
|
287 |
+
┃ ┃ ┣ {SongID}.wav
|
288 |
+
┃ ┃ ┣ ...
|
289 |
+
┣ ...
|
290 |
+
┣ README.txt
|
291 |
+
|
292 |
+
```
|
293 |
+
|
294 |
+
## Opencpop
|
295 |
+
|
296 |
+
Download the official Opencpop dataset [here](https://wenet.org.cn/opencpop/). The file structure looks like below:
|
297 |
+
|
298 |
+
```plaintext
|
299 |
+
[Opencpop dataset path]
|
300 |
+
┣ midis
|
301 |
+
┃ ┣ 2001.midi
|
302 |
+
┃ ┣ 2002.midi
|
303 |
+
┃ ┣ 2003.midi
|
304 |
+
┃ ┣ ...
|
305 |
+
┣ segments
|
306 |
+
┃ ┣ wavs
|
307 |
+
┃ ┃ ┣ 2001000001.wav
|
308 |
+
┃ ┃ ┣ 2001000002.wav
|
309 |
+
┃ ┃ ┣ 2001000003.wav
|
310 |
+
┃ ┃ ┣ ...
|
311 |
+
┃ ┣ test.txt
|
312 |
+
┃ ┣ train.txt
|
313 |
+
┃ ┗ transcriptions.txt
|
314 |
+
┣ textgrids
|
315 |
+
┃ ┣ 2001.TextGrid
|
316 |
+
┃ ┣ 2002.TextGrid
|
317 |
+
┃ ┣ 2003.TextGrid
|
318 |
+
┃ ┣ ...
|
319 |
+
┣ wavs
|
320 |
+
┃ ┣ 2001.wav
|
321 |
+
┃ ┣ 2002.wav
|
322 |
+
┃ ┣ 2003.wav
|
323 |
+
┃ ┣ ...
|
324 |
+
┣ TERMS_OF_ACCESS
|
325 |
+
┗ readme.md
|
326 |
+
```
|
327 |
+
|
328 |
+
## OpenSinger
|
329 |
+
|
330 |
+
Download the official OpenSinger dataset [here](https://drive.google.com/file/d/1EofoZxvalgMjZqzUEuEdleHIZ6SHtNuK/view). The file structure looks like below:
|
331 |
+
|
332 |
+
```plaintext
|
333 |
+
[OpenSinger dataset path]
|
334 |
+
┣ ManRaw
|
335 |
+
┃ ┣ {Singer_1}_{Song_1}
|
336 |
+
┃ ┃ ┣ {Singer_1}_{Song_1}_0.lab
|
337 |
+
┃ ┃ ┣ {Singer_1}_{Song_1}_0.txt
|
338 |
+
┃ ┃ ┣ {Singer_1}_{Song_1}_0.wav
|
339 |
+
┃ ┃ ┣ ...
|
340 |
+
┃ ┣ {Singer_1}_{Song_2}
|
341 |
+
┃ ┣ ...
|
342 |
+
┣ WomanRaw
|
343 |
+
┣ LICENSE
|
344 |
+
┗ README.md
|
345 |
+
```
|
346 |
+
|
347 |
+
## Opera
|
348 |
+
|
349 |
+
Download the official Opera dataset [here](http://isophonics.net/SingingVoiceDataset). The file structure looks like below:
|
350 |
+
|
351 |
+
```plaintext
|
352 |
+
[Opera dataset path]
|
353 |
+
┣ monophonic
|
354 |
+
┃ ┣ chinese
|
355 |
+
┃ ┃ ┣ {Gender}_{SingerID}
|
356 |
+
┃ ┃ ┃ ┣ {Emotion}_{SongID}.wav
|
357 |
+
┃ ┃ ┃ ┣ ...
|
358 |
+
┃ ┃ ┣ ...
|
359 |
+
┃ ┣ western
|
360 |
+
┣ polyphonic
|
361 |
+
┃ ┣ chinese
|
362 |
+
┃ ┣ western
|
363 |
+
┣ CrossculturalDataSet.xlsx
|
364 |
+
```
|
365 |
+
|
366 |
+
## PopBuTFy
|
367 |
+
|
368 |
+
Download the official PopBuTFy dataset [here](https://github.com/MoonInTheRiver/NeuralSVB). The file structure looks like below:
|
369 |
+
|
370 |
+
```plaintext
|
371 |
+
[PopBuTFy dataset path]
|
372 |
+
┣ data
|
373 |
+
┃ ┣ {SingerID}#singing#{SongName}_Amateur
|
374 |
+
┃ ┃ ┣ {SingerID}#singing#{SongName}_Amateur_{UtteranceID}.mp3
|
375 |
+
┃ ┃ ┣ ...
|
376 |
+
┃ ┣ {SingerID}#singing#{SongName}_Professional
|
377 |
+
┃ ┃ ┣ {SingerID}#singing#{SongName}_Professional_{UtteranceID}.mp3
|
378 |
+
┃ ┃ ┣ ...
|
379 |
+
┣ text_labels
|
380 |
+
┗ TERMS_OF_ACCESS
|
381 |
+
```
|
382 |
+
|
383 |
+
## PopCS
|
384 |
+
|
385 |
+
Download the official PopCS dataset [here](https://github.com/MoonInTheRiver/DiffSinger/blob/master/resources/apply_form.md). The file structure looks like below:
|
386 |
+
|
387 |
+
```plaintext
|
388 |
+
[PopCS dataset path]
|
389 |
+
┣ popcs
|
390 |
+
┃ ┣ popcs-{SongName}
|
391 |
+
┃ ┃ ┣ {UtteranceID}_ph.txt
|
392 |
+
┃ ┃ ┣ {UtteranceID}_wf0.wav
|
393 |
+
┃ ┃ ┣ {UtteranceID}.TextGrid
|
394 |
+
┃ ┃ ┣ {UtteranceID}.txt
|
395 |
+
┃ ┃ ┣ ...
|
396 |
+
┃ ┣ ...
|
397 |
+
┗ TERMS_OF_ACCESS
|
398 |
+
```
|
399 |
+
|
400 |
+
## PJS
|
401 |
+
|
402 |
+
Download the official PJS dataset [here](https://sites.google.com/site/shinnosuketakamichi/research-topics/pjs_corpus). The file structure looks like below:
|
403 |
+
|
404 |
+
```plaintext
|
405 |
+
[PJS dataset path]
|
406 |
+
┣ PJS_corpus_ver1.1
|
407 |
+
┃ ┣ background_noise
|
408 |
+
┃ ┣ pjs{SongID}
|
409 |
+
┃ ┃ ┣ pjs{SongID}_song.wav
|
410 |
+
┃ ┃ ┣ pjs{SongID}_speech.wav
|
411 |
+
┃ ┃ ┣ pjs{SongID}.lab
|
412 |
+
┃ ┃ ┣ pjs{SongID}.mid
|
413 |
+
┃ ┃ ┣ pjs{SongID}.musicxml
|
414 |
+
┃ ┃ ┣ pjs{SongID}.txt
|
415 |
+
┃ ┣ ...
|
416 |
+
```
|
417 |
+
|
418 |
+
## SVCC
|
419 |
+
|
420 |
+
Download the official SVCC dataset [here](https://github.com/lesterphillip/SVCC23_FastSVC/tree/main/egs/generate_dataset). The file structure looks like below:
|
421 |
+
|
422 |
+
```plaintext
|
423 |
+
[SVCC dataset path]
|
424 |
+
┣ Data
|
425 |
+
┃ ┣ CDF1
|
426 |
+
┃ ┃ ┣ 10001.wav
|
427 |
+
┃ ┃ ┣ 10002.wav
|
428 |
+
┃ ┃ ┣ ...
|
429 |
+
┃ ┣ CDM1
|
430 |
+
┃ ┣ IDF1
|
431 |
+
┃ ┣ IDM1
|
432 |
+
┗ README.md
|
433 |
+
```
|
434 |
+
|
435 |
+
## VCTK
|
436 |
+
|
437 |
+
Download the official VCTK dataset [here](https://datashare.ed.ac.uk/handle/10283/3443). The file structure looks like below:
|
438 |
+
|
439 |
+
```plaintext
|
440 |
+
[VCTK dataset path]
|
441 |
+
┣ txt
|
442 |
+
┃ ┣ {Speaker_1}
|
443 |
+
┃ ┃ ┣ {Speaker_1}_001.txt
|
444 |
+
┃ ┃ ┣ {Speaker_1}_002.txt
|
445 |
+
┃ ┃ ┣ ...
|
446 |
+
┃ ┣ {Speaker_2}
|
447 |
+
┃ ┣ ...
|
448 |
+
┣ wav48_silence_trimmed
|
449 |
+
┃ ┣ {Speaker_1}
|
450 |
+
┃ ┃ ┣ {Speaker_1}_001_mic1.flac
|
451 |
+
┃ ┃ ┣ {Speaker_1}_001_mic2.flac
|
452 |
+
┃ ┃ ┣ {Speaker_1}_002_mic1.flac
|
453 |
+
┃ ┃ ┣ ...
|
454 |
+
┃ ┣ {Speaker_2}
|
455 |
+
┃ ┣ ...
|
456 |
+
┣ speaker-info.txt
|
457 |
+
┗ update.txt
|
458 |
+
```
|
egs/datasets/docker.md
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Mount dataset in Docker container
|
2 |
+
|
3 |
+
When using Docker to run Amphion, you first need to mount the dataset into the container. It is recommended to mount the dataset to `/mnt/<dataset_name>` in the container, where `<dataset_name>` is the name of the dataset.
|
4 |
+
|
5 |
+
When configuring the dataset in `exp_config.json`, you should use the path `/mnt/<dataset_name>` as the dataset path instead of the actual path on your host machine. Otherwise, the dataset will not be found in the container.
|
6 |
+
|
7 |
+
## Mount Example
|
8 |
+
|
9 |
+
```bash
|
10 |
+
docker run --runtime=nvidia --gpus all -it -v .:/app -v <dataset_path1>:/mnt/<dataset_name1> -v <dataset_path2>:/mnt/<dataset_name2> amphion
|
11 |
+
```
|
12 |
+
|
13 |
+
For example, if you want to use the `LJSpeech` dataset, you can mount the dataset to `/mnt/LJSpeech` in the container.
|
14 |
+
|
15 |
+
```bash
|
16 |
+
docker run --runtime=nvidia --gpus all -it -v .:/app -v /home/username/datasets/LJSpeech:/mnt/LJSpeech amphion
|
17 |
+
```
|
18 |
+
|
19 |
+
If you want to use multiple datasets, you can mount them to different directories in the container by adding more `-v` options.
|
egs/metrics/README.md
ADDED
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Amphion Evaluation Recipe
|
2 |
+
|
3 |
+
## Supported Evaluation Metrics
|
4 |
+
|
5 |
+
Currently, Amphion Evaluation supports the following objective metrics:
|
6 |
+
|
7 |
+
- **F0 Modeling**:
|
8 |
+
- F0 Pearson Coefficients (FPC)
|
9 |
+
- F0 Periodicity Root Mean Square Error (PeriodicityRMSE)
|
10 |
+
- F0 Root Mean Square Error (F0RMSE)
|
11 |
+
- Voiced/Unvoiced F1 Score (V/UV F1)
|
12 |
+
- **Energy Modeling**:
|
13 |
+
- Energy Root Mean Square Error (EnergyRMSE)
|
14 |
+
- Energy Pearson Coefficients (EnergyPC)
|
15 |
+
- **Intelligibility**:
|
16 |
+
- Character Error Rate (CER) based on [Whisper](https://github.com/openai/whisper)
|
17 |
+
- Word Error Rate (WER) based on [Whisper](https://github.com/openai/whisper)
|
18 |
+
- **Spectrogram Distortion**:
|
19 |
+
- Frechet Audio Distance (FAD)
|
20 |
+
- Mel Cepstral Distortion (MCD)
|
21 |
+
- Multi-Resolution STFT Distance (MSTFT)
|
22 |
+
- Perceptual Evaluation of Speech Quality (PESQ)
|
23 |
+
- Short Time Objective Intelligibility (STOI)
|
24 |
+
- Scale Invariant Signal to Distortion Ratio (SISDR)
|
25 |
+
- Scale Invariant Signal to Noise Ratio (SISNR)
|
26 |
+
- **Speaker Similarity**:
|
27 |
+
- Cosine similarity based on:
|
28 |
+
- [Rawnet3](https://github.com/Jungjee/RawNet)
|
29 |
+
- [Resemblyzer](https://github.com/resemble-ai/Resemblyzer)
|
30 |
+
- [WavLM](https://huggingface.co/microsoft/wavlm-base-plus-sv)
|
31 |
+
|
32 |
+
We provide a recipe to demonstrate how to objectively evaluate your generated audios. There are three steps in total:
|
33 |
+
|
34 |
+
1. Pretrained Models Preparation
|
35 |
+
2. Audio Data Preparation
|
36 |
+
3. Evaluation
|
37 |
+
|
38 |
+
## 1. Pretrained Models Preparation
|
39 |
+
|
40 |
+
If you want to calculate `RawNet3` based speaker similarity, you need to download the pretrained model first, as illustrated [here](../../pretrained/README.md).
|
41 |
+
|
42 |
+
## 2. Audio Data Preparation
|
43 |
+
|
44 |
+
Prepare reference audios and generated audios in two folders, the `ref_dir` contains the reference audio and the `gen_dir` contains the generated audio. Here is an example.
|
45 |
+
|
46 |
+
```plaintext
|
47 |
+
┣ {ref_dir}
|
48 |
+
┃ ┣ sample1.wav
|
49 |
+
┃ ┣ sample2.wav
|
50 |
+
┣ {gen_dir}
|
51 |
+
┃ ┣ sample1.wav
|
52 |
+
┃ ┣ sample2.wav
|
53 |
+
```
|
54 |
+
|
55 |
+
You have to make sure that the pairwise **reference audio and generated audio are named the same**, as illustrated above (sample1 to sample1, sample2 to sample2).
|
56 |
+
|
57 |
+
## 3. Evaluation
|
58 |
+
|
59 |
+
Run `run.sh` with the specified reference folder, generated folder, dump folder and metrics.
|
60 |
+
|
61 |
+
```bash
|
62 |
+
cd Amphion
|
63 |
+
sh egs/metrics/run.sh \
|
64 |
+
--reference_folder [Your path to the reference audios] \
|
65 |
+
--generated_folder [Your path to the generated audios] \
|
66 |
+
--dump_folder [Your path to dump the objective results] \
|
67 |
+
--metrics [The metrics you need] \
|
68 |
+
--fs [Optional. To calculate all metrics in the specified sampling rate] \
|
69 |
+
--similarity_model [Optional. To choose the model for calculating the speaker similarity. Currently "rawnet", "wavlm" and "resemblyzer" are available. Default to "wavlm"] \
|
70 |
+
--similarity_mode [Optional. To choose the mode for calculating the speaker similarity. "pairwith" for calculating a series of ground truth / prediction audio pairs to obtain the speaker similarity, and "overall" for computing the average score with all possible pairs between the reference folder and generated folder. Default to "pairwith"] \
|
71 |
+
--intelligibility_mode [Optional. To choose the mode for computing CER and WER. "gt_audio" means selecting the recognition content of the reference audio as the target, "gt_content" means using transcription as the target. Default to "gt_audio"] \
|
72 |
+
--ltr_path [Optional. Path to the transcription file] \
|
73 |
+
--language [Optional. Language for computing CER and WER. Default to "english"]
|
74 |
+
```
|
75 |
+
|
76 |
+
As for the metrics, an example is provided below:
|
77 |
+
|
78 |
+
```bash
|
79 |
+
--metrics "mcd pesq fad"
|
80 |
+
```
|
81 |
+
|
82 |
+
All currently available metrics keywords are listed below:
|
83 |
+
|
84 |
+
| Keys | Description |
|
85 |
+
| ------------------------- | ------------------------------------------ |
|
86 |
+
| `fpc` | F0 Pearson Coefficients |
|
87 |
+
| `f0_periodicity_rmse` | F0 Periodicity Root Mean Square Error |
|
88 |
+
| `f0rmse` | F0 Root Mean Square Error |
|
89 |
+
| `v_uv_f1` | Voiced/Unvoiced F1 Score |
|
90 |
+
| `energy_rmse` | Energy Root Mean Square Error |
|
91 |
+
| `energy_pc` | Energy Pearson Coefficients |
|
92 |
+
| `cer` | Character Error Rate |
|
93 |
+
| `wer` | Word Error Rate |
|
94 |
+
| `similarity` | Speaker Similarity |
|
95 |
+
| `fad` | Frechet Audio Distance |
|
96 |
+
| `mcd` | Mel Cepstral Distortion |
|
97 |
+
| `mstft` | Multi-Resolution STFT Distance |
|
98 |
+
| `pesq` | Perceptual Evaluation of Speech Quality |
|
99 |
+
| `si_sdr` | Scale Invariant Signal to Distortion Ratio |
|
100 |
+
| `si_snr` | Scale Invariant Signal to Noise Ratio |
|
101 |
+
| `stoi` | Short Time Objective Intelligibility |
|
102 |
+
|
103 |
+
For example, if you want to calculate the speaker similarity between the synthesized audio and the reference audio with the same content, run:
|
104 |
+
|
105 |
+
```bash
|
106 |
+
sh egs/metrics/run.sh \
|
107 |
+
--reference_folder [Your path to the reference audios] \
|
108 |
+
--generated_folder [Your path to the generated audios] \
|
109 |
+
--dump_folder [Your path to dump the objective results] \
|
110 |
+
--metrics "similarity" \
|
111 |
+
--similarity_model [Optional. To choose the model for calculating the speaker similarity. Currently "rawnet", "wavlm" and "resemblyzer" are available. Default to "wavlm"] \
|
112 |
+
--similarity_mode "pairwith"
|
113 |
+
```
|
114 |
+
|
115 |
+
If you don't have the reference audio with the same content, run the following to get the content-free similarity score:
|
116 |
+
|
117 |
+
```bash
|
118 |
+
sh egs/metrics/run.sh \
|
119 |
+
--reference_folder [Your path to the reference audios] \
|
120 |
+
--generated_folder [Your path to the generated audios] \
|
121 |
+
--dump_folder [Your path to dump the objective results] \
|
122 |
+
--metrics "similarity" \
|
123 |
+
--similarity_model [Optional. To choose the model for calculating the speaker similarity. Currently "rawnet", "wavlm" and "resemblyzer" are available. Default to "wavlm"] \
|
124 |
+
--similarity_mode "overall"
|
125 |
+
```
|
126 |
+
|
127 |
+
## Troubleshooting
|
128 |
+
### FAD (Using Offline Models)
|
129 |
+
If your system is unable to access huggingface.co from the terminal, you might run into an error like "OSError: Can't load tokenizer for ...". To work around this, follow these steps to use local models:
|
130 |
+
|
131 |
+
1. Download the [bert-base-uncased](https://huggingface.co/bert-base-uncased), [roberta-base](https://huggingface.co/roberta-base), and [facebook/bart-base](https://huggingface.co/facebook/bart-base) models from `huggingface.co`. Ensure that the models are complete and uncorrupted. Place these directories within `Amphion/pretrained`. For a detailed file structure reference, see [This README](../../pretrained/README.md#optional-model-dependencies-for-evaluation) under `Amphion/pretrained`.
|
132 |
+
2. Inside the `Amphion/pretrained` directory, create a bash script with the content outlined below. This script will automatically update the tokenizer paths used by your system:
|
133 |
+
```bash
|
134 |
+
#!/bin/bash
|
135 |
+
|
136 |
+
BERT_DIR="bert-base-uncased"
|
137 |
+
ROBERTA_DIR="roberta-base"
|
138 |
+
BART_DIR="facebook/bart-base"
|
139 |
+
PYTHON_SCRIPT="[YOUR ENV PATH]/lib/python3.9/site-packages/laion_clap/training/data.py"
|
140 |
+
|
141 |
+
update_tokenizer_path() {
|
142 |
+
local dir_name=$1
|
143 |
+
local tokenizer_variable=$2
|
144 |
+
local full_path
|
145 |
+
|
146 |
+
if [ -d "$dir_name" ]; then
|
147 |
+
full_path=$(realpath "$dir_name")
|
148 |
+
if [ -f "$PYTHON_SCRIPT" ]; then
|
149 |
+
sed -i "s|${tokenizer_variable}.from_pretrained(\".*\")|${tokenizer_variable}.from_pretrained(\"$full_path\")|" "$PYTHON_SCRIPT"
|
150 |
+
echo "Updated ${tokenizer_variable} path to $full_path."
|
151 |
+
else
|
152 |
+
echo "Error: The specified Python script does not exist."
|
153 |
+
exit 1
|
154 |
+
fi
|
155 |
+
else
|
156 |
+
echo "Error: The directory $dir_name does not exist in the current directory."
|
157 |
+
exit 1
|
158 |
+
fi
|
159 |
+
}
|
160 |
+
|
161 |
+
update_tokenizer_path "$BERT_DIR" "BertTokenizer"
|
162 |
+
update_tokenizer_path "$ROBERTA_DIR" "RobertaTokenizer"
|
163 |
+
update_tokenizer_path "$BART_DIR" "BartTokenizer"
|
164 |
+
|
165 |
+
echo "BERT, BART and RoBERTa Python script paths have been updated."
|
166 |
+
|
167 |
+
```
|
168 |
+
|
169 |
+
3. The script provided is intended to adjust the tokenizer paths in the `data.py` file, found under `/lib/python3.9/site-packages/laion_clap/training/`, within your specific environment. For those utilizing conda, you can determine your environment path by running `conda info --envs`. Then, substitute `[YOUR ENV PATH]` in the script with this path. If your environment is configured differently, you'll need to update the `PYTHON_SCRIPT` variable to correctly point to the `data.py` file.
|
170 |
+
4. Run the script. If it executes successfully, the tokenizer paths will be updated, allowing them to be loaded locally.
|
171 |
+
|
172 |
+
### WavLM-based Speaker Similarity (Using Offline Models)
|
173 |
+
|
174 |
+
If your system is unable to access huggingface.co from the terminal and you want to calculate `WavLM` based speaker similarity, you need to download the pretrained model first, as illustrated [here](../../pretrained/README.md).
|
egs/metrics/run.sh
ADDED
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
######## Build Experiment Environment ###########
|
7 |
+
exp_dir=$(cd `dirname $0`; pwd)
|
8 |
+
work_dir=$(dirname $(dirname $exp_dir))
|
9 |
+
|
10 |
+
export WORK_DIR=$work_dir
|
11 |
+
export PYTHONPATH=$work_dir
|
12 |
+
export PYTHONIOENCODING=UTF-8
|
13 |
+
|
14 |
+
######## Parse the Given Parameters from the Command ###########
|
15 |
+
options=$(getopt -o c:n:s --long gpu:,reference_folder:,generated_folder:,dump_folder:,metrics:,fs:,align_method:,energy_db_scale:,f0_subtract_mean:,similarity_model:,similarity_mode:,ltr_path:,intelligibility_mode:,language: -- "$@")
|
16 |
+
eval set -- "$options"
|
17 |
+
|
18 |
+
while true; do
|
19 |
+
case $1 in
|
20 |
+
# Visible GPU machines. The default value is "0".
|
21 |
+
--gpu) shift; gpu=$1 ; shift ;;
|
22 |
+
# Reference Audio Folder
|
23 |
+
--reference_folder) shift; ref_dir=$1 ; shift ;;
|
24 |
+
# Generated Audio Folder
|
25 |
+
--generated_folder) shift; deg_dir=$1 ; shift ;;
|
26 |
+
# Result Dumping Folder
|
27 |
+
--dump_folder) shift; dump_dir=$1 ; shift ;;
|
28 |
+
# Metrics to Compute
|
29 |
+
--metrics) shift; metrics=$1 ; shift ;;
|
30 |
+
# Sampling Rate
|
31 |
+
--fs) shift; fs=$1 ; shift ;;
|
32 |
+
|
33 |
+
# Method for aligning F0. The default value is "dtw"
|
34 |
+
--align_method) shift; align_method=$1 ; shift ;;
|
35 |
+
# Method for normalizing F0. The default value is "True"
|
36 |
+
--f0_subtract_mean) shift; f0_subtract_mean=$1 ; shift ;;
|
37 |
+
# Method for normalizing Energy. The default value is "True"
|
38 |
+
--energy_db_scale) shift; energy_db_scale=$1 ; shift ;;
|
39 |
+
|
40 |
+
# Model for computing speaker similarity. The default value is "wavlm"
|
41 |
+
--similarity_model) shift; similarity_model=$1 ; shift ;;
|
42 |
+
# Mode for computing speaker similarity. The default value is "pairwith"
|
43 |
+
--similarity_mode) shift; similarity_mode=$1 ; shift ;;
|
44 |
+
|
45 |
+
# Path for the transcript.
|
46 |
+
--ltr_path) shift; ltr_path=$1 ; shift ;;
|
47 |
+
# Mode for computing CER and WER. The default value is "gt_audio"
|
48 |
+
--intelligibility_mode) shift; intelligibility_mode=$1 ; shift ;;
|
49 |
+
# Language for computing CER and WER. The default value is "english"
|
50 |
+
--language) shift; language=$1 ; shift ;;
|
51 |
+
|
52 |
+
--) shift ; break ;;
|
53 |
+
# Unrecognized option: report it and abort. `exit 1` must be its own command;
# when it was part of the echo arguments the script never exited and, because
# nothing shifted "$1", the surrounding `while true` loop spun forever.
*) echo "Invalid option: $1"; exit 1 ;;
|
54 |
+
esac
|
55 |
+
done
|
56 |
+
|
57 |
+
### Value check ###
|
58 |
+
if [ -z "$ref_dir" ]; then
|
59 |
+
echo "[Error] Please specify the reference_folder"
|
60 |
+
exit 1
|
61 |
+
fi
|
62 |
+
|
63 |
+
if [ -z "$deg_dir" ]; then
|
64 |
+
echo "[Error] Please specify the generated_folder"
|
65 |
+
exit 1
|
66 |
+
fi
|
67 |
+
|
68 |
+
if [ -z "$dump_dir" ]; then
|
69 |
+
echo "[Error] Please specify the dump_folder"
|
70 |
+
exit 1
|
71 |
+
fi
|
72 |
+
|
73 |
+
if [ -z "$metrics" ]; then
|
74 |
+
echo "[Error] Please specify the metrics"
|
75 |
+
exit 1
|
76 |
+
fi
|
77 |
+
|
78 |
+
if [ -z "$gpu" ]; then
|
79 |
+
gpu="0"
|
80 |
+
fi
|
81 |
+
|
82 |
+
if [ -z "$fs" ]; then
|
83 |
+
fs="None"
|
84 |
+
fi
|
85 |
+
|
86 |
+
if [ -z "$align_method" ]; then
|
87 |
+
align_method="dtw"
|
88 |
+
fi
|
89 |
+
|
90 |
+
if [ -z "$energy_db_scale" ]; then
|
91 |
+
energy_db_scale="True"
|
92 |
+
fi
|
93 |
+
|
94 |
+
if [ -z "$f0_subtract_mean" ]; then
|
95 |
+
f0_subtract_mean="True"
|
96 |
+
fi
|
97 |
+
|
98 |
+
if [ -z "$similarity_model" ]; then
|
99 |
+
similarity_model="wavlm"
|
100 |
+
fi
|
101 |
+
|
102 |
+
if [ -z "$similarity_mode" ]; then
|
103 |
+
similarity_mode="pairwith"
|
104 |
+
fi
|
105 |
+
|
106 |
+
if [ -z "$ltr_path" ]; then
|
107 |
+
ltr_path="None"
|
108 |
+
fi
|
109 |
+
|
110 |
+
if [ -z "$intelligibility_mode" ]; then
|
111 |
+
intelligibility_mode="gt_audio"
|
112 |
+
fi
|
113 |
+
|
114 |
+
if [ -z "$language" ]; then
|
115 |
+
language="english"
|
116 |
+
fi
|
117 |
+
|
118 |
+
######## Calculate Objective Metrics ###########
|
119 |
+
CUDA_VISIBLE_DEVICES=$gpu python "$work_dir"/bins/calc_metrics.py \
|
120 |
+
--ref_dir $ref_dir \
|
121 |
+
--deg_dir $deg_dir \
|
122 |
+
--dump_dir $dump_dir \
|
123 |
+
--metrics $metrics \
|
124 |
+
--fs $fs \
|
125 |
+
--align_method $align_method \
|
126 |
+
--db_scale $energy_db_scale \
|
127 |
+
--f0_subtract_mean $f0_subtract_mean \
|
128 |
+
--similarity_model $similarity_model \
|
129 |
+
--similarity_mode $similarity_mode \
|
130 |
+
--ltr_path $ltr_path \
|
131 |
+
--intelligibility_mode $intelligibility_mode \
|
132 |
+
--language $language
|
egs/svc/DiffComoSVC/README.md
ADDED
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Accelerating Diffusion-based Singing Voice Conversion through Consistency Distillation
|
2 |
+
<br>
|
3 |
+
<div align="center">
|
4 |
+
<img src="../../../imgs/svc/DiffComoSVC.png" width="90%">
|
5 |
+
</div>
|
6 |
+
<br>
|
7 |
+
|
8 |
+
This is an implementation of [Consistency Models](https://arxiv.org/abs/2303.01469) for accelerating diffusion-based singing voice conversion. The overall architecture follows "[Leveraging Diverse Semantic-based Audio Pretrained Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (2024 IEEE Spoken Language Technology Workshop), with only a slight modification applied to the acoustic model. Specifically,
|
9 |
+
|
10 |
+
* The acoustic model consists of a conformer, which generates a coarse spectrogram, and a diffusion decoder based on Bidirectional Non-Causal Dilated CNN, which refines that coarse spectrogram. This is similar to [CoMoSpeech: One-Step Speech and Singing Voice Synthesis via Consistency Model](https://comospeech.github.io/)
|
11 |
+
* To accelerate diffusion model, we apply consistency distillation from [Consistency Models](https://arxiv.org/abs/2303.01469). For teacher model, the diffusion schedule of the diffusion decoder follows [karras diffusion](https://arxiv.org/abs/2206.00364). For distilling teacher model, the condition encoder and the conformer part of acoustic model are frozen while the diffusion decoder model is updated via exponential moving average. See Figure above for details.
|
12 |
+
|
13 |
+
There are five stages in total:
|
14 |
+
|
15 |
+
1. Data preparation
|
16 |
+
2. Features extraction
|
17 |
+
3. Teacher Model Training
|
18 |
+
4. Consistency Distillation
|
19 |
+
5. Inference/conversion
|
20 |
+
|
21 |
+
## 1. Data Preparation
|
22 |
+
|
23 |
+
### Dataset Download
|
24 |
+
|
25 |
+
By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).
|
26 |
+
|
27 |
+
### Configuration
|
28 |
+
|
29 |
+
Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
|
30 |
+
|
31 |
+
```json
|
32 |
+
"dataset": [
|
33 |
+
"m4singer",
|
34 |
+
"opencpop",
|
35 |
+
"opensinger",
|
36 |
+
"svcc",
|
37 |
+
"vctk"
|
38 |
+
],
|
39 |
+
"dataset_path": {
|
40 |
+
// TODO: Fill in your dataset path
|
41 |
+
"m4singer": "[M4Singer dataset path]",
|
42 |
+
"opencpop": "[Opencpop dataset path]",
|
43 |
+
"opensinger": "[OpenSinger dataset path]",
|
44 |
+
"svcc": "[SVCC dataset path]",
|
45 |
+
"vctk": "[VCTK dataset path]"
|
46 |
+
},
|
47 |
+
```
|
48 |
+
|
49 |
+
## 2. Features Extraction
|
50 |
+
|
51 |
+
### Content-based Pretrained Models Download
|
52 |
+
|
53 |
+
By default, we utilize the Whisper and ContentVec to extract content features. How to download them is detailed [here](../../../pretrained/README.md).
|
54 |
+
|
55 |
+
### Configuration
|
56 |
+
|
57 |
+
Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:
|
58 |
+
|
59 |
+
```json
|
60 |
+
// TODO: Fill in the output log path
|
61 |
+
"log_dir": "[Your path to save logs and checkpoints]",
|
62 |
+
"preprocess": {
|
63 |
+
// TODO: Fill in the output data path
|
64 |
+
"processed_dir": "[Your path to save processed data]",
|
65 |
+
...
|
66 |
+
},
|
67 |
+
```
|
68 |
+
|
69 |
+
### Run
|
70 |
+
|
71 |
+
Run the `run.sh` as the preprocess stage (set `--stage 1`).
|
72 |
+
|
73 |
+
```bash
|
74 |
+
cd Amphion
|
75 |
+
sh egs/svc/DiffComoSVC/run.sh --stage 1
|
76 |
+
```
|
77 |
+
|
78 |
+
Note: The `CUDA_VISIBLE_DEVICES` is set as `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "1"`.
|
79 |
+
|
80 |
+
## 3. Teacher Model Training
|
81 |
+
|
82 |
+
### Configuration
|
83 |
+
|
84 |
+
Set the `distill` in `config/comosvc.json` to `false` for teacher model training. You can also specify the detailed configuration for the conformer encoder and diffusion process here:
|
85 |
+
|
86 |
+
```JSON
|
87 |
+
"comosvc":{
|
88 |
+
"distill": false,
|
89 |
+
// conformer encoder
|
90 |
+
"input_dim": 384,
|
91 |
+
"output_dim": 100,
|
92 |
+
"n_heads": 2,
|
93 |
+
"n_layers": 6,
|
94 |
+
"filter_channels":512,
|
95 |
+
// karras diffusion
|
96 |
+
"P_mean": -1.2,
|
97 |
+
"P_std": 1.2,
|
98 |
+
"sigma_data": 0.5,
|
99 |
+
"sigma_min": 0.002,
|
100 |
+
"sigma_max": 80,
|
101 |
+
"rho": 7,
|
102 |
+
"n_timesteps": 40,
|
103 |
+
},
|
104 |
+
```
|
105 |
+
|
106 |
+
We provide the default hyperparameters in the `exp_config.json`. They can work on a single 24 GB NVIDIA GPU. You can adjust them based on your GPU machines.
|
107 |
+
|
108 |
+
```json
|
109 |
+
"train": {
|
110 |
+
"batch_size": 32,
|
111 |
+
...
|
112 |
+
"adamw": {
|
113 |
+
"lr": 2.0e-4
|
114 |
+
},
|
115 |
+
...
|
116 |
+
}
|
117 |
+
```
|
118 |
+
|
119 |
+
### Run
|
120 |
+
|
121 |
+
Run the `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The tensorboard logs and checkpoints will be saved in `[Your path to save logs and checkpoints]/[YourExptName]`.
|
122 |
+
|
123 |
+
```bash
|
124 |
+
cd Amphion
|
125 |
+
sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName]
|
126 |
+
```
|
127 |
+
|
128 |
+
Note: The `CUDA_VISIBLE_DEVICES` is set as `"0"` by default. You can specify it when running `run.sh` such as:
|
129 |
+
|
130 |
+
```bash
|
131 |
+
cd Amphion
|
132 |
+
sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName] --gpu "0,1,2,3"
|
133 |
+
```
|
134 |
+
|
135 |
+
## 4. Consistency Distillation
|
136 |
+
|
137 |
+
### Configuration
|
138 |
+
|
139 |
+
Set the `distill` in `config/comosvc.json` to `true` for consistency distillation, and set `teacher_model_path` to the checkpoint of the trained teacher model. You can also specify the detailed configuration for the conformer encoder and diffusion process here:
|
140 |
+
|
141 |
+
```JSON
|
142 |
+
"model": {
|
143 |
+
"teacher_model_path":"[Your_teacher_model_checkpoint].bin",
|
144 |
+
...
|
145 |
+
"comosvc":{
|
146 |
+
"distill": true,
|
147 |
+
// conformer encoder
|
148 |
+
"input_dim": 384,
|
149 |
+
"output_dim": 100,
|
150 |
+
"n_heads": 2,
|
151 |
+
"n_layers": 6,
|
152 |
+
"filter_channels":512,
|
153 |
+
// karras diffusion
|
154 |
+
"P_mean": -1.2,
|
155 |
+
"P_std": 1.2,
|
156 |
+
"sigma_data": 0.5,
|
157 |
+
"sigma_min": 0.002,
|
158 |
+
"sigma_max": 80,
|
159 |
+
"rho": 7,
|
160 |
+
"n_timesteps": 40,
|
161 |
+
},
|
162 |
+
```
|
163 |
+
|
164 |
+
We provide the default hyperparameters in the `exp_config.json`. They can work on a single 24 GB NVIDIA GPU. You can adjust them based on your GPU machines.
|
165 |
+
|
166 |
+
```json
|
167 |
+
"train": {
|
168 |
+
"batch_size": 32,
|
169 |
+
...
|
170 |
+
"adamw": {
|
171 |
+
"lr": 2.0e-4
|
172 |
+
},
|
173 |
+
...
|
174 |
+
}
|
175 |
+
```
|
176 |
+
|
177 |
+
### Run
|
178 |
+
|
179 |
+
Run the `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The tensorboard logs and checkpoints will be saved in `[Your path to save logs and checkpoints]/[YourExptName]`.
|
180 |
+
|
181 |
+
```bash
|
182 |
+
cd Amphion
|
183 |
+
sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName]
|
184 |
+
```
|
185 |
+
|
186 |
+
Note: The `CUDA_VISIBLE_DEVICES` is set as `"0"` by default. You can specify it when running `run.sh` such as:
|
187 |
+
|
188 |
+
```bash
|
189 |
+
cd Amphion
|
190 |
+
sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName] --gpu "0,1,2,3"
|
191 |
+
```
|
192 |
+
|
193 |
+
## 5. Inference/Conversion
|
194 |
+
|
195 |
+
### Pretrained Vocoder Download
|
196 |
+
|
197 |
+
We fine-tune the official BigVGAN pretrained model with over 120 hours of singing voice data. The benefits of fine-tuning have been investigated in our paper (see this [demo page](https://www.zhangxueyao.com/data/MultipleContentsSVC/vocoder.html)). The final pretrained singing voice vocoder is released [here](../../../pretrained/README.md#amphion-singing-bigvgan) (called `Amphion Singing BigVGAN`).
|
198 |
+
|
199 |
+
### Run
|
200 |
+
|
201 |
+
For inference/conversion, you need to specify the following configurations when running `run.sh`:
|
202 |
+
|
203 |
+
| Parameters | Description | Example |
|
204 |
+
| --------------------------------------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ |
|
205 |
+
| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `[Your path to save logs and checkpoints]/[YourExptName]` |
|
206 |
+
| `--infer_output_dir` | The output directory to save inferred audios. | `[Your path to save logs and checkpoints]/[YourExptName]/result` |
|
207 |
+
| `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `[Your path to save processed data]/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
|
208 |
+
| `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `[Your path to save logs and checkpoints]/[YourExptName]/singers.json` to choose a trained speaker. | For opencpop dataset, the speaker name would be `opencpop_female1`. |
|
209 |
+
| `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |
|
210 |
+
|
211 |
+
For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run:
|
212 |
+
|
213 |
+
```bash
|
214 |
+
cd Amphion
|
215 |
+
sh egs/svc/DiffComoSVC/run.sh --stage 3 --gpu "0" \
|
216 |
+
--infer_expt_dir [Your path to save logs and checkpoints]/[YourExptName] \
|
217 |
+
--infer_output_dir [Your path to save logs and checkpoints]/[YourExptName]/result \
|
218 |
+
--infer_source_audio_dir [Your Audios Folder] \
|
219 |
+
--infer_target_speaker "opencpop_female1" \
|
220 |
+
--infer_key_shift "autoshift"
|
221 |
+
```
|
222 |
+
Specifically, you can configure the inference steps for the teacher model by setting `inference` in `exp_config.json` (the student model always uses one-step sampling):
|
223 |
+
```json
|
224 |
+
"inference": {
|
225 |
+
"comosvc": {
|
226 |
+
"inference_steps": 40
|
227 |
+
}
|
228 |
+
}
|
229 |
+
```
|
230 |
+
|
231 |
+
# Reference
|
232 |
+
https://github.com/zhenye234/CoMoSpeech
|
233 |
+
|
234 |
+
https://github.com/openai/consistency_models
|
egs/svc/DiffComoSVC/exp_config.json
ADDED
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/comosvc.json",
|
3 |
+
"model_type": "DiffComoSVC",
|
4 |
+
"dataset": [
|
5 |
+
"m4singer",
|
6 |
+
"opencpop",
|
7 |
+
"opensinger",
|
8 |
+
"svcc",
|
9 |
+
"vctk"
|
10 |
+
],
|
11 |
+
"dataset_path": {
|
12 |
+
// TODO: Fill in your dataset path
|
13 |
+
"m4singer": "[M4Singer dataset path]",
|
14 |
+
"opencpop": "[Opencpop dataset path]",
|
15 |
+
"opensinger": "[OpenSinger dataset path]",
|
16 |
+
"svcc": "[SVCC dataset path]",
|
17 |
+
"vctk": "[VCTK dataset path]"
|
18 |
+
},
|
19 |
+
// TODO: Fill in the output log path
|
20 |
+
"log_dir": "[Your path to save logs and checkpoints]",
|
21 |
+
"preprocess": {
|
22 |
+
// TODO: Fill in the output data path
|
23 |
+
"processed_dir": "[Your path to save processed data]",
|
24 |
+
// Config for features extraction
|
25 |
+
"extract_mel": true,
|
26 |
+
"extract_pitch": true,
|
27 |
+
"extract_energy": true,
|
28 |
+
"extract_whisper_feature": true,
|
29 |
+
"extract_contentvec_feature": true,
|
30 |
+
"extract_wenet_feature": false,
|
31 |
+
"whisper_batch_size": 30, // decrease it if your GPU is out of memory
|
32 |
+
"contentvec_batch_size": 1,
|
33 |
+
// Fill in the content-based pretrained model's path
|
34 |
+
"contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
|
35 |
+
"wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
|
36 |
+
"wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
|
37 |
+
"whisper_model": "medium",
|
38 |
+
"whisper_model_path": "pretrained/whisper/medium.pt",
|
39 |
+
// Config for features usage
|
40 |
+
"use_mel": true,
|
41 |
+
"use_min_max_norm_mel": true,
|
42 |
+
"use_frame_pitch": true,
|
43 |
+
"use_frame_energy": true,
|
44 |
+
"use_spkid": true,
|
45 |
+
"use_whisper": true,
|
46 |
+
"use_contentvec": true,
|
47 |
+
"use_wenet": false,
|
48 |
+
"n_mel": 100,
|
49 |
+
"sample_rate": 24000
|
50 |
+
},
|
51 |
+
"model": {
|
52 |
+
"teacher_model_path":"[Your_teacher_model_checkpoint].bin",
|
53 |
+
"condition_encoder": {
|
54 |
+
// Config for features usage
|
55 |
+
"use_whisper": true,
|
56 |
+
"use_contentvec": true,
|
57 |
+
"use_wenet": false,
|
58 |
+
"whisper_dim": 1024,
|
59 |
+
"contentvec_dim": 256,
|
60 |
+
"wenet_dim": 512,
|
61 |
+
"use_singer_encoder": false,
|
62 |
+
"pitch_min": 50,
|
63 |
+
"pitch_max": 1100
|
64 |
+
},
|
65 |
+
"comosvc":{
|
66 |
+
"distill": false,
|
67 |
+
// conformer encoder
|
68 |
+
"input_dim": 384,
|
69 |
+
"output_dim": 100,
|
70 |
+
"n_heads": 2,
|
71 |
+
"n_layers": 6,
|
72 |
+
"filter_channels":512,
|
73 |
+
"dropout":0.1,
|
74 |
+
// karras diffusion
|
75 |
+
"P_mean": -1.2,
|
76 |
+
"P_std": 1.2,
|
77 |
+
"sigma_data": 0.5,
|
78 |
+
"sigma_min": 0.002,
|
79 |
+
"sigma_max": 80,
|
80 |
+
"rho": 7,
|
81 |
+
"n_timesteps": 40,
|
82 |
+
},
|
83 |
+
"diffusion": {
|
84 |
+
// Diffusion steps encoder
|
85 |
+
"step_encoder": {
|
86 |
+
"dim_raw_embedding": 128,
|
87 |
+
"dim_hidden_layer": 512,
|
88 |
+
"activation": "SiLU",
|
89 |
+
"num_layer": 2,
|
90 |
+
"max_period": 10000
|
91 |
+
},
|
92 |
+
// Diffusion decoder
|
93 |
+
"model_type": "bidilconv",
|
94 |
+
// bidilconv, unet2d, TODO: unet1d
|
95 |
+
"bidilconv": {
|
96 |
+
"base_channel": 384,
|
97 |
+
"n_res_block": 20,
|
98 |
+
"conv_kernel_size": 3,
|
99 |
+
"dilation_cycle_length": 4,
|
100 |
+
// specially, 1 means no dilation
|
101 |
+
"conditioner_size": 100
|
102 |
+
}
|
103 |
+
}
|
104 |
+
},
|
105 |
+
"train": {
|
106 |
+
"batch_size": 64,
|
107 |
+
"gradient_accumulation_step": 1,
|
108 |
+
"max_epoch": -1, // -1 means no limit
|
109 |
+
"save_checkpoint_stride": [
|
110 |
+
50,
|
111 |
+
50
|
112 |
+
],
|
113 |
+
"keep_last": [
|
114 |
+
5,
|
115 |
+
-1
|
116 |
+
],
|
117 |
+
"run_eval": [
|
118 |
+
false,
|
119 |
+
true
|
120 |
+
],
|
121 |
+
"adamw": {
|
122 |
+
"lr": 4.0e-4
|
123 |
+
},
|
124 |
+
"reducelronplateau": {
|
125 |
+
"factor": 0.8,
|
126 |
+
"patience": 10,
|
127 |
+
"min_lr": 1.0e-4
|
128 |
+
},
|
129 |
+
"dataloader": {
|
130 |
+
"num_worker": 8,
|
131 |
+
"pin_memory": true
|
132 |
+
},
|
133 |
+
"sampler": {
|
134 |
+
"holistic_shuffle": false,
|
135 |
+
"drop_last": true
|
136 |
+
}
|
137 |
+
},
|
138 |
+
"inference": {
|
139 |
+
"comosvc": {
|
140 |
+
"inference_steps": 40
|
141 |
+
}
|
142 |
+
}
|
143 |
+
}
|
egs/svc/MultipleContentsSVC/README.md
ADDED
@@ -0,0 +1,248 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion
|
2 |
+
|
3 |
+
[![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2310.11160)
|
4 |
+
[![demo](https://img.shields.io/badge/SVC-Demo-red)](https://www.zhangxueyao.com/data/MultipleContentsSVC/index.html)
|
5 |
+
[![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Models-pink)](https://huggingface.co/amphion/singing_voice_conversion)
|
6 |
+
[![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Spaces-yellow)](https://huggingface.co/spaces/amphion/singing_voice_conversion)
|
7 |
+
[![openxlab](https://cdn-static.openxlab.org.cn/app-center/openxlab_app.svg)](https://openxlab.org.cn/apps/detail/Amphion/singing_voice_conversion)
|
8 |
+
|
9 |
+
<br>
|
10 |
+
<div align="center">
|
11 |
+
<img src="../../../imgs/svc/MultipleContentsSVC.png" width="85%">
|
12 |
+
</div>
|
13 |
+
<br>
|
14 |
+
|
15 |
+
This is the official implementation of the paper "[Leveraging Diverse Semantic-based Audio Pretrained Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (2024 IEEE Spoken Language Technology Workshop). Specifically,
|
16 |
+
|
17 |
+
- The multiple content features are from [Whisper](https://github.com/openai/whisper) and [ContentVec](https://github.com/auspicious3000/contentvec).
|
18 |
+
- The acoustic model is based on Bidirectional Non-Causal Dilated CNN (called `DiffWaveNetSVC` in Amphion), which is similar to [WaveNet](https://arxiv.org/pdf/1609.03499.pdf), [DiffWave](https://openreview.net/forum?id=a-xFK8Ymz5J), and [DiffSVC](https://ieeexplore.ieee.org/document/9688219).
|
19 |
+
- The vocoder uses the [BigVGAN](https://github.com/NVIDIA/BigVGAN) architecture, and we fine-tuned it on over 120 hours of singing voice data.
|
20 |
+
|
21 |
+
## A Little Taste Before Getting Started
|
22 |
+
|
23 |
+
Before you delve into the code, we suggest exploring the interactive DEMO we've provided for a comprehensive overview. There are several ways you can engage with it:
|
24 |
+
|
25 |
+
1. **Online DEMO**
|
26 |
+
|
27 |
+
| HuggingFace | OpenXLab |
|
28 |
+
| :----------------------------------------------------------: | :----------------------------------------------------------: |
|
29 |
+
| [![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Spaces-yellow)](https://huggingface.co/spaces/amphion/singing_voice_conversion)<br />(Worldwide) | [![openxlab](https://cdn-static.openxlab.org.cn/app-center/openxlab_app.svg)](https://openxlab.org.cn/apps/detail/Amphion/singing_voice_conversion)<br />(Suitable for Mainland China Users) |
|
30 |
+
|
31 |
+
2. **Run Local Gradio DEMO**
|
32 |
+
|
33 |
+
| Run with Docker | Duplicate Space with Private GPU |
|
34 |
+
| :----------------------------------------------------------: | :----------------------------------------------------------: |
|
35 |
+
| [![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Spaces-yellow)](https://huggingface.co/spaces/amphion/singing_voice_conversion?docker=true) | [![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Spaces-yellow)](https://huggingface.co/spaces/amphion/singing_voice_conversion?duplicate=true) |
|
36 |
+
|
37 |
+
3. **Run with the Extended Colab**
|
38 |
+
|
39 |
+
You can check out [this repo](https://github.com/camenduru/singing-voice-conversion-colab) to run it with Colab. Thanks to [@camenduru](https://x.com/camenduru?s=20) and the community for their support!
|
40 |
+
|
41 |
+
## Usage Overview
|
42 |
+
|
43 |
+
To train a `DiffWaveNetSVC` model, there are four stages in total:
|
44 |
+
|
45 |
+
1. Data preparation
|
46 |
+
2. Features extraction
|
47 |
+
3. Training
|
48 |
+
4. Inference/conversion
|
49 |
+
|
50 |
+
> **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
|
51 |
+
> ```bash
|
52 |
+
> cd Amphion
|
53 |
+
> ```
|
54 |
+
|
55 |
+
## 1. Data Preparation
|
56 |
+
|
57 |
+
### Dataset Download
|
58 |
+
|
59 |
+
By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).
|
60 |
+
|
61 |
+
### Configuration
|
62 |
+
|
63 |
+
Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
|
64 |
+
|
65 |
+
```json
|
66 |
+
"dataset": [
|
67 |
+
"m4singer",
|
68 |
+
"opencpop",
|
69 |
+
"opensinger",
|
70 |
+
"svcc",
|
71 |
+
"vctk"
|
72 |
+
],
|
73 |
+
"dataset_path": {
|
74 |
+
// TODO: Fill in your dataset path
|
75 |
+
"m4singer": "[M4Singer dataset path]",
|
76 |
+
"opencpop": "[Opencpop dataset path]",
|
77 |
+
"opensinger": "[OpenSinger dataset path]",
|
78 |
+
"svcc": "[SVCC dataset path]",
|
79 |
+
"vctk": "[VCTK dataset path]"
|
80 |
+
},
|
81 |
+
```
|
82 |
+
|
83 |
+
### Custom Dataset
|
84 |
+
|
85 |
+
We support custom datasets; see [here](../../datasets/README.md#customsvcdataset) for the file structure to follow.
|
86 |
+
|
87 |
+
After constructing the proper file structure, specify your dataset name in `dataset` and its path in `dataset_path`, and also add its name to `use_custom_dataset`:
|
88 |
+
|
89 |
+
```json
|
90 |
+
"dataset": [
|
91 |
+
"[Exisiting Dataset Name]",
|
92 |
+
//...
|
93 |
+
"[Your Custom Dataset Name]"
|
94 |
+
],
|
95 |
+
"dataset_path": {
|
96 |
+
"[Exisiting Dataset Name]": "[Exisiting Dataset Path]",
|
97 |
+
//...
|
98 |
+
"[Your Custom Dataset Name]": "[Your Custom Dataset Path]"
|
99 |
+
},
|
100 |
+
"use_custom_dataset": [
|
101 |
+
"[Your Custom Dataset Name]"
|
102 |
+
],
|
103 |
+
```
|
104 |
+
|
105 |
+
> **NOTE:** Custom dataset name does not have to be the same as the folder name. But it needs to satisfy these rules:
|
106 |
+
> 1. It cannot be the same as an existing dataset name.
|
107 |
+
> 2. It cannot contain any spaces or underscores (`_`).
|
108 |
+
> 3. It must be a valid folder name for your operating system.
|
109 |
+
>
|
110 |
+
> Some examples of valid custom dataset names are `mydataset`, `myDataset`, `my-dataset`, `mydataset1`, `my-dataset-1`, etc.
|
111 |
+
|
112 |
+
## 2. Features Extraction
|
113 |
+
|
114 |
+
### Content-based Pretrained Models Download
|
115 |
+
|
116 |
+
By default, we utilize the Whisper and ContentVec to extract content features. How to download them is detailed [here](../../../pretrained/README.md).
|
117 |
+
|
118 |
+
### Configuration
|
119 |
+
|
120 |
+
Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:
|
121 |
+
|
122 |
+
```json
|
123 |
+
// TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
|
124 |
+
"log_dir": "ckpts/svc",
|
125 |
+
"preprocess": {
|
126 |
+
// TODO: Fill in the output data path. The default value is "Amphion/data"
|
127 |
+
"processed_dir": "data",
|
128 |
+
...
|
129 |
+
},
|
130 |
+
```
|
131 |
+
|
132 |
+
### Run
|
133 |
+
|
134 |
+
Run `run.sh` for the preprocessing stage (set `--stage 1`).
|
135 |
+
|
136 |
+
```bash
|
137 |
+
sh egs/svc/MultipleContentsSVC/run.sh --stage 1
|
138 |
+
```
|
139 |
+
|
140 |
+
> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "1"`.
|
141 |
+
|
142 |
+
## 3. Training
|
143 |
+
|
144 |
+
### Configuration
|
145 |
+
|
146 |
+
We provide the default hyperparameters in `exp_config.json`. They work on a single NVIDIA GPU with 24 GB of memory. You can adjust them based on your GPU machines.
|
147 |
+
|
148 |
+
```json
|
149 |
+
"train": {
|
150 |
+
"batch_size": 32,
|
151 |
+
...
|
152 |
+
"adamw": {
|
153 |
+
"lr": 2.0e-4
|
154 |
+
},
|
155 |
+
...
|
156 |
+
}
|
157 |
+
```
|
158 |
+
|
159 |
+
### Train From Scratch
|
160 |
+
|
161 |
+
Run `run.sh` for the training stage (set `--stage 2`). Specify an experiment name when running the following command. The TensorBoard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`.
|
162 |
+
|
163 |
+
```bash
|
164 |
+
sh egs/svc/MultipleContentsSVC/run.sh --stage 2 --name [YourExptName]
|
165 |
+
```
|
166 |
+
|
167 |
+
### Train From Existing Source
|
168 |
+
|
169 |
+
We support training from existing source for various purposes. You can resume training the model from a checkpoint or fine-tune a model from another checkpoint.
|
170 |
+
|
171 |
+
Setting `--resume true`, the training will resume from the **latest checkpoint** by default. For example, if you want to resume training from the latest checkpoint in `Amphion/ckpts/svc/[YourExptName]/checkpoint`, run:
|
172 |
+
|
173 |
+
```bash
|
174 |
+
sh egs/svc/MultipleContentsSVC/run.sh --stage 2 --name [YourExptName] \
|
175 |
+
--resume true
|
176 |
+
```
|
177 |
+
|
178 |
+
You can choose a **specific checkpoint** to resume from with the `--resume_from_ckpt_path` argument. For example, if you want to resume training from the checkpoint `Amphion/ckpts/svc/[YourExptName]/checkpoint/[SpecificCheckpoint]`, run:
|
179 |
+
|
180 |
+
```bash
|
181 |
+
sh egs/svc/MultipleContentsSVC/run.sh --stage 2 --name [YourExptName] \
|
182 |
+
--resume true \
|
183 |
+
--resume_from_ckpt_path "Amphion/ckpts/svc/[YourExptName]/checkpoint/[SpecificCheckpoint]"
|
184 |
+
```
|
185 |
+
|
186 |
+
If you want to **fine-tune from another checkpoint**, just use `--resume_type` and set it to `"finetune"`. For example, if you want to fine-tune from the checkpoint `Amphion/ckpts/svc/[AnotherExperiment]/checkpoint/[SpecificCheckpoint]`, run:
|
187 |
+
|
188 |
+
```bash
|
189 |
+
sh egs/svc/MultipleContentsSVC/run.sh --stage 2 --name [YourExptName] \
|
190 |
+
--resume true \
|
191 |
+
--resume_from_ckpt_path "Amphion/ckpts/svc/[AnotherExperiment]/checkpoint/[SpecificCheckpoint]" \
|
192 |
+
--resume_type "finetune"
|
193 |
+
```
|
194 |
+
|
195 |
+
> **NOTE:** `--resume_type` is set to `"resume"` by default, so it is not necessary to specify it when resuming training.
|
196 |
+
>
|
197 |
+
> The difference between `"resume"` and `"finetune"` is that the `"finetune"` will **only** load the pretrained model weights from the checkpoint, while the `"resume"` will load all the training states (including optimizer, scheduler, etc.) from the checkpoint.
|
198 |
+
|
199 |
+
Here are some example scenarios to better understand how to use these arguments:
|
200 |
+
| Scenario | `--resume` | `--resume_from_ckpt_path` | `--resume_type` |
|
201 |
+
| ------ | -------- | ----------------------- | ------------- |
|
202 |
+
| You want to train from scratch | no | no | no |
|
203 |
+
| The machine breaks down during training and you want to resume training from the latest checkpoint | `true` | no | no |
|
204 |
+
| You find the latest model is overfitting and you want to re-train from the checkpoint before | `true` | `SpecificCheckpoint Path` | no |
|
205 |
+
| You want to fine-tune a model from another checkpoint | `true` | `SpecificCheckpoint Path` | `"finetune"` |
|
206 |
+
|
207 |
+
|
208 |
+
> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "0,1,2,3"`.
|
209 |
+
|
210 |
+
## 4. Inference/Conversion
|
211 |
+
|
212 |
+
### Pretrained Vocoder Download
|
213 |
+
|
214 |
+
We fine-tune the official BigVGAN pretrained model with over 120 hours of singing voice data. The benefits of fine-tuning have been investigated in our paper (see this [demo page](https://www.zhangxueyao.com/data/MultipleContentsSVC/vocoder.html)). The final pretrained singing voice vocoder is released [here](../../../pretrained/README.md#amphion-singing-bigvgan) (called `Amphion Singing BigVGAN`).
|
215 |
+
|
216 |
+
### Run
|
217 |
+
|
218 |
+
For inference/conversion, you need to specify the following configurations when running `run.sh`:
|
219 |
+
|
220 |
+
| Parameters | Description | Example |
|
221 |
+
| --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
222 |
+
| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/svc/[YourExptName]` |
|
223 |
+
| `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/svc/[YourExptName]/result` |
|
224 |
+
| `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `Amphion/data/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
|
225 |
+
| `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `Amphion/ckpts/svc/[YourExptName]/singers.json` to choose a trained speaker. | For opencpop dataset, the speaker name would be `opencpop_female1`. |
|
226 |
+
| `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |
|
227 |
+
|
228 |
+
For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run:
|
229 |
+
|
230 |
+
```bash
|
231 |
+
sh egs/svc/MultipleContentsSVC/run.sh --stage 3 --gpu "0" \
|
232 |
+
--infer_expt_dir ckpts/svc/[YourExptName] \
|
233 |
+
--infer_output_dir ckpts/svc/[YourExptName]/result \
|
234 |
+
--infer_source_audio_dir [Your Audios Folder] \
|
235 |
+
--infer_target_speaker "opencpop_female1" \
|
236 |
+
--infer_key_shift "autoshift"
|
237 |
+
```
|
238 |
+
|
239 |
+
## Citations
|
240 |
+
|
241 |
+
```bibtex
|
242 |
+
@inproceedings{zhang2024leveraging,
|
243 |
+
author={Zhang, Xueyao and Fang, Zihao and Gu, Yicheng and Chen, Haopeng and Zou, Lexiao and Zhang, Junan and Xue, Liumeng and Wu, Zhizheng},
|
244 |
+
title={Leveraging Diverse Semantic-based Audio Pretrained Models for Singing Voice Conversion},
|
245 |
+
booktitle={{IEEE} Spoken Language Technology Workshop, {SLT} 2024},
|
246 |
+
year={2024}
|
247 |
+
}
|
248 |
+
```
|
egs/svc/MultipleContentsSVC/exp_config.json
ADDED
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/svc/diffusion.json",
|
3 |
+
"model_type": "DiffWaveNetSVC",
|
4 |
+
"dataset": [
|
5 |
+
"m4singer",
|
6 |
+
"opencpop",
|
7 |
+
"opensinger",
|
8 |
+
"svcc",
|
9 |
+
"vctk"
|
10 |
+
],
|
11 |
+
"dataset_path": {
|
12 |
+
// TODO: Fill in your dataset path
|
13 |
+
"m4singer": "[M4Singer dataset path]",
|
14 |
+
"opencpop": "[Opencpop dataset path]",
|
15 |
+
"opensinger": "[OpenSinger dataset path]",
|
16 |
+
"svcc": "[SVCC dataset path]",
|
17 |
+
"vctk": "[VCTK dataset path]"
|
18 |
+
},
|
19 |
+
// TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
|
20 |
+
"log_dir": "ckpts/svc",
|
21 |
+
"preprocess": {
|
22 |
+
// TODO: Fill in the output data path. The default value is "Amphion/data"
|
23 |
+
"processed_dir": "data",
|
24 |
+
// Config for features extraction
|
25 |
+
"features_extraction_mode": "offline", // Online or offline features extraction ("offline" or "online")
|
26 |
+
"extract_mel": true,
|
27 |
+
"extract_pitch": true,
|
28 |
+
"extract_energy": true,
|
29 |
+
"extract_whisper_feature": true,
|
30 |
+
"extract_contentvec_feature": true,
|
31 |
+
"extract_wenet_feature": false,
|
32 |
+
"whisper_batch_size": 30, // decrease it if your GPU is out of memory
|
33 |
+
"contentvec_batch_size": 1,
|
34 |
+
// Fill in the content-based pretrained model's path
|
35 |
+
"contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
|
36 |
+
"wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
|
37 |
+
"wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
|
38 |
+
"whisper_model": "medium",
|
39 |
+
"whisper_model_path": "pretrained/whisper/medium.pt",
|
40 |
+
// Config for features usage
|
41 |
+
"use_mel": true,
|
42 |
+
"use_min_max_norm_mel": true,
|
43 |
+
"use_frame_pitch": true,
|
44 |
+
"use_frame_energy": true,
|
45 |
+
"use_spkid": true,
|
46 |
+
"use_whisper": true,
|
47 |
+
"use_contentvec": true,
|
48 |
+
"use_wenet": false,
|
49 |
+
"n_mel": 100,
|
50 |
+
"sample_rate": 24000
|
51 |
+
},
|
52 |
+
"model": {
|
53 |
+
"condition_encoder": {
|
54 |
+
// Config for features usage
|
55 |
+
"use_whisper": true,
|
56 |
+
"use_contentvec": true,
|
57 |
+
"use_wenet": false,
|
58 |
+
"whisper_dim": 1024,
|
59 |
+
"contentvec_dim": 256,
|
60 |
+
"wenet_dim": 512,
|
61 |
+
"use_singer_encoder": false,
|
62 |
+
"pitch_min": 50,
|
63 |
+
"pitch_max": 1100
|
64 |
+
},
|
65 |
+
"diffusion": {
|
66 |
+
"scheduler": "ddpm",
|
67 |
+
"scheduler_settings": {
|
68 |
+
"num_train_timesteps": 1000,
|
69 |
+
"beta_start": 1.0e-4,
|
70 |
+
"beta_end": 0.02,
|
71 |
+
"beta_schedule": "linear"
|
72 |
+
},
|
73 |
+
// Diffusion steps encoder
|
74 |
+
"step_encoder": {
|
75 |
+
"dim_raw_embedding": 128,
|
76 |
+
"dim_hidden_layer": 512,
|
77 |
+
"activation": "SiLU",
|
78 |
+
"num_layer": 2,
|
79 |
+
"max_period": 10000
|
80 |
+
},
|
81 |
+
// Diffusion decoder
|
82 |
+
"model_type": "bidilconv",
|
83 |
+
// bidilconv, unet2d, TODO: unet1d
|
84 |
+
"bidilconv": {
|
85 |
+
"base_channel": 512,
|
86 |
+
"n_res_block": 40,
|
87 |
+
"conv_kernel_size": 3,
|
88 |
+
"dilation_cycle_length": 4,
|
89 |
+
// specially, 1 means no dilation
|
90 |
+
"conditioner_size": 384
|
91 |
+
}
|
92 |
+
}
|
93 |
+
},
|
94 |
+
"train": {
|
95 |
+
"batch_size": 32,
|
96 |
+
"gradient_accumulation_step": 1,
|
97 |
+
"max_epoch": -1, // -1 means no limit
|
98 |
+
"save_checkpoint_stride": [
|
99 |
+
3,
|
100 |
+
50
|
101 |
+
],
|
102 |
+
"keep_last": [
|
103 |
+
3,
|
104 |
+
2
|
105 |
+
],
|
106 |
+
"run_eval": [
|
107 |
+
true,
|
108 |
+
true
|
109 |
+
],
|
110 |
+
"adamw": {
|
111 |
+
"lr": 2.0e-4
|
112 |
+
},
|
113 |
+
"reducelronplateau": {
|
114 |
+
"factor": 0.8,
|
115 |
+
"patience": 30,
|
116 |
+
"min_lr": 1.0e-4
|
117 |
+
},
|
118 |
+
"dataloader": {
|
119 |
+
"num_worker": 8,
|
120 |
+
"pin_memory": true
|
121 |
+
},
|
122 |
+
"sampler": {
|
123 |
+
"holistic_shuffle": false,
|
124 |
+
"drop_last": true
|
125 |
+
}
|
126 |
+
}
|
127 |
+
}
|
egs/svc/README.md
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Amphion Singing Voice Conversion (SVC) Recipe
|
2 |
+
|
3 |
+
## Quick Start
|
4 |
+
|
5 |
+
We provide a **[beginner recipe](MultipleContentsSVC)** to demonstrate how to train a cutting edge SVC model. Specifically, it is also an official implementation of the paper "[Leveraging Diverse Semantic-based Audio Pretrained Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (2024 IEEE Spoken Language Technology Workshop). Some demos can be seen [here](https://www.zhangxueyao.com/data/MultipleContentsSVC/index.html).
|
6 |
+
|
7 |
+
## Supported Model Architectures
|
8 |
+
|
9 |
+
The main idea of SVC is to first disentangle the speaker-agnostic representations from the source audio, and then inject the desired speaker information to synthesize the target, which usually utilizes an acoustic decoder and a subsequent waveform synthesizer (vocoder):
|
10 |
+
|
11 |
+
<br>
|
12 |
+
<div align="center">
|
13 |
+
<img src="../../imgs/svc/pipeline.png" width="70%">
|
14 |
+
</div>
|
15 |
+
<br>
|
16 |
+
|
17 |
+
Until now, Amphion SVC has supported the following features and models:
|
18 |
+
|
19 |
+
- **Speaker-agnostic Representations**:
|
20 |
+
- Content Features: Sourcing from [WeNet](https://github.com/wenet-e2e/wenet), [Whisper](https://github.com/openai/whisper), and [ContentVec](https://github.com/auspicious3000/contentvec).
|
21 |
+
- Prosody Features: F0 and energy.
|
22 |
+
- **Speaker Embeddings**:
|
23 |
+
- Speaker Look-Up Table.
|
24 |
+
- Reference Encoder (👨💻 developing): It can be used for zero-shot SVC.
|
25 |
+
- **Acoustic Decoders**:
|
26 |
+
- Diffusion-based models:
|
27 |
+
- **[DiffWaveNetSVC](MultipleContentsSVC)**: The encoder is based on Bidirectional Non-Causal Dilated CNN, which is similar to [WaveNet](https://arxiv.org/pdf/1609.03499.pdf), [DiffWave](https://openreview.net/forum?id=a-xFK8Ymz5J), and [DiffSVC](https://ieeexplore.ieee.org/document/9688219).
|
28 |
+
- **[DiffComoSVC](DiffComoSVC)** (👨💻 developing): The diffusion framework is based on [Consistency Model](https://proceedings.mlr.press/v202/song23a.html). It can significantly accelerate the inference process of the diffusion model.
|
29 |
+
- Transformer-based models:
|
30 |
+
- **[TransformerSVC](TransformerSVC)**: Encoder-only and Non-autoregressive Transformer Architecture.
|
31 |
+
- VAE- and Flow-based models:
|
32 |
+
- **[VitsSVC](VitsSVC)**: It is designed as a [VITS](https://arxiv.org/abs/2106.06103)-like model whose textual input is replaced by the content features, which is similar to [so-vits-svc](https://github.com/svc-develop-team/so-vits-svc).
|
33 |
+
- **Waveform Synthesizers (Vocoders)**:
|
34 |
+
- The supported vocoders can be seen in [Amphion Vocoder Recipe](../vocoder/README.md).
|
egs/svc/TransformerSVC/README.md
ADDED
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Transformer for Singing Voice Conversion
|
2 |
+
|
3 |
+
This is an implementation of **vanilla transformer encoder**/**conformer** as acoustic model for singing voice conversion.
|
4 |
+
|
5 |
+
There are four stages in total:
|
6 |
+
|
7 |
+
1. Data preparation
|
8 |
+
2. Features extraction
|
9 |
+
3. Training
|
10 |
+
4. Inference/conversion
|
11 |
+
|
12 |
+
> **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
|
13 |
+
> ```bash
|
14 |
+
> cd Amphion
|
15 |
+
> ```
|
16 |
+
|
17 |
+
## 1. Data Preparation
|
18 |
+
|
19 |
+
### Dataset Download
|
20 |
+
|
21 |
+
By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).
|
22 |
+
|
23 |
+
### Configuration
|
24 |
+
|
25 |
+
Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
|
26 |
+
|
27 |
+
```json
|
28 |
+
"dataset": [
|
29 |
+
"m4singer",
|
30 |
+
"opencpop",
|
31 |
+
"opensinger",
|
32 |
+
"svcc",
|
33 |
+
"vctk"
|
34 |
+
],
|
35 |
+
"dataset_path": {
|
36 |
+
// TODO: Fill in your dataset path
|
37 |
+
"m4singer": "[M4Singer dataset path]",
|
38 |
+
"opencpop": "[Opencpop dataset path]",
|
39 |
+
"opensinger": "[OpenSinger dataset path]",
|
40 |
+
"svcc": "[SVCC dataset path]",
|
41 |
+
"vctk": "[VCTK dataset path]"
|
42 |
+
},
|
43 |
+
```
|
44 |
+
|
45 |
+
## 2. Features Extraction
|
46 |
+
|
47 |
+
### Content-based Pretrained Models Download
|
48 |
+
|
49 |
+
By default, we utilize the Whisper and ContentVec to extract content features. How to download them is detailed [here](../../../pretrained/README.md).
|
50 |
+
|
51 |
+
### Configuration
|
52 |
+
|
53 |
+
Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:
|
54 |
+
|
55 |
+
```json
|
56 |
+
// TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
|
57 |
+
"log_dir": "ckpts/svc",
|
58 |
+
"preprocess": {
|
59 |
+
// TODO: Fill in the output data path. The default value is "Amphion/data"
|
60 |
+
"processed_dir": "data",
|
61 |
+
...
|
62 |
+
},
|
63 |
+
```
|
64 |
+
|
65 |
+
### Run
|
66 |
+
|
67 |
+
Run `run.sh` for the preprocessing stage (set `--stage 1`).
|
68 |
+
|
69 |
+
```bash
|
70 |
+
sh egs/svc/TransformerSVC/run.sh --stage 1
|
71 |
+
```
|
72 |
+
|
73 |
+
> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "1"`.
|
74 |
+
|
75 |
+
## 3. Training
|
76 |
+
|
77 |
+
### Configuration
|
78 |
+
Specify the detailed configuration for transformer block in `exp_config.json`. For key `type`, `conformer` and `transformer` are supported:
|
79 |
+
```json
|
80 |
+
"model": {
|
81 |
+
...
|
82 |
+
"transformer":{
|
83 |
+
// 'conformer' or 'transformer'
|
84 |
+
"type": "conformer",
|
85 |
+
"input_dim": 384,
|
86 |
+
"output_dim": 100,
|
87 |
+
"n_heads": 2,
|
88 |
+
"n_layers": 6,
|
89 |
+
"filter_channels":512,
|
90 |
+
"dropout":0.1,
|
91 |
+
}
|
92 |
+
}
|
93 |
+
```
|
94 |
+
We provide the default hyperparameters in `exp_config.json`. They work on a single NVIDIA GPU with 24 GB of memory. You can adjust them based on your GPU machines.
|
95 |
+
|
96 |
+
```json
|
97 |
+
"train": {
|
98 |
+
"batch_size": 32,
|
99 |
+
...
|
100 |
+
"adamw": {
|
101 |
+
"lr": 2.0e-4
|
102 |
+
},
|
103 |
+
...
|
104 |
+
}
|
105 |
+
```
|
106 |
+
|
107 |
+
### Run
|
108 |
+
|
109 |
+
Run `run.sh` for the training stage (set `--stage 2`). Specify an experiment name when running the following command. The TensorBoard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`.
|
110 |
+
|
111 |
+
```bash
|
112 |
+
sh egs/svc/TransformerSVC/run.sh --stage 2 --name [YourExptName]
|
113 |
+
```
|
114 |
+
|
115 |
+
> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "0,1,2,3"`.
|
116 |
+
|
117 |
+
## 4. Inference/Conversion
|
118 |
+
|
119 |
+
### Pretrained Vocoder Download
|
120 |
+
|
121 |
+
We fine-tune the official BigVGAN pretrained model with over 120 hours of singing voice data. The benefits of fine-tuning have been investigated in our paper (see this [demo page](https://www.zhangxueyao.com/data/MultipleContentsSVC/vocoder.html)). The final pretrained singing voice vocoder is released [here](../../../pretrained/README.md#amphion-singing-bigvgan) (called `Amphion Singing BigVGAN`).
|
122 |
+
|
123 |
+
### Run
|
124 |
+
|
125 |
+
For inference/conversion, you need to specify the following configurations when running `run.sh`:
|
126 |
+
|
127 |
+
| Parameters | Description | Example |
|
128 |
+
| --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
129 |
+
| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/svc/[YourExptName]` |
|
130 |
+
| `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/svc/[YourExptName]/result` |
|
131 |
+
| `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `Amphion/data/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
|
132 |
+
| `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `Amphion/ckpts/svc/[YourExptName]/singers.json` to choose a trained speaker. | For opencpop dataset, the speaker name would be `opencpop_female1`. |
|
133 |
+
| `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |
|
134 |
+
|
135 |
+
For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run:
|
136 |
+
|
137 |
+
```bash
|
138 |
+
cd Amphion
|
139 |
+
sh egs/svc/TransformerSVC/run.sh --stage 3 --gpu "0" \
|
140 |
+
--infer_expt_dir ckpts/svc/[YourExptName] \
|
141 |
+
--infer_output_dir ckpts/svc/[YourExptName]/result \
|
142 |
+
--infer_source_audio_dir [Your Audios Folder] \
|
143 |
+
--infer_target_speaker "opencpop_female1" \
|
144 |
+
--infer_key_shift "autoshift"
|
145 |
+
```
|
146 |
+
|
147 |
+
## Citations
|
148 |
+
|
149 |
+
```bibtex
|
150 |
+
@inproceedings{transformer,
|
151 |
+
author = {Ashish Vaswani and
|
152 |
+
Noam Shazeer and
|
153 |
+
Niki Parmar and
|
154 |
+
Jakob Uszkoreit and
|
155 |
+
Llion Jones and
|
156 |
+
Aidan N. Gomez and
|
157 |
+
Lukasz Kaiser and
|
158 |
+
Illia Polosukhin},
|
159 |
+
title = {Attention is All you Need},
|
160 |
+
booktitle = {{NIPS}},
|
161 |
+
pages = {5998--6008},
|
162 |
+
year = {2017}
|
163 |
+
}
|
164 |
+
```
|
egs/svc/TransformerSVC/exp_config.json
ADDED
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/transformer.json",
|
3 |
+
"model_type": "TransformerSVC",
|
4 |
+
"dataset": [
|
5 |
+
"m4singer",
|
6 |
+
"opencpop",
|
7 |
+
"opensinger",
|
8 |
+
"svcc",
|
9 |
+
"vctk"
|
10 |
+
],
|
11 |
+
"dataset_path": {
|
12 |
+
// TODO: Fill in your dataset path
|
13 |
+
"m4singer": "[M4Singer dataset path]",
|
14 |
+
"opencpop": "[Opencpop dataset path]",
|
15 |
+
"opensinger": "[OpenSinger dataset path]",
|
16 |
+
"svcc": "[SVCC dataset path]",
|
17 |
+
"vctk": "[VCTK dataset path]"
|
18 |
+
},
|
19 |
+
// TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
|
20 |
+
"log_dir": "ckpts/svc",
|
21 |
+
"preprocess": {
|
22 |
+
// TODO: Fill in the output data path. The default value is "Amphion/data"
|
23 |
+
"processed_dir": "data",
|
24 |
+
// Config for features extraction
|
25 |
+
"extract_mel": true,
|
26 |
+
"extract_pitch": true,
|
27 |
+
"extract_energy": true,
|
28 |
+
"extract_whisper_feature": true,
|
29 |
+
"extract_contentvec_feature": true,
|
30 |
+
"extract_wenet_feature": false,
|
31 |
+
"whisper_batch_size": 30, // decrease it if your GPU is out of memory
|
32 |
+
"contentvec_batch_size": 1,
|
33 |
+
// Fill in the content-based pretrained model's path
|
34 |
+
"contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
|
35 |
+
"wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
|
36 |
+
"wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
|
37 |
+
"whisper_model": "medium",
|
38 |
+
"whisper_model_path": "pretrained/whisper/medium.pt",
|
39 |
+
// Config for features usage
|
40 |
+
"use_mel": true,
|
41 |
+
"use_min_max_norm_mel": true,
|
42 |
+
"use_frame_pitch": true,
|
43 |
+
"use_frame_energy": true,
|
44 |
+
"use_spkid": true,
|
45 |
+
"use_whisper": true,
|
46 |
+
"use_contentvec": true,
|
47 |
+
"use_wenet": false,
|
48 |
+
"n_mel": 100,
|
49 |
+
"sample_rate": 24000
|
50 |
+
},
|
51 |
+
"model": {
|
52 |
+
"condition_encoder": {
|
53 |
+
// Config for features usage
|
54 |
+
"use_whisper": true,
|
55 |
+
"use_contentvec": true,
|
56 |
+
"use_wenet": false,
|
57 |
+
"whisper_dim": 1024,
|
58 |
+
"contentvec_dim": 256,
|
59 |
+
"wenet_dim": 512,
|
60 |
+
"use_singer_encoder": false,
|
61 |
+
"pitch_min": 50,
|
62 |
+
"pitch_max": 1100
|
63 |
+
},
|
64 |
+
"transformer": {
|
65 |
+
// 'conformer' or 'transformer'
|
66 |
+
"type": "conformer",
|
67 |
+
"input_dim": 384,
|
68 |
+
"output_dim": 100,
|
69 |
+
"n_heads": 2,
|
70 |
+
"n_layers": 6,
|
71 |
+
"filter_channels": 512,
|
72 |
+
"dropout": 0.1,
|
73 |
+
}
|
74 |
+
},
|
75 |
+
"train": {
|
76 |
+
"batch_size": 64,
|
77 |
+
"gradient_accumulation_step": 1,
|
78 |
+
"max_epoch": -1, // -1 means no limit
|
79 |
+
"save_checkpoint_stride": [
|
80 |
+
50,
|
81 |
+
50
|
82 |
+
],
|
83 |
+
"keep_last": [
|
84 |
+
5,
|
85 |
+
-1
|
86 |
+
],
|
87 |
+
"run_eval": [
|
88 |
+
false,
|
89 |
+
true
|
90 |
+
],
|
91 |
+
"adamw": {
|
92 |
+
"lr": 4.0e-4
|
93 |
+
},
|
94 |
+
"reducelronplateau": {
|
95 |
+
"factor": 0.8,
|
96 |
+
"patience": 10,
|
97 |
+
"min_lr": 1.0e-4
|
98 |
+
},
|
99 |
+
"dataloader": {
|
100 |
+
"num_worker": 8,
|
101 |
+
"pin_memory": true
|
102 |
+
},
|
103 |
+
"sampler": {
|
104 |
+
"holistic_shuffle": false,
|
105 |
+
"drop_last": true
|
106 |
+
}
|
107 |
+
}
|
108 |
+
}
|
egs/svc/VitsSVC/README.md
ADDED
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# VITS for Singing Voice Conversion
|
2 |
+
|
3 |
+
This is an implementation of VITS as acoustic model for end-to-end singing voice conversion. Adapted from [so-vits-svc](https://github.com/svc-develop-team/so-vits-svc), SoftVC content encoder is used to extract content features from the source audio. These feature vectors are directly fed into VITS without the need for conversion to a text-based intermediate representation.
|
4 |
+
|
5 |
+
There are four stages in total:
|
6 |
+
|
7 |
+
1. Data preparation
|
8 |
+
2. Features extraction
|
9 |
+
3. Training
|
10 |
+
4. Inference/conversion
|
11 |
+
|
12 |
+
> **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
|
13 |
+
> ```bash
|
14 |
+
> cd Amphion
|
15 |
+
> ```
|
16 |
+
|
17 |
+
## 1. Data Preparation
|
18 |
+
|
19 |
+
### Dataset Download
|
20 |
+
|
21 |
+
By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).
|
22 |
+
|
23 |
+
### Configuration
|
24 |
+
|
25 |
+
Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
|
26 |
+
|
27 |
+
```json
|
28 |
+
"dataset": [
|
29 |
+
"m4singer",
|
30 |
+
"opencpop",
|
31 |
+
"opensinger",
|
32 |
+
"svcc",
|
33 |
+
"vctk"
|
34 |
+
],
|
35 |
+
"dataset_path": {
|
36 |
+
// TODO: Fill in your dataset path
|
37 |
+
"m4singer": "[M4Singer dataset path]",
|
38 |
+
"opencpop": "[Opencpop dataset path]",
|
39 |
+
"opensinger": "[OpenSinger dataset path]",
|
40 |
+
"svcc": "[SVCC dataset path]",
|
41 |
+
"vctk": "[VCTK dataset path]"
|
42 |
+
},
|
43 |
+
```
|
44 |
+
|
45 |
+
## 2. Features Extraction
|
46 |
+
|
47 |
+
### Content-based Pretrained Models Download
|
48 |
+
|
49 |
+
By default, we utilize ContentVec and Whisper to extract content features. How to download them is detailed [here](../../../pretrained/README.md).
|
50 |
+
|
51 |
+
### Configuration
|
52 |
+
|
53 |
+
Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:
|
54 |
+
|
55 |
+
```json
|
56 |
+
// TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
|
57 |
+
"log_dir": "ckpts/svc",
|
58 |
+
"preprocess": {
|
59 |
+
// TODO: Fill in the output data path. The default value is "Amphion/data"
|
60 |
+
"processed_dir": "data",
|
61 |
+
...
|
62 |
+
},
|
63 |
+
```
|
64 |
+
|
65 |
+
### Run
|
66 |
+
|
67 |
+
Run the `run.sh` as the preprocess stage (set `--stage 1`).
|
68 |
+
|
69 |
+
```bash
|
70 |
+
sh egs/svc/VitsSVC/run.sh --stage 1
|
71 |
+
```
|
72 |
+
|
73 |
+
> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "1"`.
|
74 |
+
|
75 |
+
## 3. Training
|
76 |
+
|
77 |
+
### Configuration
|
78 |
+
|
79 |
+
We provide the default hyperparameters in `exp_config.json`. They work on a single NVIDIA GPU with 24 GB of memory. You can adjust them based on your GPU machines.
|
80 |
+
|
81 |
+
```json
|
82 |
+
"train": {
|
83 |
+
"batch_size": 32,
|
84 |
+
...
|
85 |
+
"adamw": {
|
86 |
+
"lr": 2.0e-4
|
87 |
+
},
|
88 |
+
...
|
89 |
+
}
|
90 |
+
```
|
91 |
+
|
92 |
+
### Run
|
93 |
+
|
94 |
+
Run the `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The TensorBoard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`.
|
95 |
+
|
96 |
+
```bash
|
97 |
+
sh egs/svc/VitsSVC/run.sh --stage 2 --name [YourExptName]
|
98 |
+
```
|
99 |
+
|
100 |
+
> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "0,1,2,3"`.
|
101 |
+
|
102 |
+
## 4. Inference/Conversion
|
103 |
+
|
104 |
+
### Run
|
105 |
+
|
106 |
+
For inference/conversion, you need to specify the following configurations when running `run.sh`:
|
107 |
+
|
108 |
+
| Parameters | Description | Example |
|
109 |
+
| --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
110 |
+
| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `[Your path to save logs and checkpoints]/[YourExptName]` |
|
111 |
+
| `--infer_output_dir` | The output directory to save inferred audios. | `[Your path to save logs and checkpoints]/[YourExptName]/result` |
|
112 |
+
| `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `[Your path to save processed data]/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
|
113 |
+
| `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `[Your path to save logs and checkpoints]/[YourExptName]/singers.json` to choose a trained speaker. | For opencpop dataset, the speaker name would be `opencpop_female1`. |
|
114 |
+
| `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |
|
115 |
+
|
116 |
+
For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run:
|
117 |
+
|
118 |
+
```bash
|
119 |
+
sh egs/svc/VitsSVC/run.sh --stage 3 --gpu "0" \
|
120 |
+
--infer_expt_dir Amphion/ckpts/svc/[YourExptName] \
|
121 |
+
--infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \
|
122 |
+
--infer_source_audio_dir [Your Audios Folder] \
|
123 |
+
--infer_target_speaker "opencpop_female1" \
|
124 |
+
--infer_key_shift "autoshift"
|
125 |
+
```
|
egs/svc/VitsSVC/exp_config.json
ADDED
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/vitssvc.json",
|
3 |
+
"model_type": "VitsSVC",
|
4 |
+
"dataset": [
|
5 |
+
"m4singer",
|
6 |
+
"opencpop",
|
7 |
+
"opensinger",
|
8 |
+
"svcc",
|
9 |
+
"vctk"
|
10 |
+
],
|
11 |
+
"dataset_path": {
|
12 |
+
// TODO: Fill in your dataset path
|
13 |
+
"m4singer": "[M4Singer dataset path]",
|
14 |
+
"opencpop": "[Opencpop dataset path]",
|
15 |
+
"opensinger": "[OpenSinger dataset path]",
|
16 |
+
"svcc": "[SVCC dataset path]",
|
17 |
+
"vctk": "[VCTK dataset path]"
|
18 |
+
},
|
19 |
+
"use_custom_dataset": [],
|
20 |
+
// TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
|
21 |
+
"log_dir": "ckpts/svc",
|
22 |
+
"preprocess": {
|
23 |
+
// TODO: Fill in the output data path. The default value is "Amphion/data"
|
24 |
+
"processed_dir": "data",
|
25 |
+
|
26 |
+
"n_mel": 100,
|
27 |
+
"sample_rate": 24000,
|
28 |
+
|
29 |
+
// contentvec
|
30 |
+
"extract_contentvec_feature": true,
|
31 |
+
"contentvec_sample_rate": 16000,
|
32 |
+
"contentvec_batch_size": 1,
|
33 |
+
"contentvec_frameshift": 0.02,
|
34 |
+
// whisper
|
35 |
+
"extract_whisper_feature": true,
|
36 |
+
"whisper_sample_rate": 16000,
|
37 |
+
"whisper_frameshift": 0.01,
|
38 |
+
"whisper_downsample_rate": 2,
|
39 |
+
// wenet
|
40 |
+
"extract_wenet_feature": true,
|
41 |
+
"wenet_downsample_rate": 4,
|
42 |
+
"wenet_frameshift": 0.01,
|
43 |
+
"wenet_sample_rate": 16000,
|
44 |
+
// Fill in the content-based pretrained model's path
|
45 |
+
"contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
|
46 |
+
"wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
|
47 |
+
"wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
|
48 |
+
"whisper_model": "medium",
|
49 |
+
"whisper_model_path": "pretrained/whisper/medium.pt",
|
50 |
+
|
51 |
+
"use_contentvec": true,
|
52 |
+
"use_whisper": true,
|
53 |
+
"use_wenet": false,
|
54 |
+
|
55 |
+
// Extract content features using dataloader
|
56 |
+
"pin_memory": true,
|
57 |
+
"num_workers": 8,
|
58 |
+
"content_feature_batch_size": 16,
|
59 |
+
|
60 |
+
},
|
61 |
+
"model": {
|
62 |
+
"condition_encoder": {
|
63 |
+
// Config for features usage
|
64 |
+
"merge_mode": "add",
|
65 |
+
"use_log_loudness": true,
|
66 |
+
"use_contentvec": true,
|
67 |
+
"use_whisper": true,
|
68 |
+
"use_wenet": false,
|
69 |
+
"whisper_dim": 1024,
|
70 |
+
"contentvec_dim": 256,
|
71 |
+
"wenet_dim": 512,
|
72 |
+
},
|
73 |
+
"vits": {
|
74 |
+
"inter_channels": 384,
|
75 |
+
"hidden_channels": 384,
|
76 |
+
"filter_channels": 256,
|
77 |
+
"n_heads": 2,
|
78 |
+
"n_layers": 6,
|
79 |
+
"kernel_size": 3,
|
80 |
+
"p_dropout": 0.1,
|
81 |
+
"n_flow_layer": 4,
|
82 |
+
"n_layers_q": 3,
|
83 |
+
"gin_channels": 256,
|
84 |
+
"n_speakers": 512,
|
85 |
+
"use_spectral_norm": false,
|
86 |
+
},
|
87 |
+
"generator": "nsfhifigan",
|
88 |
+
},
|
89 |
+
"train": {
|
90 |
+
"batch_size": 32,
|
91 |
+
"learning_rate": 2e-4,
|
92 |
+
"gradient_accumulation_step": 1,
|
93 |
+
"max_epoch": -1, // -1 means no limit
|
94 |
+
"save_checkpoint_stride": [
|
95 |
+
3,
|
96 |
+
50
|
97 |
+
],
|
98 |
+
"keep_last": [
|
99 |
+
3,
|
100 |
+
2
|
101 |
+
],
|
102 |
+
},
|
103 |
+
"inference": {
|
104 |
+
"batch_size": 1,
|
105 |
+
}
|
106 |
+
}
|