import os

import torch
from trainer import Trainer, TrainerArgs

from TTS.bin.compute_embeddings import compute_embeddings
from TTS.bin.resample import resample_files
from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs, VitsAudioConfig
from TTS.utils.downloaders import download_vctk
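
# Limit the number of CPU threads used by PyTorch; tune this to your machine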
torch.set_num_threads(24)

"""
This recipe replicates the first experiment proposed in the YourTTS paper (https://arxiv.org/abs/2112.02418).
The YourTTS model is based on the VITS model; however, it uses external speaker embeddings extracted from a pre-trained speaker encoder and has small architecture changes.
In addition, YourTTS can be trained on multilingual data; however, this recipe replicates single-language training using the VCTK dataset.
If you are interested in multilingual training, the parameters that should be enabled for it are commented out on the VitsArgs instance below.
In addition, you will need to add the extra datasets, following VCTK as an example.
"""
|
CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))
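
# Name of the run for the Trainer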
RUN_NAME = "YourTTS-EN-VCTK"
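
# Path where the configs, checkpoints, and training logs will be saved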
OUT_PATH = os.path.dirname(os.path.abspath(__file__))
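
# To restore training or do transfer learning, set here the path to a model checkpoint; None trains from scratch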
RESTORE_PATH = None
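
# Useful for debugging: skip the training epochs and only run evaluation and synthesize the test sentences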
SKIP_TRAIN_EPOCH = False
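
# Batch size used for training and evaluation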
BATCH_SIZE = 32
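
# Training sampling rate; the downloaded dataset audios are resampled to this rate below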
SAMPLE_RATE = 16000
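
# Maximum audio length in seconds used in training; longer audios are ignored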
MAX_AUDIO_LEN_IN_SECONDS = 10
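
### Download VCTK dataset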
VCTK_DOWNLOAD_PATH = os.path.join(CURRENT_PATH, "VCTK")
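
# Number of threads used when resampling the audio files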
NUM_RESAMPLE_THREADS = 10
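
# Download and resample VCTK only if it is not available yet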
if not os.path.exists(VCTK_DOWNLOAD_PATH):
    print(">>> Downloading VCTK dataset:")
    download_vctk(VCTK_DOWNLOAD_PATH)
    resample_files(VCTK_DOWNLOAD_PATH, SAMPLE_RATE, file_ext="flac", n_jobs=NUM_RESAMPLE_THREADS)
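
# Init the VCTK dataset config; the listed speakers are excluded from training (held out, e.g., for zero-shot evaluation)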
vctk_config = BaseDatasetConfig(
    formatter="vctk",
    dataset_name="vctk",
    meta_file_train="",
    meta_file_val="",
    path=VCTK_DOWNLOAD_PATH,
    language="en",
    ignored_speakers=[
        "p261",
        "p225",
        "p294",
        "p347",
        "p238",
        "p234",
        "p248",
        "p335",
        "p245",
        "p326",
        "p302",
    ],
)
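
# Add here all dataset configs; in this recipe we train only with the VCTK dataset.
# Speaker embeddings (d-vectors) are computed below for every dataset in this list.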
DATASETS_CONFIG_LIST = [vctk_config]
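
### Extract speaker embeddings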
SPEAKER_ENCODER_CHECKPOINT_PATH = (
    "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar"
)
SPEAKER_ENCODER_CONFIG_PATH = "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json"
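
# List of speaker embedding (d-vector) files, one per dataset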
D_VECTOR_FILES = []
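
# Iterate over the dataset configs and compute the speaker embeddings if they are not computed yet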
for dataset_conf in DATASETS_CONFIG_LIST:
    embeddings_file = os.path.join(dataset_conf.path, "speakers.pth")
    if not os.path.isfile(embeddings_file):
        print(f">>> Computing the speaker embeddings for the {dataset_conf.dataset_name} dataset")
        compute_embeddings(
            SPEAKER_ENCODER_CHECKPOINT_PATH,
            SPEAKER_ENCODER_CONFIG_PATH,
            embeddings_file,
            old_speakers_file=None,
            config_dataset_path=None,
            formatter_name=dataset_conf.formatter,
            dataset_name=dataset_conf.dataset_name,
            dataset_path=dataset_conf.path,
            meta_file_train=dataset_conf.meta_file_train,
            meta_file_val=dataset_conf.meta_file_val,
            disable_cuda=False,
            no_eval=False,
        )
    D_VECTOR_FILES.append(embeddings_file)
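
# Audio config used in training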
audio_config = VitsAudioConfig(
    sample_rate=SAMPLE_RATE,
    hop_length=256,
    win_length=1024,
    fft_size=1024,
    mel_fmin=0.0,
    mel_fmax=None,
    num_mels=80,
)
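
# Init VitsArgs, setting the arguments that are needed for the YourTTS model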
model_args = VitsArgs(
    d_vector_file=D_VECTOR_FILES,
    use_d_vector_file=True,
    d_vector_dim=512,
    num_layers_text_encoder=10,
    speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
    speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
    resblock_type_decoder="2",
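    # The options below are disabled in this single-language recipe. They sketch the switches the
    # docstring refers to, assuming the VitsArgs field names of recent Coqui TTS releases:
    # use_speaker_encoder_as_loss=True,  # enables the Speaker Consistency Loss (SCL) described in the paper
    # use_language_embedding=True,  # enables multilingual training
    # embedded_language_dim=4,  # size of the learned language embedding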
)
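
# General training config; here you can change the batch size and other useful training parameters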
config = VitsConfig(
    output_path=OUT_PATH,
    model_args=model_args,
    run_name=RUN_NAME,
    project_name="YourTTS",
    run_description="""
        - Original YourTTS trained using VCTK dataset
    """,
    dashboard_logger="tensorboard",
    logger_uri=None,
    audio=audio_config,
    batch_size=BATCH_SIZE,
    batch_group_size=48,
    eval_batch_size=BATCH_SIZE,
    num_loader_workers=8,
    eval_split_max_size=256,
    print_step=50,
    plot_step=100,
    log_model_step=1000,
    save_step=5000,
    save_n_checkpoints=2,
    save_checkpoints=True,
    target_loss="loss_1",
    print_eval=False,
    use_phonemes=False,
    phonemizer="espeak",
    phoneme_language="en",
    compute_input_seq_cache=True,
    add_blank=True,
    text_cleaner="multilingual_cleaners",
    characters=CharactersConfig(
        characters_class="TTS.tts.models.vits.VitsCharacters",
        pad="_",
        eos="&",
        bos="*",
        blank=None,
        characters="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00af\u00b7\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e6\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u00ff\u0101\u0105\u0107\u0113\u0119\u011b\u012b\u0131\u0142\u0144\u014d\u0151\u0153\u015b\u016b\u0171\u017a\u017c\u01ce\u01d0\u01d2\u01d4\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044a\u044b\u044c\u044d\u044e\u044f\u0451\u0454\u0456\u0457\u0491\u2013!'(),-.:;? ",
        punctuations="!'(),-.:;? ",
        phonemes="",
        is_unique=True,
        is_sorted=True,
    ),
    phoneme_cache_path=None,
    precompute_num_workers=12,
    start_by_longest=True,
    datasets=DATASETS_CONFIG_LIST,
    cudnn_benchmark=False,
    max_audio_len=SAMPLE_RATE * MAX_AUDIO_LEN_IN_SECONDS,
    mixed_precision=False,
    test_sentences=[
        [
            "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
            "VCTK_p277",
            None,
            "en",
        ],
        [
            "Be a voice, not an echo.",
            "VCTK_p239",
            None,
            "en",
        ],
        [
            "I'm sorry Dave. I'm afraid I can't do that.",
            "VCTK_p258",
            None,
            "en",
        ],
        [
            "This cake is great. It's so delicious and moist.",
            "VCTK_p244",
            None,
            "en",
        ],
        [
            "Prior to November 22, 1963.",
            "VCTK_p305",
            None,
            "en",
        ],
    ],
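    # Enable the weighted sampler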
    use_weighted_sampler=True,
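    # Ensures that all speakers are seen in the training batches equally, no matter how many samples each speaker has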
    weighted_sampler_attrs={"speaker_name": 1.0},
    weighted_sampler_multipliers={},
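    # Set the Speaker Consistency Loss (SCL) weight α to 9.0, as in the YourTTS paper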
    speaker_encoder_loss_alpha=9.0,
)
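
# Load all the dataset samples and split them into training and evaluation sets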
train_samples, eval_samples = load_tts_samples(
    config.datasets,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)
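
# Init the model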
model = Vits.init_from_config(config)
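
# Init the trainer and start training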
trainer = Trainer(
    TrainerArgs(restore_path=RESTORE_PATH, skip_train_epoch=SKIP_TRAIN_EPOCH),
    config,
    output_path=OUT_PATH,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)
trainer.fit()