import os
import time

import torch
import torch.multiprocessing
import wandb
from torch.utils.data import ConcatDataset

from Architectures.ToucanTTS.ToucanTTS import ToucanTTS
from Architectures.ToucanTTS.toucantts_train_loop_arbiter import train_loop
from Utility.corpus_preparation import prepare_tts_corpus
from Utility.path_to_transcript_dicts import *
from Utility.storage_config import MODELS_DIR
from Utility.storage_config import PREPROCESSING_DIR


def run(gpu_id, resume_checkpoint, finetune, model_dir, resume, use_wandb, wandb_resume_id, gpu_count):
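    """
    Assemble the multilingual training corpora and train the ToucanTTS meta checkpoint.

    All corpora are cached under PREPROCESSING_DIR, grouped into one dataset per
    language, and handed to the train loop arbiter. When gpu_count > 1, this
    function expects LOCAL_RANK to be set in the environment (e.g. by torchrun).
    """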
    # Training this model from scratch or finetuning this recipe yourself is not
    # recommended, but possible. The recommended use is to download the pretrained
    # model from the GitHub release page and finetune it on your desired data.
    datasets = list()

    base_dir = os.path.join(MODELS_DIR, "ToucanTTS_Meta")
    if model_dir is not None:
        meta_save_dir = model_dir
    else:
        meta_save_dir = base_dir
    os.makedirs(meta_save_dir, exist_ok=True)

    print("Preparing")

    if gpu_count > 1:
        rank = int(os.environ["LOCAL_RANK"])
        torch.cuda.set_device(rank)
        torch.distributed.init_process_group(backend="nccl")
    else:
        rank = 0
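
    # One corpus list per language; each list is merged into a single
    # ConcatDataset below, so the train loop sees exactly one dataset per language.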
    english_datasets = list()
    german_datasets = list()
    greek_datasets = list()
    spanish_datasets = list()
    finnish_datasets = list()
    russian_datasets = list()
    hungarian_datasets = list()
    dutch_datasets = list()
    french_datasets = list()
    portuguese_datasets = list()
    polish_datasets = list()
    italian_datasets = list()
    chinese_datasets = list()
    vietnamese_datasets = list()
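
    # ENGLISH
    # MLS English is split into 50 chunks, so the very large corpus can be
    # prepared and cached in manageable pieces.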
    chunk_count = 50
    chunks = split_dictionary_into_chunks(build_path_to_transcript_dict_mls_english(), split_n=chunk_count)
    for index in range(chunk_count):
        english_datasets.append(prepare_tts_corpus(transcript_dict=chunks[index],
                                                   corpus_dir=os.path.join(PREPROCESSING_DIR, f"mls_english_chunk_{index}"),
                                                   lang="eng",
                                                   gpu_count=gpu_count,
                                                   rank=rank))
    english_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_nancy,
                                               corpus_dir=os.path.join(PREPROCESSING_DIR, "Nancy"),
                                               lang="eng",
                                               gpu_count=gpu_count,
                                               rank=rank))
    english_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_ryanspeech,
                                               corpus_dir=os.path.join(PREPROCESSING_DIR, "Ryan"),
                                               lang="eng",
                                               gpu_count=gpu_count,
                                               rank=rank))
    english_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_ljspeech,
                                               corpus_dir=os.path.join(PREPROCESSING_DIR, "LJSpeech"),
                                               lang="eng",
                                               gpu_count=gpu_count,
                                               rank=rank))
    english_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_libritts_all_clean,
                                               corpus_dir=os.path.join(PREPROCESSING_DIR, "libri_all_clean"),
                                               lang="eng",
                                               gpu_count=gpu_count,
                                               rank=rank))
    english_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_vctk,
                                               corpus_dir=os.path.join(PREPROCESSING_DIR, "vctk"),
                                               lang="eng",
                                               gpu_count=gpu_count,
                                               rank=rank))
    english_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_nvidia_hifitts,
                                               corpus_dir=os.path.join(PREPROCESSING_DIR, "hifi"),
                                               lang="eng",
                                               gpu_count=gpu_count,
                                               rank=rank))
    english_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_CREMA_D,
                                               corpus_dir=os.path.join(PREPROCESSING_DIR, "cremad"),
                                               lang="eng",
                                               gpu_count=gpu_count,
                                               rank=rank))
    english_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_EmoV_DB,
                                               corpus_dir=os.path.join(PREPROCESSING_DIR, "emovdb"),
                                               lang="eng",
                                               gpu_count=gpu_count,
                                               rank=rank))
    english_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_RAVDESS,
                                               corpus_dir=os.path.join(PREPROCESSING_DIR, "ravdess"),
                                               lang="eng",
                                               gpu_count=gpu_count,
                                               rank=rank))
    english_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_ESDS,
                                               corpus_dir=os.path.join(PREPROCESSING_DIR, "esds"),
                                               lang="eng",
                                               gpu_count=gpu_count,
                                               rank=rank))
    english_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_blizzard_2013,
                                               corpus_dir=os.path.join(PREPROCESSING_DIR, "blizzard2013"),
                                               lang="eng",
                                               gpu_count=gpu_count,
                                               rank=rank))
    english_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_jenny,
                                               corpus_dir=os.path.join(PREPROCESSING_DIR, "jenny"),
                                               lang="eng",
                                               gpu_count=gpu_count,
                                               rank=rank))

    # GERMAN
    german_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_karlsson,
                                              corpus_dir=os.path.join(PREPROCESSING_DIR, "Karlsson"),
                                              lang="deu",
                                              gpu_count=gpu_count,
                                              rank=rank))
    german_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_eva,
                                              corpus_dir=os.path.join(PREPROCESSING_DIR, "Eva"),
                                              lang="deu",
                                              gpu_count=gpu_count,
                                              rank=rank))
    german_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_hokus,
                                              corpus_dir=os.path.join(PREPROCESSING_DIR, "Hokus"),
                                              lang="deu",
                                              gpu_count=gpu_count,
                                              rank=rank))
    german_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_bernd,
                                              corpus_dir=os.path.join(PREPROCESSING_DIR, "Bernd"),
                                              lang="deu",
                                              gpu_count=gpu_count,
                                              rank=rank))
    german_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_friedrich,
                                              corpus_dir=os.path.join(PREPROCESSING_DIR, "Friedrich"),
                                              lang="deu",
                                              gpu_count=gpu_count,
                                              rank=rank))
    german_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_hui_others,
                                              corpus_dir=os.path.join(PREPROCESSING_DIR, "hui_others"),
                                              lang="deu",
                                              gpu_count=gpu_count,
                                              rank=rank))
    german_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_thorsten_emotional(),
                                              corpus_dir=os.path.join(PREPROCESSING_DIR, "thorsten_emotional"),
                                              lang="deu",
                                              gpu_count=gpu_count,
                                              rank=rank))
    german_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_thorsten_neutral(),
                                              corpus_dir=os.path.join(PREPROCESSING_DIR, "thorsten_neutral"),
                                              lang="deu",
                                              gpu_count=gpu_count,
                                              rank=rank))
    german_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_thorsten_2022_10(),
                                              corpus_dir=os.path.join(PREPROCESSING_DIR, "thorsten_2022"),
                                              lang="deu",
                                              gpu_count=gpu_count,
                                              rank=rank))
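    # MLS German is likewise split into chunks and cached piece by piece.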
    chunk_count = 10
    chunks = split_dictionary_into_chunks(build_path_to_transcript_dict_mls_german(), split_n=chunk_count)
    for index in range(chunk_count):
        german_datasets.append(prepare_tts_corpus(transcript_dict=chunks[index],
                                                  corpus_dir=os.path.join(PREPROCESSING_DIR, f"mls_german_chunk_{index}"),
                                                  lang="deu",
                                                  gpu_count=gpu_count,
                                                  rank=rank))

    # FRENCH
    french_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_css10fr,
                                              corpus_dir=os.path.join(PREPROCESSING_DIR, "css10_French"),
                                              lang="fra",
                                              gpu_count=gpu_count,
                                              rank=rank))
    french_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_mls_french,
                                              corpus_dir=os.path.join(PREPROCESSING_DIR, "mls_french"),
                                              lang="fra",
                                              gpu_count=gpu_count,
                                              rank=rank))
    french_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_blizzard2023_ad_silence_removed,
                                              corpus_dir=os.path.join(PREPROCESSING_DIR, "ad_e"),
                                              lang="fra",
                                              gpu_count=gpu_count,
                                              rank=rank))
    french_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_blizzard2023_neb_silence_removed,
                                              corpus_dir=os.path.join(PREPROCESSING_DIR, "neb"),
                                              lang="fra",
                                              gpu_count=gpu_count,
                                              rank=rank))
    french_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_blizzard2023_neb_e_silence_removed,
                                              corpus_dir=os.path.join(PREPROCESSING_DIR, "neb_e"),
                                              lang="fra",
                                              gpu_count=gpu_count,
                                              rank=rank))
    french_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_synpaflex_norm_subset,
                                              corpus_dir=os.path.join(PREPROCESSING_DIR, "synpaflex"),
                                              lang="fra",
                                              gpu_count=gpu_count,
                                              rank=rank))
    french_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_siwis_subset,
                                              corpus_dir=os.path.join(PREPROCESSING_DIR, "siwis"),
                                              lang="fra",
                                              gpu_count=gpu_count,
                                              rank=rank))

    # SPANISH
    spanish_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_mls_spanish,
                                               corpus_dir=os.path.join(PREPROCESSING_DIR, "mls_spanish"),
                                               lang="spa",
                                               gpu_count=gpu_count,
                                               rank=rank))
    spanish_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_css10es,
                                               corpus_dir=os.path.join(PREPROCESSING_DIR, "css10_Spanish"),
                                               lang="spa",
                                               gpu_count=gpu_count,
                                               rank=rank))
    spanish_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_spanish_blizzard_train,
                                               corpus_dir=os.path.join(PREPROCESSING_DIR, "spanish_blizzard"),
                                               lang="spa",
                                               gpu_count=gpu_count,
                                               rank=rank))

    # CHINESE
    chinese_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_css10cmn,
                                               corpus_dir=os.path.join(PREPROCESSING_DIR, "css10_chinese"),
                                               lang="cmn",
                                               gpu_count=gpu_count,
                                               rank=rank))
    chinese_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_aishell3,
                                               corpus_dir=os.path.join(PREPROCESSING_DIR, "aishell3"),
                                               lang="cmn",
                                               gpu_count=gpu_count,
                                               rank=rank))

    # PORTUGUESE
    portuguese_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_mls_portuguese,
                                                  corpus_dir=os.path.join(PREPROCESSING_DIR, "mls_porto"),
                                                  lang="por",
                                                  gpu_count=gpu_count,
                                                  rank=rank))

    # POLISH
    polish_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_mls_polish,
                                              corpus_dir=os.path.join(PREPROCESSING_DIR, "mls_polish"),
                                              lang="pol",
                                              gpu_count=gpu_count,
                                              rank=rank))

    # ITALIAN
    italian_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_mls_italian,
                                               corpus_dir=os.path.join(PREPROCESSING_DIR, "mls_italian"),
                                               lang="ita",
                                               gpu_count=gpu_count,
                                               rank=rank))

    # DUTCH
    dutch_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_mls_dutch,
                                             corpus_dir=os.path.join(PREPROCESSING_DIR, "mls_dutch"),
                                             lang="nld",
                                             gpu_count=gpu_count,
                                             rank=rank))
    dutch_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_css10nl,
                                             corpus_dir=os.path.join(PREPROCESSING_DIR, "css10_Dutch"),
                                             lang="nld",
                                             gpu_count=gpu_count,
                                             rank=rank))

    # GREEK
    greek_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_css10el,
                                             corpus_dir=os.path.join(PREPROCESSING_DIR, "css10_Greek"),
                                             lang="ell",
                                             gpu_count=gpu_count,
                                             rank=rank))

    # FINNISH
    finnish_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_css10fi,
                                               corpus_dir=os.path.join(PREPROCESSING_DIR, "css10_Finnish"),
                                               lang="fin",
                                               gpu_count=gpu_count,
                                               rank=rank))

    # VIETNAMESE
    vietnamese_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_VIVOS_viet,
                                                  corpus_dir=os.path.join(PREPROCESSING_DIR, "VIVOS_viet"),
                                                  lang="vie",
                                                  gpu_count=gpu_count,
                                                  rank=rank))

    # RUSSIAN
    russian_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_css10ru,
                                               corpus_dir=os.path.join(PREPROCESSING_DIR, "css10_Russian"),
                                               lang="rus",
                                               gpu_count=gpu_count,
                                               rank=rank))

    # HUNGARIAN
    hungarian_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_dict_css10hu,
                                                 corpus_dir=os.path.join(PREPROCESSING_DIR, "css10_Hungarian"),
                                                 lang="hun",
                                                 gpu_count=gpu_count,
                                                 rank=rank))
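
    # Merge each language's corpora into a single ConcatDataset, so the train
    # loop arbiter receives exactly one dataset per language.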
    datasets.append(ConcatDataset(english_datasets))
    datasets.append(ConcatDataset(german_datasets))
    datasets.append(ConcatDataset(greek_datasets))
    datasets.append(ConcatDataset(spanish_datasets))
    datasets.append(ConcatDataset(finnish_datasets))
    datasets.append(ConcatDataset(russian_datasets))
    datasets.append(ConcatDataset(hungarian_datasets))
    datasets.append(ConcatDataset(dutch_datasets))
    datasets.append(ConcatDataset(french_datasets))
    datasets.append(ConcatDataset(portuguese_datasets))
    datasets.append(ConcatDataset(polish_datasets))
    datasets.append(ConcatDataset(italian_datasets))
    datasets.append(ConcatDataset(chinese_datasets))
    datasets.append(ConcatDataset(vietnamese_datasets))

    model = ToucanTTS()
    train_samplers = list()
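    # Multi-GPU case: pin the model to this process's device, wrap it in
    # DistributedDataParallel, and synchronize all processes before training.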
    if gpu_count > 1:
        model.to(rank)
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[rank],
            output_device=rank,
            find_unused_parameters=True,
        )
        torch.distributed.barrier()
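    # One RandomSampler per language dataset; these are passed to the train
    # loop together with the datasets themselves.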
    for train_set in datasets:
        train_samplers.append(torch.utils.data.RandomSampler(train_set))
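
    # Only rank 0 logs to Weights & Biases; a timestamped run name is generated
    # unless an existing run is being resumed.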
    if use_wandb:
        if rank == 0:
            wandb.init(
                name=f"{__name__.split('.')[-1]}_{time.strftime('%Y%m%d-%H%M%S')}" if wandb_resume_id is None else None,
                id=wandb_resume_id,  # this is None if not specified in the command line arguments
                resume="must" if wandb_resume_id is not None else None)
    train_loop(net=model,
               device=torch.device("cuda"),
               datasets=datasets,
               save_directory=meta_save_dir,
               path_to_checkpoint=resume_checkpoint,
               resume=resume,
               fine_tune=finetune,
               steps=200000,
               steps_per_checkpoint=2000,
               lr=0.0001,
               use_wandb=use_wandb,
               train_samplers=train_samplers,
               gpu_count=gpu_count)
    if use_wandb:
        wandb.finish()