import os
from glob import glob
import time

import gradio as gr
import librosa
import soundfile as sf
import torch
import yaml
from huggingface_hub import hf_hub_download
from loguru import logger

from s3prl.nn import Featurizer
import s3prl_vc.models
from s3prl_vc.upstream.interface import get_upstream
from s3prl_vc.utils import read_hdf5
from s3prl_vc.vocoder import Vocoder

# ---------- Settings ----------
GPU_ID = '-1'
os.environ['CUDA_VISIBLE_DEVICES'] = GPU_ID
DEVICE = 'cuda' if GPU_ID != '-1' else 'cpu'

SERVER_PORT = 42208
SERVER_NAME = "0.0.0.0"
SSL_DIR = './keyble_ssl'

EXAMPLE_DIR = './examples'
en_examples = sorted(glob(os.path.join(EXAMPLE_DIR, "en", '*.wav')))
jp_examples = sorted(glob(os.path.join(EXAMPLE_DIR, "jp", '*.wav')))
zh_examples = sorted(glob(os.path.join(EXAMPLE_DIR, "zh", '*.wav')))

TRGSPKS = ["TEF1", "TEF2", "TEM1", "TEM2"]
ref_samples = {
    trgspk: sorted(glob(os.path.join("./ref_samples", trgspk, '*.wav')))
    for trgspk in TRGSPKS
}

# ---------- Logging ----------
logger.add('app.log', mode='a')
logger.info('============================= App restarted =============================')

# ---------- Download models ----------
logger.info('============================= Download models ===========================')
vocoder_paths = {
    "ckpt": hf_hub_download(repo_id="unilight/hifigan_vctk_plus_vcc2020", filename="checkpoint-2500000steps.pkl"),
    "config": hf_hub_download(repo_id="unilight/hifigan_vctk_plus_vcc2020", filename="config.yml"),
    "stats": hf_hub_download(repo_id="unilight/hifigan_vctk_plus_vcc2020", filename="stats.h5"),
}
vc_model_paths = {
    trgspk: {
        "ckpt": hf_hub_download(repo_id="unilight/s3prl-vc-vcc2020", filename=f"{trgspk}/checkpoint-10000steps.pkl"),
        "config": hf_hub_download(repo_id="unilight/s3prl-vc-vcc2020", filename=f"{trgspk}/config.yml"),
        "stats": hf_hub_download(repo_id="unilight/s3prl-vc-vcc2020", filename=f"{trgspk}/stats.h5"),
    }
    for trgspk in TRGSPKS
}

# ---------- Model ----------
vc_models = {}
for trgspk in TRGSPKS:
    logger.info(f'============================= Setting up model for {trgspk} =============')
    checkpoint_path = vc_model_paths[trgspk]["ckpt"]
    config_path = vc_model_paths[trgspk]["config"]
    stats_path = vc_model_paths[trgspk]["stats"]

    with open(config_path) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config["trg_stats"] = {
        "mean": torch.from_numpy(read_hdf5(stats_path, "mean")).float().to(DEVICE),
        "scale": torch.from_numpy(read_hdf5(stats_path, "scale")).float().to(DEVICE),
    }

    # define upstream model
    upstream_model = get_upstream(config["upstream"]).to(DEVICE)
    upstream_model.eval()
    upstream_featurizer = Featurizer(upstream_model).to(DEVICE)
    upstream_featurizer.load_state_dict(
        torch.load(checkpoint_path, map_location="cpu")["featurizer"]
    )
    upstream_featurizer.eval()

    # get model and load parameters
    model_class = getattr(s3prl_vc.models, config["model_type"])
    model = model_class(
        upstream_featurizer.output_size,
        config["num_mels"],
        config["sampling_rate"] / config["hop_size"] * upstream_featurizer.downsample_rate / 16000,
        config["trg_stats"],
        use_spemb=config.get("use_spk_emb", False),
        **config["model_params"],
    ).to(DEVICE)
    model.load_state_dict(torch.load(checkpoint_path, map_location="cpu")["model"])
    model = model.eval().to(DEVICE)
    logger.info(f"Loaded model parameters from {checkpoint_path}.")

    # load vocoder
    vocoder = Vocoder(
        vocoder_paths["ckpt"],
        vocoder_paths["config"],
        vocoder_paths["stats"],
        config["trg_stats"],
        DEVICE,
    )

    vc_models[trgspk] = {
        "upstream": upstream_model,
        "featurizer": upstream_featurizer,
        "decoder": model,
        "vocoder": vocoder,
    }
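
# NOTE: all four target-speaker pipelines (upstream, featurizer, decoder, vocoder) stay
# resident in memory after startup; with GPU_ID = '-1' above, inference runs entirely on CPU.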
"vocoder": vocoder } def predict(trgspk, wav_file): x, fs = librosa.load(wav_file, sr=16000) logger.info('wav file loaded') with torch.no_grad(): start_time = time.time() xs = torch.from_numpy(x).unsqueeze(0).float().to(DEVICE) ilens = torch.LongTensor([x.shape[0]]).to(DEVICE) all_hs, all_hlens = vc_models[trgspk]["upstream"](xs, ilens) logger.info('upstream done') hs, hlens = vc_models[trgspk]["featurizer"](all_hs, all_hlens) logger.info('featurizer done') outs, _ = vc_models[trgspk]["decoder"](hs, hlens, spk_embs=None) logger.info('downstream done') out = outs[0] y, sr = vc_models[trgspk]["vocoder"].decode(out) logger.info('vocoder done') sf.write( "out.wav", y.cpu().numpy(), 24000, "PCM_16", ) logger.info('write done') logger.info('RTF={}'.format( (time.time() - start_time) / (len(x) / 16000) )) return "out.wav" with gr.Blocks(title="S3PRL-VC: Any-to-one voice conversion demo on VCC2020") as demo: gr.Markdown( """ # S3PRL-VC: Any-to-one voice conversion demo on VCC2020 ### [[Paper (ICASSP2023)]](https://arxiv.org/abs/2110.06280) [[Paper(JSTSP)]](https://arxiv.org/abs/2207.04356) [[Code]](https://github.com/unilight/s3prl-vc) **S3PRL-VC** is a voice conversion (VC) toolkit for benchmarking self-supervised speech representations (S3Rs). The term **any-to-one** means that the system can convert from any unseen speaker to a pre-defined speaker given in training. In this demo, you can record your voice, and the model will convert your voice to one of the four pre-defined speakers. These four speakers come from the **voice conversion challenge (VCC) 2020**. You can listen to the samples to get a sense of what these speakers sound like. The **RTF** of the system is around **1.5~2.5**, i.e. if you recorded a 5 second long audio, it will take 5 * (1.5~2.5) = 7.5~12.5 seconds to generate the output. """ ) with gr.Row(): with gr.Column(): gr.Markdown("## Upload a .wav file here!") input_wav = gr.Audio(label="Source speech", source='upload', type='filepath') gr.Markdown("## Select a target speaker!") trgspk = gr.Radio(label="Target speaker", choices=["TEF1", "TEF2", "TEM1", "TEM2"]) gr.Markdown("### Here is what the target speaker sounds like!") ref_sample_wav1 = gr.Audio(label="Sample 1", type="filepath") ref_sample_wav2 = gr.Audio(label="Sample 2", type="filepath") trgspk.change(lambda trgspk: ref_samples[trgspk], inputs = trgspk, outputs = [ref_sample_wav1, ref_sample_wav2] ) convert_btn = gr.Button(value="Convert!") gr.Markdown("### You can use these examples if using a microphone is too troublesome!") gr.Markdown("I recorded the samples using my Macbook Pro, so there might be some noises.") gr.Examples( examples=en_examples, inputs=input_wav, label="English examples" ) gr.Examples( examples=jp_examples, inputs=input_wav, label="Japanese examples" ) gr.Examples( examples=zh_examples, inputs=input_wav, label="Mandarin examples" ) with gr.Column(): gr.Markdown("## Listen to the converted speech here!") output_wav = gr.Audio(type="filepath", label="Converted speech") convert_btn.click(predict, [trgspk, input_wav], output_wav) if __name__ == '__main__': try: demo.launch(debug=True, enable_queue=True, ) except KeyboardInterrupt as e: print(e) finally: demo.close()