s3prl-vc-vcc2020

Sleeping

s3prl-vc-vcc2020

File size: 8,009 Bytes


import os
import tempfile
import time
from glob import glob

import gradio as gr
import librosa
import soundfile as sf
import torch
import yaml
from huggingface_hub import hf_hub_download
from loguru import logger

import s3prl_vc.models
from s3prl.nn import Featurizer
from s3prl_vc.upstream.interface import get_upstream
from s3prl_vc.utils import read_hdf5
from s3prl_vc.vocoder import Vocoder


# ---------- Settings ----------
# GPU_ID is exported as CUDA_VISIBLE_DEVICES; the value '-1' selects the CPU,
# any other value selects 'cuda'.
GPU_ID = '-1'
os.environ['CUDA_VISIBLE_DEVICES'] = GPU_ID
if GPU_ID == '-1':
    DEVICE = 'cpu'
else:
    DEVICE = 'cuda'

# Server-related constants.
SERVER_PORT = 42208
SERVER_NAME = "0.0.0.0"
SSL_DIR = './keyble_ssl'


def _sorted_wavs(*dir_parts):
    """Return the sorted list of ``*.wav`` paths found in the joined directory."""
    wav_paths = glob(os.path.join(*dir_parts, '*.wav'))
    wav_paths.sort()
    return wav_paths


# Example source utterances, one list per language sub-directory.
EXAMPLE_DIR = './examples'
en_examples = _sorted_wavs(EXAMPLE_DIR, "en")
jp_examples = _sorted_wavs(EXAMPLE_DIR, "jp")
zh_examples = _sorted_wavs(EXAMPLE_DIR, "zh")

# Identifiers of the supported target speakers.
TRGSPKS = ["TEF1", "TEF2", "TEM1", "TEM2"]

# Reference recordings of each target speaker, keyed by speaker identifier.
ref_samples = {spk: _sorted_wavs("./ref_samples", spk) for spk in TRGSPKS}

# ---------- Logging ----------
# Register app.log as a log sink (mode='a') and mark the restart in the log.
logger.add('app.log', mode='a')
logger.info('============================= App restarted =============================')

# ---------- Download models ----------
logger.info('============================= Download models ===========================')

_VOCODER_REPO_ID = "unilight/hifigan_vctk_plus_vcc2020"
_VC_REPO_ID = "unilight/s3prl-vc-vcc2020"

# Local paths of the vocoder checkpoint, config and statistics files.
vocoder_paths = {
    key: hf_hub_download(repo_id=_VOCODER_REPO_ID, filename=filename)
    for key, filename in (
        ("ckpt", "checkpoint-2500000steps.pkl"),
        ("config", "config.yml"),
        ("stats", "stats.h5"),
    )
}

# Local paths of the per-speaker VC model files; inside the repository each
# speaker's files are stored under a folder named after the speaker.
vc_model_paths = {
    spk: {
        key: hf_hub_download(repo_id=_VC_REPO_ID, filename=f"{spk}/{filename}")
        for key, filename in (
            ("ckpt", "checkpoint-10000steps.pkl"),
            ("config", "config.yml"),
            ("stats", "stats.h5"),
        )
    }
    for spk in TRGSPKS
}

# ---------- Model ----------
# For every target speaker, build the four components used by predict():
# the upstream model, its featurizer, the decoder and the vocoder.
vc_models = {}
for trgspk in TRGSPKS:
    logger.info(f'============================= Setting up model for {trgspk} =============')
    checkpoint_path = vc_model_paths[trgspk]["ckpt"]
    config_path = vc_model_paths[trgspk]["config"]
    stats_path = vc_model_paths[trgspk]["stats"]
    # NOTE(review): yaml.Loader can construct arbitrary Python objects and the
    # config file is downloaded from the hub. Consider yaml.safe_load if the
    # configs only contain plain YAML types -- confirm before switching.
    with open(config_path) as f:
        config = yaml.load(f, Loader=yaml.Loader)

    # Target-speaker "mean" and "scale" statistics, read from the stats file
    # and moved to the inference device.
    config["trg_stats"] = {
        "mean": torch.from_numpy(read_hdf5(stats_path, "mean")).float().to(DEVICE),
        "scale": torch.from_numpy(read_hdf5(stats_path, "scale")).float().to(DEVICE),
    }

    # define upstream model
    # NOTE(review): one upstream model is instantiated per speaker; if every
    # speaker config names the same upstream it could presumably be shared to
    # save memory -- confirm before changing.
    upstream_model = get_upstream(config["upstream"]).to(DEVICE)
    upstream_model.eval()

    # Deserialize the checkpoint once; it holds both the featurizer weights
    # and the decoder weights.
    checkpoint = torch.load(checkpoint_path, map_location="cpu")

    upstream_featurizer = Featurizer(upstream_model).to(DEVICE)
    upstream_featurizer.load_state_dict(checkpoint["featurizer"])
    upstream_featurizer.eval()

    # get model and load parameters
    model_class = getattr(s3prl_vc.models, config["model_type"])
    model = model_class(
        upstream_featurizer.output_size,
        config["num_mels"],
        # Third positional argument: output frames per second
        # (sampling_rate / hop_size) scaled by the featurizer downsample rate
        # over 16000 -- presumably the decoder's resample ratio; confirm
        # against the model class signature.
        config["sampling_rate"]
        / config["hop_size"]
        * upstream_featurizer.downsample_rate
        / 16000,
        config["trg_stats"],
        use_spemb=config.get("use_spk_emb", False),
        **config["model_params"],
    ).to(DEVICE)
    model.load_state_dict(checkpoint["model"])
    model.eval()
    logger.info(f"Loaded model parameters from {checkpoint_path}.")

    # Both state dicts have been loaded; drop the raw checkpoint so it does
    # not stay referenced while the next speaker is set up.
    del checkpoint

    # load vocoder
    # A vocoder is built per speaker because it receives that speaker's
    # target statistics.
    vocoder = Vocoder(
        vocoder_paths["ckpt"],
        vocoder_paths["config"],
        vocoder_paths["stats"],
        config["trg_stats"],
        DEVICE,
    )

    vc_models[trgspk] = {
        "upstream": upstream_model,
        "featurizer": upstream_featurizer,
        "decoder": model,
        "vocoder": vocoder,
    }

def predict(trgspk, wav_file):
    """Convert the speech in ``wav_file`` to the voice of ``trgspk``.

    The audio is passed through the selected speaker's upstream model,
    featurizer, decoder and vocoder, and the result is written to a new WAV
    file.

    Args:
        trgspk: Key of ``vc_models`` selecting the target speaker.
        wav_file: Path of the source audio file; loaded with librosa at
            ``sr=16000``.

    Returns:
        Path of a newly created 16-bit PCM WAV file with the converted speech.

    Raises:
        gr.Error: If no source audio was given, the target speaker is not
            one of the loaded models, or the source audio is empty.
    """
    # Fail with a readable message instead of a KeyError / loader error when
    # the user presses the button before providing both inputs.
    if not wav_file:
        raise gr.Error("Please upload a source speech file first.")
    if trgspk not in vc_models:
        raise gr.Error("Please select a target speaker first.")

    x, _ = librosa.load(wav_file, sr=16000)
    logger.info('wav file loaded')
    # An empty signal would make the RTF computation divide by zero.
    if len(x) == 0:
        raise gr.Error("The uploaded source speech is empty.")

    components = vc_models[trgspk]
    with torch.no_grad():
        start_time = time.time()
        xs = torch.from_numpy(x).unsqueeze(0).float().to(DEVICE)
        ilens = torch.LongTensor([x.shape[0]]).to(DEVICE)

        all_hs, all_hlens = components["upstream"](xs, ilens)
        logger.info('upstream done')

        hs, hlens = components["featurizer"](all_hs, all_hlens)
        logger.info('featurizer done')

        outs, _ = components["decoder"](hs, hlens, spk_embs=None)
        logger.info('downstream done')

        out = outs[0]
        y, _ = components["vocoder"].decode(out)
        logger.info('vocoder done')

        # Write to a unique file per request: a fixed "out.wav" would be
        # overwritten by the next conversion while a previous result may
        # still be in use. The file is intentionally not deleted here because
        # the caller reads it after this function returns.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            out_path = tmp.name
        # NOTE(review): the rate returned by the vocoder is not used; the
        # output rate stays hard-coded to 24000 -- confirm the two agree.
        sf.write(
            out_path,
            y.cpu().numpy(),
            24000,
            "PCM_16",
        )
        logger.info('write done')
        # Real-time factor: processing time divided by input duration.
        logger.info('RTF={}'.format(
            (time.time() - start_time) / (len(x) / 16000)
        ))

    return out_path

def _ref_sample_pair(speaker):
    """Return exactly two reference-sample paths for ``speaker``.

    The speaker selector's change event updates two audio players, so the
    handler has to return two values. Missing samples (including an unknown
    or unset speaker) are padded with ``None``; surplus samples are dropped.
    """
    samples = ref_samples.get(speaker, [])[:2]
    return samples + [None] * (2 - len(samples))


# Page layout: the left column holds the inputs (source audio, target speaker,
# reference samples, examples), the right column holds the converted output.
with gr.Blocks(title="S3PRL-VC: Any-to-one voice conversion demo on VCC2020") as demo:
    gr.Markdown(
        """
        # S3PRL-VC: Any-to-one voice conversion demo on VCC2020

        ### [[Paper (ICASSP2023)]](https://arxiv.org/abs/2110.06280) [[Paper(JSTSP)]](https://arxiv.org/abs/2207.04356) [[Code]](https://github.com/unilight/s3prl-vc)

        **S3PRL-VC** is a voice conversion (VC) toolkit for benchmarking self-supervised speech representations (S3Rs). The term **any-to-one** means that the system can convert from any unseen speaker to a pre-defined speaker given in training.

        In this demo, you can record your voice, and the model will convert your voice to one of the four pre-defined speakers. These four speakers come from the **voice conversion challenge (VCC) 2020**. You can listen to the samples to get a sense of what these speakers sound like.

        The **RTF** of the system is around **1.5~2.5**, i.e. if you recorded a 5 second long audio, it will take 5 * (1.5~2.5) = 7.5~12.5 seconds to generate the output.
        """
    )

    with gr.Row():
        with gr.Column():
            gr.Markdown("## Upload a .wav file here!")
            input_wav = gr.Audio(label="Source speech", source='upload', type='filepath')

            gr.Markdown("## Select a target speaker!")
            # Offer exactly the speakers listed in TRGSPKS so the selector
            # cannot drift from the set of loaded models.
            trgspk = gr.Radio(label="Target speaker", choices=TRGSPKS)
            gr.Markdown("### Here is what the target speaker sounds like!")
            ref_sample_wav1 = gr.Audio(label="Sample 1", type="filepath")
            ref_sample_wav2 = gr.Audio(label="Sample 2", type="filepath")
            # Refresh both reference players whenever the selection changes.
            trgspk.change(
                _ref_sample_pair,
                inputs=trgspk,
                outputs=[ref_sample_wav1, ref_sample_wav2],
            )

            convert_btn = gr.Button(value="Convert!")
            gr.Markdown("### You can use these examples if using a microphone is too troublesome!")
            gr.Markdown("I recorded the samples using my Macbook Pro, so there might be some noises.")
            gr.Examples(
                examples=en_examples,
                inputs=input_wav,
                label="English examples"
            )
            gr.Examples(
                examples=jp_examples,
                inputs=input_wav,
                label="Japanese examples"
            )
            gr.Examples(
                examples=zh_examples,
                inputs=input_wav,
                label="Mandarin examples"
            )

        with gr.Column():
            gr.Markdown("## Listen to the converted speech here!")
            output_wav = gr.Audio(type="filepath", label="Converted speech")
        # Run the conversion on click and show the result in the output player.
        convert_btn.click(predict, [trgspk, input_wav], output_wav)

if __name__ == '__main__':
    # Launch the demo with debugging and request queueing enabled. A keyboard
    # interrupt is printed rather than propagated, and the demo is closed on
    # every exit path.
    launch_kwargs = dict(debug=True, enable_queue=True)
    try:
        demo.launch(**launch_kwargs)
    except KeyboardInterrupt as interrupt:
        print(interrupt)
    finally:
        demo.close()