import tempfile
import subprocess

import torch
import soundfile as sf
import gradio as gr


def convert_to_16kHz_mono(input_file, output_file):
    """
    Convert an audio file to a 16 kHz, single-channel (mono) WAV using ffmpeg.

    Parameters:
        input_file (str): Path to the input audio file.
        output_file (str): Path to the output WAV file.
    """
    try:
        # Run the ffmpeg command: resample to 16 kHz and downmix to one channel.
        subprocess.run(
            ['ffmpeg', '-y', '-i', input_file, '-ar', '16000', '-ac', '1', output_file],
            check=True,
        )
        print(f"Conversion complete: {output_file}")
        return output_file
    except subprocess.CalledProcessError as e:
        print(f"An error occurred during conversion: {e}")
        raise


def create_temp_wav_file():
    """Create a named temporary WAV file and return its path."""
    temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    return temp_file.name


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the kNN-VC pipeline (WavLM encoder + prematched HiFi-GAN vocoder) from torch hub.
knn_vc = torch.hub.load('bshall/knn-vc', 'knn_vc', prematched=True, trust_repo=True, pretrained=True, device=device)


def convert_voice(src_wav_path: str, ref_wav_path: str, top_k: int):
    # Resample both utterances to the 16 kHz mono format WavLM expects.
    src_wav_path = convert_to_16kHz_mono(src_wav_path, create_temp_wav_file())
    ref_wav_path = convert_to_16kHz_mono(ref_wav_path, create_temp_wav_file())
    # Encode the source utterance and build the matching set from the reference.
    query_seq = knn_vc.get_features(src_wav_path)
    matching_set = knn_vc.get_matching_set([ref_wav_path])
    # Replace each source frame with the mean of its k nearest reference frames,
    # then vocode with HiFi-GAN.
    out_wav = knn_vc.match(query_seq, matching_set, topk=int(top_k))
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as converted_file:
        sf.write(converted_file.name, out_wav.cpu().numpy(), 16000, subtype="PCM_24")
    return converted_file.name


title = """

# KNN Voice Conversion

""" description = """ Voice Conversion With Just k-Nearest Neighbors. The source and reference utterance(s) are encoded into self-supervised features using WavLM. Each source feature is assigned to the mean of the k closest features from the reference. The resulting feature sequence is then vocoded with HiFi-GAN to arrive at the converted waveform output. """ article = """ If the model contributes to your research please cite the following work: Baas, M., van Niekerk, B., & Kamper, H. (2023). Voice conversion with just nearest neighbors. arXiv preprint arXiv:2305.18975. demo contributed by [@wetdog](https://github.com/wetdog) """ demo = gr.Blocks() with demo: gr.Markdown(title) gr.Markdown(description) gr.Interface( fn=convert_voice, inputs=[ gr.Audio(type='filepath'), gr.Audio(type='filepath'), #gr.File(file_count="multiple", type="filepath", label="Reference Audio Files"), gr.Slider( 3, 10, value=4, step=1, label="Top-k", info=f"These default settings provide pretty good results, but feel free to modify the kNN topk", )], outputs=[gr.Audio(type='filepath')], allow_flagging=False,) gr.Markdown(article) demo.queue(max_size=10) demo.launch(show_api=False, server_name="0.0.0.0", server_port=7860)