"""Gradio app for retrieval-based voice conversion (RVC).

Records (or accepts) a source audio clip, runs it through a pre-trained
voice-conversion model, and returns the converted audio as a wav file.
"""

import gradio as gr
import librosa
import numpy as np
import torch
from scipy.io import wavfile

# Project-local helpers: model loading and the actual conversion call.
from model import load_model, convert_voice

# Load the pre-trained voice conversion model once at startup.
model = load_model("path_to_pretrained_model")  # TODO: point at the real checkpoint path


def voice_conversion(source_audio, target_voice):
    """Convert the audio at path *source_audio* into *target_voice* style.

    Parameters
    ----------
    source_audio : str
        Path to the input audio file (as delivered by ``gr.Audio(type="filepath")``).
    target_voice : str
        Identifier of the target voice style (one of the dropdown choices).

    Returns
    -------
    str
        Path of the written output wav file.
    """
    # sr=None keeps the file's native sample rate instead of librosa's
    # default resampling to 22 050 Hz, so the output is written at the
    # correct rate below. librosa returns a float32 mono signal.
    y, sr = librosa.load(source_audio, sr=None)
    input_audio = torch.from_numpy(y).unsqueeze(0)  # add batch dimension

    # Inference only — no autograd state needed.
    with torch.no_grad():
        converted_audio = convert_voice(model, input_audio, target_voice)

    # Drop the batch dimension and move back to a 1-D float32 numpy array
    # suitable for writing as PCM audio.
    converted_audio_np = (
        converted_audio.detach().cpu().squeeze().numpy().astype(np.float32)
    )

    # BUG FIX: librosa.output.write_wav was removed in librosa 0.8.0;
    # write the wav with scipy instead (note the argument order: rate
    # comes before the data array).
    output_file = "output_converted.wav"
    wavfile.write(output_file, sr, converted_audio_np)
    return output_file


def infer(source_audio, target_voice):
    """Gradio callback: run the conversion and return the output file path."""
    return voice_conversion(source_audio, target_voice)


# Gradio 4.x API: the input parameter is `sources` (plural, a list) — the
# old `source="microphone"` keyword was removed — and output audio uses
# type="filepath" ("file" is no longer an accepted value).
iface = gr.Interface(
    fn=infer,
    inputs=[
        gr.Audio(sources=["microphone", "upload"], type="filepath", label="Source Audio"),
        gr.Dropdown(["Voice1", "Voice2", "Voice3"], label="Target Voice"),
    ],
    outputs=gr.Audio(type="filepath", label="Converted Audio"),
    title="Retrieval-based Voice Conversion",
    description="Convert voice from a source audio to a target voice style.",
)

if __name__ == "__main__":
    iface.launch()