import os
import sys
import subprocess
import gradio as gr
import torch
import soundfile as sf
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import librosa
from TTS.api import TTS
from TTS.utils.manage import ModelManager

def install_sentencepiece():
    try:
        # Try to install the sentencepiece Python package via pip
        subprocess.check_call([sys.executable, "-m", "pip", "install", "sentencepiece"])
    except subprocess.CalledProcessError:
        # Fall back to installing the native SentencePiece/protobuf libraries via apt if pip fails
        if os.name == "posix":
            os.system("sudo apt-get install -y libprotobuf10 protobuf-compiler libprotobuf-dev")
            os.system("sudo apt-get install -y libsentencepiece-dev")
        else:
            raise OSError("Automatic installation of SentencePiece is not supported on this OS")

# Attempt to install SentencePiece before loading the models
install_sentencepiece()
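# A hedged refinement (not in the original app): the pip call could be skipped when the
# package is already importable, e.g.
#   import importlib.util
#   if importlib.util.find_spec("sentencepiece") is None:
#       install_sentencepiece()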

# Agree to the Coqui TTS terms of service and create the output directory
os.environ["COQUI_TOS_AGREED"] = "1"
os.makedirs("output_audio", exist_ok=True)
# Initialize ASR model
asr_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
asr_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
asr_model.eval()

# Set up the Coqui model manager and load the multilingual XTTS v2 voice-cloning model on CPU
tts_manager = ModelManager()
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
tts = TTS(model_name, gpu=False)
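# The manager above is not strictly needed to load XTTS v2, but it can be used to browse
# other Coqui checkpoints; a minimal sketch (assuming list_models() returns the registered
# model names, it is not called in this app):
#   for name in tts_manager.list_models():
#       print(name)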

def resample_audio(input_audio_path, target_sr=16000):
    waveform, sr = sf.read(input_audio_path)
    # Mix multi-channel recordings down to mono so the ASR model gets a 1-D array
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)
    if sr != target_sr:
        waveform = librosa.resample(waveform, orig_sr=sr, target_sr=target_sr)
    return waveform

def speech_to_text(input_audio_path):
    waveform = resample_audio(input_audio_path)
    # The processor expects 16 kHz mono audio
    input_values = asr_processor(waveform, sampling_rate=16000, return_tensors="pt").input_values
    with torch.no_grad():
        logits = asr_model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = asr_processor.batch_decode(predicted_ids)[0]
    return transcription.strip()

def text_to_speech(text, speaker_wav_path, output_path="output_audio/output.wav"):
    if not text.strip():
        # Return None so the gr.Audio output stays empty instead of receiving a message string
        return None
    # XTTS v2 is multilingual, so an explicit language code is required
    tts.tts_to_file(text=text, file_path=output_path, speaker_wav=speaker_wav_path, language="en")
    return output_path

def speech_to_speech(input_audio, text_input=None):
    speaker_wav_path = input_audio
    # Fall back to ASR when no text was typed (an empty Textbox yields "")
    if not text_input or not text_input.strip():
        text_input = speech_to_text(input_audio)
    return text_to_speech(text_input, speaker_wav_path)

iface = gr.Interface(
    fn=speech_to_speech,
    inputs=[gr.Audio(type="filepath", label="Input speech"),
            gr.Textbox(label="Optional text (leave empty to transcribe the input audio)")],
    outputs=gr.Audio(label="Synthesized speech"),
)
iface.launch()
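
# A quick way to exercise the pipeline outside the web UI (a hedged sketch; sample.wav is a
# hypothetical 16 kHz mono recording, not part of this repo):
#   transcription = speech_to_text("sample.wav")
#   print(transcription)
#   print(text_to_speech(transcription, "sample.wav"))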