Spaces:

LPhilp1943
/

speech_2_speech_voice_cloning

Build error

App Files Files Community

speech_2_speech_voice_cloning / app.py

LPhilp1943

Create

41ad754 verified 7 months ago

raw

history blame

No virus

2.63 kB

	import gradio as gr
	import torchaudio
	from speechbrain.pretrained import EncoderClassifier, Tacotron2, HIFIGAN, ASR
	import os
	import soundfile as sf

	# Folder that the synthesis functions write their .wav results into.
	# exist_ok=True makes a restart with the folder already present a no-op.
	os.makedirs("output_audio", exist_ok=True)

	# Pretrained SpeechBrain pipelines, built once at import time and shared by
	# every request. `source` names the model repository and `savedir` the local
	# directory handed to SpeechBrain for that model's files.
	encoder = EncoderClassifier.from_hparams(
	    source="speechbrain/spkrec-ecapa-voxceleb",
	    savedir="models/encoder",
	)
	tacotron2 = Tacotron2.from_hparams(
	    source="speechbrain/tts-tacotron2-ljspeech",
	    savedir="models/tacotron2",
	)
	hifigan = HIFIGAN.from_hparams(
	    source="speechbrain/tts-hifigan-ljspeech",
	    savedir="models/hifigan",
	)
	# NOTE(review): SpeechBrain documents its ASR interface as
	# `EncoderDecoderASR`; confirm that the `ASR` name imported at the top of
	# the file resolves and that this repository id exists on the hub.
	asr = ASR.from_hparams(
	    source="speechbrain/asr-transformer-librispeech",
	    savedir="models/asr",
	)

	def speech_to_text(input_audio):
	    """Transcribe a recording with the module-level ``asr`` model.

	    Args:
	        input_audio: Location of the audio file; forwarded unchanged to
	            ``asr.transcribe_file``.

	    Returns:
	        The value produced by ``asr.transcribe_file`` for that file
	        (presumably the transcript text; the UI shows it in a Textbox).
	    """
	    # transcribe_file() reads the audio on its own, so the file is not
	    # decoded a second time here with torchaudio.load() just to be discarded.
	    return asr.transcribe_file(input_audio)

	def speech_to_speech(input_audio, target_text):
	    """Speak ``target_text`` using a speaker embedding taken from ``input_audio``.

	    Args:
	        input_audio: Audio file holding the reference voice; read with
	            ``torchaudio.load``.
	        target_text: Text handed to the Tacotron2 model for synthesis.

	    Returns:
	        The fixed path ``"output_audio/synthesized_speech.wav"`` that the
	        generated waveform was written to. Each call overwrites that file.
	    """
	    # Reference voice: bring the recording to 16 kHz before computing the
	    # speaker embedding (presumably the rate the encoder expects - confirm).
	    reference, sample_rate = torchaudio.load(input_audio)
	    if sample_rate != 16000:
	        to_16k = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
	        reference = to_16k(reference)
	    speaker_embedding = encoder.encode_batch(reference)

	    # NOTE(review): SpeechBrain documents Tacotron2.encode_text as taking only
	    # the text. Verify that this single-speaker model accepts the embedding
	    # argument; if it does not, this call raises and a multi-speaker TTS
	    # model is needed for actual voice cloning.
	    mel_output, _mel_length, _alignment = tacotron2.encode_text(target_text, speaker_embedding)
	    waveform = hifigan.decode_batch(mel_output)

	    # Flatten the vocoder output to a NumPy array and store it as a
	    # 22050 Hz wav (presumably the vocoder's output rate - confirm).
	    destination = "output_audio/synthesized_speech.wav"
	    samples = waveform.squeeze().cpu().numpy()
	    sf.write(destination, samples, 22050)
	    return destination

	def text_to_speech(text):
	    """Synthesize ``text`` with the Tacotron2 + HiFi-GAN models and save it.

	    Args:
	        text: Text handed to ``tacotron2.encode_text``.

	    Returns:
	        The fixed path ``"output_audio/text_to_speech.wav"`` that the
	        generated waveform was written to. Each call overwrites that file.
	    """
	    # Text -> mel spectrogram; only the first of the three returned values
	    # is used below.
	    mel_output, _mel_length, _alignment = tacotron2.encode_text(text)

	    # Mel spectrogram -> waveform.
	    waveform = hifigan.decode_batch(mel_output)

	    # Flatten to a NumPy array and store as a 22050 Hz wav (presumably the
	    # vocoder's output rate - confirm).
	    destination = "output_audio/text_to_speech.wav"
	    samples = waveform.squeeze().cpu().numpy()
	    sf.write(destination, samples, 22050)
	    return destination

	# gr.Interface wraps exactly one callable, so it cannot take dictionaries of
	# functions/inputs/outputs. Each task therefore gets its own Interface and
	# the three are combined as tabs. Components come from the top-level gr
	# namespace because the legacy gr.inputs / gr.outputs modules and the
	# `layout` argument are not part of current Gradio releases.
	_DESCRIPTION = "Upload an audio file or enter text to perform various speech processing tasks."

	iface = gr.TabbedInterface(
	    [
	        gr.Interface(
	            fn=speech_to_text,
	            # type="filepath" hands the handler a path string.
	            inputs=gr.Audio(type="filepath"),
	            outputs=gr.Textbox(label="Transcription"),
	            description=_DESCRIPTION,
	        ),
	        gr.Interface(
	            fn=text_to_speech,
	            inputs=gr.Textbox(label="Text"),
	            outputs=gr.Audio(type="filepath", label="Synthesized Speech"),
	            description=_DESCRIPTION,
	        ),
	        gr.Interface(
	            fn=speech_to_speech,
	            inputs=[
	                gr.Audio(type="filepath"),
	                gr.Textbox(label="Target Text"),
	            ],
	            outputs=gr.Audio(type="filepath", label="Synthesized Speech"),
	            description=_DESCRIPTION,
	        ),
	    ],
	    ["Speech to Text", "Text to Speech", "Speech to Speech"],
	    title="Speech Processing App",
	)

	# Start the web server only when this file is run as a script.
	if __name__ == "__main__":
	    iface.launch()