Spaces:

LPhilp1943
/

speech_2_speech_voice_cloning

Build error

App Files Files Community

speech_2_speech_voice_cloning / app.py

LPhilp1943

Update app.py

8dc6b2e verified 7 months ago

raw

history blame

No virus

2.44 kB

	import gradio as gr
	import os
	import torch
	import soundfile as sf
	from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, VitsModel, AutoTokenizer

	# Generated audio is written here; create the folder up front if it is missing.
	os.makedirs("output_audio", exist_ok=True)

	# Hub checkpoint ids, named once so each model and its companion
	# processor/tokenizer are guaranteed to come from the same checkpoint.
	_ASR_CHECKPOINT = "facebook/wav2vec2-large-960h"
	_TTS_CHECKPOINT = "facebook/mms-tts-eng"

	# Speech recognition: Wav2Vec2 processor + CTC model.
	asr_processor = Wav2Vec2Processor.from_pretrained(_ASR_CHECKPOINT)
	asr_model = Wav2Vec2ForCTC.from_pretrained(_ASR_CHECKPOINT)

	# Speech synthesis: VITS model + its tokenizer.
	tts_model = VitsModel.from_pretrained(_TTS_CHECKPOINT)
	tts_tokenizer = AutoTokenizer.from_pretrained(_TTS_CHECKPOINT)

	def speech_to_text(input_audio):
	    """Transcribe a recording to text with the Wav2Vec2 CTC model.

	    Args:
	        input_audio: Path (or file-like object) readable by ``soundfile.read``.
	            ``None`` means no audio was supplied.

	    Returns:
	        The decoded transcription, or an empty string when there is no
	        audio to transcribe.
	    """
	    if input_audio is None:
	        return ""

	    # always_2d yields a (frames, channels) array whatever the channel count,
	    # so mono and stereo files can be handled by the same code path.
	    waveform, sr = sf.read(input_audio, dtype="float32", always_2d=True)

	    # The acoustic model consumes a single channel: average the channels.
	    samples = torch.from_numpy(waveform).mean(dim=1)
	    if samples.numel() == 0:
	        return ""

	    # The processor expects audio at the rate the model was trained on.
	    # Resample when the file differs instead of passing the wrong rate through.
	    target_sr = asr_processor.feature_extractor.sampling_rate
	    if sr != target_sr:
	        new_length = max(1, round(samples.numel() * target_sr / sr))
	        # NOTE: linear interpolation has no anti-aliasing filter; it avoids a
	        # new dependency but a dedicated resampler would give better quality.
	        samples = torch.nn.functional.interpolate(
	            samples.view(1, 1, -1),
	            size=new_length,
	            mode="linear",
	            align_corners=False,
	        ).view(-1)

	    input_values = asr_processor(
	        samples.numpy(), sampling_rate=target_sr, return_tensors="pt"
	    ).input_values

	    # Greedy CTC decoding: pick the most likely token at every frame.
	    with torch.no_grad():
	        logits = asr_model(input_values).logits
	    predicted_ids = torch.argmax(logits, dim=-1)

	    # batch_decode returns one string per batch item; there is exactly one here.
	    return asr_processor.batch_decode(predicted_ids)[0]

	def text_to_speech(text):
	    """Synthesize speech for ``text`` and save it as a WAV file.

	    Args:
	        text: The text to speak.

	    Returns:
	        Path of the written WAV file (``output_audio/text_to_speech.wav``).
	        The file is overwritten on every call.
	    """
	    # Tokenize the text and run the VITS model without tracking gradients.
	    inputs = tts_tokenizer(text, return_tensors="pt")
	    with torch.no_grad():
	        output = tts_model(**inputs).waveform
	    # Drop the batch dimension so a 1-D sample array is written.
	    waveform = output.squeeze().cpu().numpy()

	    output_path = "output_audio/text_to_speech.wav"
	    # Write at the rate the model actually generates. A hard-coded 22050 Hz
	    # makes the 16 kHz output of this checkpoint play too fast and too high.
	    sf.write(output_path, waveform, tts_model.config.sampling_rate)

	    return output_path

	def speech_to_speech(input_audio, target_text):
	    """Return the path of a WAV file speaking ``target_text``.

	    ``input_audio`` is accepted but not used: the recording is neither
	    transcribed nor analysed, and the result comes from ``text_to_speech``
	    applied to ``target_text`` alone.
	    """
	    synthesized_path = text_to_speech(target_text)
	    return synthesized_path

	# Shared blurb shown on every tab (TabbedInterface itself has no description).
	_APP_DESCRIPTION = "This app uses Facebook's Wav2Vec 2.0 for speech-to-text and VITS for text-to-speech."

	# gr.Interface wraps exactly one callable, so each feature gets its own
	# Interface and the three are combined into tabs. Audio inputs use
	# type="filepath" because the handlers read the recording from disk.
	iface = gr.TabbedInterface(
	    interface_list=[
	        gr.Interface(
	            fn=speech_to_text,
	            inputs=gr.Audio(type="filepath", label="Speech to Text"),
	            outputs=gr.Textbox(label="Transcription"),
	            description=_APP_DESCRIPTION,
	        ),
	        gr.Interface(
	            fn=text_to_speech,
	            inputs=gr.Textbox(label="Text to Speech"),
	            outputs=gr.Audio(label="Synthesized Speech"),
	            description=_APP_DESCRIPTION,
	        ),
	        gr.Interface(
	            fn=speech_to_speech,
	            inputs=[
	                gr.Audio(type="filepath", label="Speech to Speech Input"),
	                gr.Textbox(label="Target Text for Speech to Speech"),
	            ],
	            outputs=gr.Audio(label="Speech to Speech Output"),
	            description=_APP_DESCRIPTION,
	        ),
	    ],
	    tab_names=["Speech to Text", "Text to Speech", "Speech to Speech"],
	    title="Speech Processing Application",
	)

	# Start the Gradio server only when this file is run as a script,
	# not when it is imported by another module.
	if __name__ == "__main__":
	    iface.launch()