# app.py — Hugging Face Space file by LPhilp1943
# Commit 8dc6b2e ("Update app.py"), verified, 2.44 kB
import gradio as gr
import os
import torch
import soundfile as sf
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, VitsModel, AutoTokenizer
# Synthesized audio is written under this directory; create it up front so
# later writes do not fail on a missing folder.
os.makedirs("output_audio", exist_ok=True)

# Hub checkpoints, named once so each processor/model pair stays in sync.
_ASR_CHECKPOINT = "facebook/wav2vec2-large-960h"
_TTS_CHECKPOINT = "facebook/mms-tts-eng"

# Speech recognition: Wav2Vec 2.0 processor + CTC model.
asr_processor = Wav2Vec2Processor.from_pretrained(_ASR_CHECKPOINT)
asr_model = Wav2Vec2ForCTC.from_pretrained(_ASR_CHECKPOINT)

# Speech synthesis: VITS model + its tokenizer.
tts_model = VitsModel.from_pretrained(_TTS_CHECKPOINT)
tts_tokenizer = AutoTokenizer.from_pretrained(_TTS_CHECKPOINT)
def speech_to_text(input_audio):
    """Transcribe an audio clip with the Wav2Vec 2.0 CTC model.

    Args:
        input_audio: Either something ``soundfile.read`` can open (typically
            a file path), or a ``(sample_rate, samples)`` pair such as
            Gradio's ``Audio`` component produces in "numpy" mode. ``None``
            (nothing recorded or uploaded) is accepted.

    Returns:
        The decoded transcription string, or ``""`` when there is no audio.
    """
    if input_audio is None:
        return ""

    # Accept both the in-memory (rate, samples) pair and a file on disk.
    if isinstance(input_audio, (tuple, list)):
        sr, waveform = input_audio
    else:
        waveform, sr = sf.read(input_audio, dtype="float32")

    samples = torch.as_tensor(waveform)
    if samples.is_floating_point():
        samples = samples.to(torch.float32)
    else:
        # Integer PCM: rescale to roughly [-1, 1] before feeding the model.
        full_scale = float(torch.iinfo(samples.dtype).max)
        samples = samples.to(torch.float32) / full_scale

    # Multi-channel audio arrives as (frames, channels); the model wants mono.
    if samples.dim() > 1:
        samples = samples.mean(dim=-1)

    if samples.numel() == 0:
        return ""

    # The feature extractor rejects any sampling rate other than its own,
    # so resample here instead of forwarding the file's native rate.
    # NOTE: plain linear interpolation, no anti-alias filter; swap in a
    # proper resampler if transcription quality on 44.1/48 kHz input matters.
    target_sr = asr_processor.feature_extractor.sampling_rate
    if sr != target_sr:
        new_length = max(1, round(samples.numel() * target_sr / sr))
        samples = torch.nn.functional.interpolate(
            samples.reshape(1, 1, -1),
            size=new_length,
            mode="linear",
            align_corners=False,
        ).reshape(-1)

    input_values = asr_processor(
        samples.numpy(), sampling_rate=target_sr, return_tensors="pt"
    ).input_values

    # Greedy CTC decoding: most likely token per frame, collapsed by the
    # processor's decoder.
    with torch.no_grad():
        logits = asr_model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)

    return asr_processor.batch_decode(predicted_ids)[0]
def text_to_speech(text):
    """Synthesize speech for ``text`` with the VITS model and save it as WAV.

    Args:
        text: The text to speak.

    Returns:
        The path of the written WAV file
        (``"output_audio/text_to_speech.wav"``), or ``None`` when ``text``
        is empty or only whitespace, in which case nothing is synthesized.
    """
    # Nothing to say: skip the model rather than feed it an empty sequence.
    if not text or not text.strip():
        return None

    inputs = tts_tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        output = tts_model(**inputs).waveform
    waveform = output.numpy()

    output_path = "output_audio/text_to_speech.wav"
    # Write at the rate the model actually generates; a hard-coded rate that
    # differs from it makes playback too fast or too slow.
    sample_rate = tts_model.config.sampling_rate
    sf.write(output_path, waveform.squeeze(), sample_rate)
    return output_path
def speech_to_speech(input_audio, target_text):
    """Produce synthesized speech for ``target_text``.

    ``input_audio`` is accepted but not used: the input recording is not
    transcribed, and the output is generated from ``target_text`` alone.

    Returns:
        Whatever ``text_to_speech`` returns for ``target_text``.
    """
    synthesized = text_to_speech(target_text)
    return synthesized
# gr.Interface wraps exactly one callable, so each task gets its own
# interface and the three are combined as tabs. The former `layout`
# argument is dropped: it is not supported by current Gradio releases.
_APP_DESCRIPTION = "This app uses Facebook's Wav2Vec 2.0 for speech-to-text and VITS for text-to-speech."

_speech_to_text_tab = gr.Interface(
    fn=speech_to_text,
    # type="filepath" hands the function a path on disk rather than an
    # in-memory (rate, samples) pair.
    inputs=gr.Audio(type="filepath", label="Speech to Text"),
    outputs=gr.Textbox(label="Transcription"),
    description=_APP_DESCRIPTION,
)

_text_to_speech_tab = gr.Interface(
    fn=text_to_speech,
    inputs=gr.Textbox(label="Text to Speech"),
    outputs=gr.Audio(label="Synthesized Speech"),
    description=_APP_DESCRIPTION,
)

_speech_to_speech_tab = gr.Interface(
    fn=speech_to_speech,
    inputs=[
        gr.Audio(type="filepath", label="Speech to Speech Input"),
        gr.Textbox(label="Target Text for Speech to Speech"),
    ],
    outputs=gr.Audio(label="Speech to Speech Output"),
    description=_APP_DESCRIPTION,
)

iface = gr.TabbedInterface(
    [_speech_to_text_tab, _text_to_speech_tab, _speech_to_speech_tab],
    tab_names=["Speech to Text", "Text to Speech", "Speech to Speech"],
    title="Speech Processing Application",
)

if __name__ == "__main__":
    iface.launch()