# LPhilp1943's picture
# Update app.py
# ff017ef verified
# raw
# history blame
# No virus
# 2.42 kB
import gradio as gr
import os
import torch
import soundfile as sf
import torchaudio
from scipy.io.wavfile import write
from transformers import VitsProcessor, VitsForConditionalGeneration
from speechbrain.pretrained import EncoderClassifier, EncoderDecoderASR
# Synthesized audio is written into this folder; create it up front so that
# saving a WAV file cannot fail on a missing directory.
os.makedirs("output_audio", exist_ok=True)

# Text-to-speech: processor and model are both loaded from the Facebook MMS
# English checkpoint.
# NOTE(review): confirm these class names against the installed transformers
# release -- its documented VITS API exposes VitsTokenizer / VitsModel.
tts_processor = VitsProcessor.from_pretrained(
    "facebook/mms-tts-eng",
)
tts_model = VitsForConditionalGeneration.from_pretrained(
    "facebook/mms-tts-eng",
)

# Speech-to-text: SpeechBrain encoder-decoder ASR model, with ``savedir``
# pointing at models/asr.
asr_model = EncoderDecoderASR.from_hparams(
    source="speechbrain/asr-conformer-transformerlm-librispeech",
    savedir="models/asr",
)
def speech_to_text(input_audio):
    """Transcribe an audio recording with the SpeechBrain ASR model.

    Args:
        input_audio: Path of the audio file (``str`` or ``os.PathLike``), or
            an object that exposes that path as a string ``.name`` attribute
            (presumably what a Gradio ``type="file"`` input delivers --
            verify against the UI definition).

    Returns:
        The result of ``asr_model.transcribe_file`` for the file.

    Raises:
        ValueError: If no audio was supplied (``None`` or an empty path).
        TypeError: If no file path can be derived from ``input_audio``.
    """
    if input_audio is None or input_audio == "":
        raise ValueError("No audio provided: upload an audio file first.")

    if isinstance(input_audio, (str, os.PathLike)):
        audio_path = os.fspath(input_audio)
    else:
        audio_path = getattr(input_audio, "name", None)
        if not isinstance(audio_path, str):
            raise TypeError(
                "input_audio must be a file path or an object with a "
                f"string 'name' attribute, got {type(input_audio).__name__}"
            )

    # The file used to be decoded and resampled to 16 kHz here, but that
    # signal was thrown away and the original path was transcribed anyway.
    # transcribe_file reads the file itself, so the extra decode only cost
    # time and memory; the returned transcription is unchanged.
    return asr_model.transcribe_file(audio_path)
def text_to_speech(text):
    """Synthesize ``text`` to a WAV file with the MMS TTS model.

    Args:
        text: The text to speak.

    Returns:
        The path of the written file, ``output_audio/text_to_speech.wav``.
        The same path is reused on every call, so each call overwrites the
        previous result.

    Raises:
        ValueError: If ``text`` is ``None`` or contains only whitespace.
    """
    if text is None or not str(text).strip():
        raise ValueError("No text provided: enter some text to synthesize.")

    inputs = tts_processor(text, return_tensors="pt")
    # Inference only: skip autograd bookkeeping.
    # NOTE(review): ``generate`` / ``.audio`` are kept as written; confirm
    # them against the transformers VITS documentation (the documented
    # VitsModel returns its audio as ``.waveform`` from a forward call).
    with torch.no_grad():
        generated = tts_model.generate(**inputs)
    waveform = generated.audio.squeeze().cpu().numpy()

    # Write the file at the rate the model actually produces. The rate was
    # hard-coded to 22050 Hz, while the MMS-TTS configuration documents
    # 16 kHz output -- a mismatch plays back at the wrong speed and pitch.
    # 22050 is kept only as a fallback when the config exposes no rate.
    config = getattr(tts_model, "config", None)
    sample_rate = getattr(config, "sampling_rate", None) or 22050

    output_path = "output_audio/text_to_speech.wav"
    sf.write(output_path, waveform, sample_rate)
    return output_path
def speech_to_speech(input_audio, target_text):
    """Produce synthesized speech from a recording and/or a target text.

    When ``target_text`` is non-blank it is synthesized directly, exactly as
    before. When it is blank, the recording is transcribed and the
    transcription is synthesized instead.

    The transcription used to be computed on every call and then discarded,
    so a full ASR pass was spent on a value nobody read. ASR now runs only
    when its result is actually needed.

    Args:
        input_audio: The recording, passed through to ``speech_to_text``.
        target_text: Text to synthesize; may be ``None`` or blank to fall
            back to the transcription of ``input_audio``.

    Returns:
        The value returned by ``text_to_speech`` (path of the WAV file).
    """
    if target_text is not None and str(target_text).strip():
        return text_to_speech(target_text)

    # No explicit text: speak back what was recognized in the recording.
    return text_to_speech(speech_to_text(input_audio))
# Shown at the top of every tab.
_APP_DESCRIPTION = (
    "This app uses SpeechBrain for speech to text and "
    "Facebook's MMS for text to speech."
)

# gr.Interface wraps exactly one callable, so it cannot take a dict of
# functions. Each feature gets its own Interface and the three are shown as
# tabs of one app. Audio inputs use type="filepath" so the handlers receive a
# plain path, and components come from the top-level gradio namespace instead
# of the deprecated gr.inputs / gr.outputs modules.
iface = gr.TabbedInterface(
    interface_list=[
        gr.Interface(
            fn=speech_to_text,
            inputs=gr.Audio(type="filepath"),
            outputs=gr.Textbox(label="Transcription"),
            description=_APP_DESCRIPTION,
        ),
        gr.Interface(
            fn=text_to_speech,
            inputs=gr.Textbox(label="Text"),
            outputs=gr.Audio(type="filepath", label="Synthesized Speech"),
            description=_APP_DESCRIPTION,
        ),
        gr.Interface(
            fn=speech_to_speech,
            inputs=[
                gr.Audio(type="filepath"),
                gr.Textbox(label="Target Text"),
            ],
            outputs=gr.Audio(type="filepath", label="Synthesized Speech"),
            description=_APP_DESCRIPTION,
        ),
    ],
    tab_names=["Speech to Text", "Text to Speech", "Speech to Speech"],
    title="Speech Processing App",
)

if __name__ == "__main__":
    iface.launch()