LPhilp1943's picture
Update app.py
0431ea7 verified
history blame
No virus
2.41 kB
import gradio as gr
import os
import torch
import soundfile as sf
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import librosa
import string
# Set environment variable for Coqui TTS
os.environ["COQUI_TTS_MODEL"] = "1"
from TTS.api import TTS # Importing Coqui TTS
# Directory that receives the synthesized audio files.
os.makedirs("output_audio", exist_ok=True)

# Speech recognition: processor and CTC model are loaded from the same checkpoint.
_ASR_CHECKPOINT = "facebook/wav2vec2-large-960h"
asr_processor = Wav2Vec2Processor.from_pretrained(_ASR_CHECKPOINT)
asr_model = Wav2Vec2ForCTC.from_pretrained(_ASR_CHECKPOINT)

# Speech synthesis: Coqui TTS model, loaded once at import time and kept on CPU.
# Change the model name or the gpu flag here to use a different voice or device.
_TTS_MODEL_NAME = "tts_models/en/ljspeech/tacotron2-DCA"
tts = TTS(_TTS_MODEL_NAME, gpu=False)
def resample_audio(input_audio_path, target_sr):
    """Read an audio file and return its samples at ``target_sr``.

    Args:
        input_audio_path: Path of the audio file, read with ``soundfile.read``.
        target_sr: Desired sampling rate in Hz.

    Returns:
        The samples in the layout ``soundfile.read`` produces (1-D for
        single-channel files, ``(frames, channels)`` otherwise), resampled
        along the time axis when the file's rate differs from ``target_sr``.
    """
    waveform, sr = sf.read(input_audio_path)
    if sr == target_sr:
        return waveform
    # soundfile puts time on the first axis while librosa resamples along the
    # last one. Transposing around the call keeps multi-channel audio from
    # being resampled across its channel axis; for 1-D data ``.T`` is a no-op.
    return librosa.resample(waveform.T, orig_sr=sr, target_sr=target_sr).T
def speech_to_text(input_audio_path):
    """Transcribe an audio file with the module-level wav2vec2 ASR model.

    Args:
        input_audio_path: Path of the audio file to transcribe.

    Returns:
        The decoded transcription with surrounding whitespace removed.
    """
    waveform = resample_audio(input_audio_path, 16000)
    # The ASR front end takes a single 1-D waveform; multi-channel files come
    # back as (frames, channels), so average the channels down to mono first.
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)
    input_values = asr_processor(
        waveform, sampling_rate=16000, return_tensors="pt"
    ).input_values
    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = asr_model(input_values).logits
    # Greedy CTC decoding: take the most likely token at every frame.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = asr_processor.batch_decode(predicted_ids)[0]
    return transcription.strip()
def text_to_speech(text, speaker_wav_path, output_path="output_audio/output.wav", language="en"):
    """Synthesize ``text`` to a WAV file with the module-level Coqui TTS model.

    Args:
        text: Text to speak.
        speaker_wav_path: Reference recording for the target voice; only
            forwarded when the loaded model reports multi-speaker support.
        output_path: Where the synthesized audio is written.
        language: Language code; only forwarded when the loaded model reports
            multilingual support.

    Returns:
        ``output_path`` after the audio has been written, or an explanatory
        message string when ``text`` is missing or blank.
    """
    if not text or not text.strip():
        return "The text input is empty, please provide a valid string."

    # A caller-supplied output_path may point into a directory that does not exist yet.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Generate speech using Coqui TTS. Speaker and language arguments are only
    # passed to models that support them, since the API rejects them otherwise.
    synthesis_kwargs = {"text": text, "file_path": output_path}
    if speaker_wav_path and getattr(tts, "is_multi_speaker", False):
        synthesis_kwargs["speaker_wav"] = speaker_wav_path
    if language and getattr(tts, "is_multi_lingual", False):
        synthesis_kwargs["language"] = language
    tts.tts_to_file(**synthesis_kwargs)
    return output_path
def speech_to_speech(input_audio, text_input=None):
    """Re-synthesize speech from an input recording, optionally with custom text.

    Args:
        input_audio: Path of the input recording. It is transcribed when no
            text is given and is also passed on as the speaker reference.
        text_input: Text to speak instead of the transcription. ``None``,
            an empty string, or whitespace-only text all count as "not given".

    Returns:
        Whatever ``text_to_speech`` returns for the chosen text (the path of
        the synthesized audio file on success).
    """
    speaker_wav_path = input_audio  # Use input audio as speaker reference
    # An empty UI textbox submits "" rather than None, so treat any blank
    # value as missing and fall back to transcribing the recording.
    if not text_input or not text_input.strip():
        text_input = speech_to_text(input_audio)
    return text_to_speech(text_input, speaker_wav_path)
iface = gr.Interface(
inputs=[gr.Audio(type="filepath", label="Input Audio"), gr.Textbox(label="Text Input", optional=True)],
outputs=gr.Audio(label="Synthesized Speech"),
title="Speech-to-Speech Application",
description="Converts speech to text and then back to speech, using Coqui TTS for voice generation."