Spaces:
Build error
Build error
File size: 1,990 Bytes
41ad754 aa3f3ad ff017ef 41ad754 7fa5660 7ca94c8 aa3f3ad 0431ea7 aa3f3ad 41ad754 7fa5660 aad7d40 7fa5660 aa3f3ad 7fa5660 aa3f3ad 68ecc2a 7fa5660 aa3f3ad 7fa5660 5e6eee9 41ad754 aa3f3ad 592ca27 aa3f3ad 7fa5660 68ecc2a 6275fb1 aa3f3ad 7fa5660 aa3f3ad 4bc4442 aa3f3ad |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
import os
import gradio as gr
import torch
import soundfile as sf
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import librosa
from TTS.api import TTS
from TTS.utils.manage import ModelManager
# Accept the Coqui TTS terms of service through the environment, and make
# sure the directory used for synthesized audio exists.
os.environ["COQUI_TOS_AGREED"] = "1"
os.makedirs("output_audio", exist_ok=True)

# Speech recognition: the Wav2Vec2 processor and CTC model are loaded from the
# same checkpoint, and the model is switched to evaluation mode.
_ASR_CHECKPOINT = "facebook/wav2vec2-large-960h"
asr_processor = Wav2Vec2Processor.from_pretrained(_ASR_CHECKPOINT)
asr_model = Wav2Vec2ForCTC.from_pretrained(_ASR_CHECKPOINT)
asr_model.eval()

# Speech synthesis: a fixed Coqui TTS model name, loaded with gpu=False.
tts_manager = ModelManager()
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
tts = TTS(model_name, gpu=False)
def resample_audio(input_audio_path, target_sr=16000):
    """Load an audio file as a mono waveform sampled at ``target_sr`` Hz.

    Args:
        input_audio_path: Path of the audio file, read with ``soundfile.read``.
        target_sr: Sample rate, in Hz, of the returned waveform.

    Returns:
        A one-dimensional array of samples. The data is resampled with
        ``librosa.resample`` only when the file's own rate differs from
        ``target_sr``.
    """
    waveform, sr = sf.read(input_audio_path)
    # soundfile returns multi-channel audio as (frames, channels), whereas
    # librosa.resample works along the last axis by default. Average the
    # channels first so resampling runs over time and the result is mono.
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)
    if sr != target_sr:
        waveform = librosa.resample(waveform, orig_sr=sr, target_sr=target_sr)
    return waveform
def speech_to_text(input_audio_path):
    """Transcribe an audio file with the module-level Wav2Vec2 CTC model.

    Args:
        input_audio_path: Path of the audio file to transcribe.

    Returns:
        The decoded transcription with surrounding whitespace removed.
    """
    # 16 kHz is the rate resample_audio targets by default; it is spelled out
    # here so the resampler and the processor are guaranteed to agree.
    sampling_rate = 16000
    waveform = resample_audio(input_audio_path, target_sr=sampling_rate)
    # Passing sampling_rate lets the feature extractor check the audio against
    # the rate it was configured for instead of assuming it silently.
    features = asr_processor(
        waveform,
        sampling_rate=sampling_rate,
        return_tensors="pt",
    )
    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = asr_model(features.input_values).logits
    # Greedy CTC decoding: take the most likely token at every frame.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = asr_processor.batch_decode(predicted_ids)[0]
    return transcription.strip()
def text_to_speech(
    text,
    speaker_wav_path,
    output_path="output_audio/output.wav",
    language="en",
):
    """Synthesize ``text`` to a WAV file using a reference speaker recording.

    Args:
        text: Text to speak.
        speaker_wav_path: Path of the reference recording passed to the TTS
            model as ``speaker_wav``.
        output_path: Where the synthesized audio is written.
        language: Language code passed to the TTS model. The configured model
            name is a multilingual one, and Coqui TTS rejects synthesis
            requests for multilingual models that do not name a language.

    Returns:
        ``output_path`` after synthesis, or the literal message
        ``"Empty text input."`` when ``text`` is ``None``, empty, or only
        whitespace. Callers must not treat that message as a file path.
    """
    # `not text` also covers None, which would otherwise fail on .strip().
    if not text or not text.strip():
        return "Empty text input."
    tts.tts_to_file(
        text=text,
        file_path=output_path,
        speaker_wav=speaker_wav_path,
        language=language,
    )
    return output_path
def speech_to_speech(input_audio, text_input=None):
    """Re-speak text in the voice of ``input_audio``.

    Args:
        input_audio: Path of the uploaded or recorded audio. It is used as the
            reference voice and, when no text is given, as the speech to
            transcribe.
        text_input: Text to speak. When it is ``None``, empty, or only
            whitespace, the text is obtained by transcribing ``input_audio``.

    Returns:
        The value returned by ``text_to_speech`` for the chosen text.

    Raises:
        gr.Error: If no audio was provided, or if there is no text to speak
            because transcription produced nothing.
    """
    if not input_audio:
        raise gr.Error("Please provide an input audio clip.")
    # An empty textbox may arrive as "" rather than None, so treat any blank
    # value as "no text supplied" and fall back to transcription.
    if not text_input or not text_input.strip():
        text_input = speech_to_text(input_audio)
    # Stop here with a readable message rather than handing the Audio output
    # a status string in place of a file path.
    if not text_input or not text_input.strip():
        raise gr.Error("No speech could be recognised in the input audio.")
    return text_to_speech(text_input, input_audio)
# Web UI: an audio clip plus a text box in, synthesized audio out.
# The former `optional=True` keyword on the Textbox is dropped: current Gradio
# components do not accept it. speech_to_speech already declares a default
# for its text argument.
iface = gr.Interface(
    fn=speech_to_speech,
    inputs=[
        gr.Audio(type="filepath"),
        gr.Textbox(),
    ],
    outputs=gr.Audio(),
)
iface.launch()
|