import os

import gradio as gr
import librosa
import soundfile as sf
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from TTS.api import TTS

# Agree to the Coqui TTS terms of service and prepare the output directory.
os.environ["COQUI_TOS_AGREED"] = "1"
os.makedirs("output_audio", exist_ok=True)

# Initialize the ASR model (wav2vec 2.0 fine-tuned on 960h of LibriSpeech).
asr_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
asr_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
asr_model.eval()

# Initialize the multilingual XTTS v2 voice-cloning model on CPU.
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
tts = TTS(model_name, gpu=False)


def resample_audio(input_audio_path, target_sr=16000):
    """Load an audio file, downmix to mono, and resample to target_sr."""
    waveform, sr = sf.read(input_audio_path, dtype="float32")
    if waveform.ndim > 1:  # stereo -> mono; wav2vec 2.0 expects one channel
        waveform = waveform.mean(axis=1)
    if sr != target_sr:
        waveform = librosa.resample(waveform, orig_sr=sr, target_sr=target_sr)
    return waveform


def speech_to_text(input_audio_path):
    """Transcribe an audio file with wav2vec 2.0 (greedy CTC decoding)."""
    waveform = resample_audio(input_audio_path)
    input_values = asr_processor(
        waveform, sampling_rate=16000, return_tensors="pt"
    ).input_values
    with torch.no_grad():
        logits = asr_model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = asr_processor.batch_decode(predicted_ids)[0]
    return transcription.strip()


def text_to_speech(text, speaker_wav_path, output_path="output_audio/output.wav"):
    """Synthesize `text` in the voice of `speaker_wav_path` using XTTS v2."""
    if not text.strip():
        raise gr.Error("Empty text input.")
    # XTTS v2 is multilingual and requires an explicit language code.
    tts.tts_to_file(
        text=text,
        file_path=output_path,
        speaker_wav=speaker_wav_path,
        language="en",
    )
    return output_path


def speech_to_speech(input_audio, text_input=None):
    """Clone the input speaker's voice; speak the given text, or the ASR transcript."""
    speaker_wav_path = input_audio
    # Gradio passes an empty string (not None) when the textbox is left blank.
    if not text_input or not text_input.strip():
        text_input = speech_to_text(input_audio)
    return text_to_speech(text_input, speaker_wav_path)


iface = gr.Interface(
    fn=speech_to_speech,
    inputs=[
        gr.Audio(type="filepath", label="Input speech"),
        gr.Textbox(label="Text to speak (optional; defaults to the transcript)"),
    ],
    outputs=gr.Audio(label="Cloned speech"),
)
iface.launch()
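
# --- Usage sketch (assumptions flagged below) ---
# The pipeline can also be exercised without the Gradio UI, e.g. from a Python
# shell after commenting out `iface.launch()` above. "sample.wav" is a
# hypothetical local recording, not a file shipped with this script:
#
#   transcript = speech_to_text("sample.wav")               # ASR only
#   print(transcript)
#   cloned = text_to_speech(transcript, "sample.wav")       # re-speak it in the same voice
#   print(cloned)                                           # -> output_audio/output.wav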