"""Gradio demo: re-speak text (typed or transcribed) in the voice of an input clip.

The pipeline is:

1. Optionally transcribe the uploaded audio with Wav2Vec2 (English ASR).
2. Synthesize the text with Coqui XTTS v2, using the uploaded audio as the
   speaker reference.
"""
import importlib.util
import logging
import os
import subprocess
import sys

import gradio as gr
import librosa
import soundfile as sf
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from TTS.api import TTS
from TTS.utils.manage import ModelManager

logger = logging.getLogger(__name__)

# Sample rate the ASR input is resampled to and declared as to the processor.
ASR_SAMPLE_RATE = 16000

# apt package groups tried (one apt-get call per group) when pip cannot
# install SentencePiece.
_APT_PACKAGE_GROUPS = (
    ("libprotobuf10", "protobuf-compiler", "libprotobuf-dev"),
    ("libsentencepiece-dev",),
)


def install_sentencepiece():
    """Make a best-effort attempt to install SentencePiece.

    Does nothing when the ``sentencepiece`` module is already importable.
    Otherwise tries ``pip install sentencepiece``; if pip fails on a POSIX
    system, falls back to installing system packages through ``apt-get``.
    Failures of the apt fallback are logged, not raised.

    Raises:
        OSError: pip failed and the OS is not POSIX, so no fallback exists.
    """
    # Avoid spawning pip on every start-up when the package is already there.
    if importlib.util.find_spec("sentencepiece") is not None:
        return

    try:
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "sentencepiece"]
        )
    except subprocess.CalledProcessError as err:
        if os.name != "posix":
            raise OSError(
                "Automatic installation of SentencePiece is not supported on this OS"
            ) from err

        for packages in _APT_PACKAGE_GROUPS:
            command = ["sudo", "apt-get", "install", "-y", *packages]
            try:
                completed = subprocess.run(command, check=False)
            except OSError as run_err:
                # e.g. ``sudo`` is not available; keep this best-effort.
                logger.warning("Could not run %s: %s", " ".join(command), run_err)
                continue
            if completed.returncode != 0:
                logger.warning(
                    "%s exited with status %d",
                    " ".join(command),
                    completed.returncode,
                )


# Attempt to install SentencePiece before the models are loaded.
install_sentencepiece()

# Agree to the Coqui TTS terms of service and prepare the output directory.
os.environ["COQUI_TOS_AGREED"] = "1"
os.makedirs("output_audio", exist_ok=True)

# Initialize the ASR model (inference only).
asr_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
asr_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
asr_model.eval()

# Initialize the TTS model. ``tts_manager`` is not used below; it is kept so
# the module-level name remains available.
tts_manager = ModelManager()
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
tts = TTS(model_name, gpu=False)


def resample_audio(input_audio_path, target_sr=16000):
    """Load an audio file as a mono float32 waveform at ``target_sr``.

    Args:
        input_audio_path: Path of the audio file to read.
        target_sr: Desired sample rate in Hz.

    Returns:
        A 1-D float32 array of samples at ``target_sr``.
    """
    waveform, sr = sf.read(input_audio_path, dtype="float32")
    # soundfile returns (frames, channels) for multi-channel files; downmix
    # first so resampling runs along the time axis and the ASR gets mono.
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)
    if sr != target_sr:
        waveform = librosa.resample(waveform, orig_sr=sr, target_sr=target_sr)
    return waveform


def speech_to_text(input_audio_path):
    """Transcribe an audio file with the Wav2Vec2 CTC model.

    Args:
        input_audio_path: Path of the audio file to transcribe.

    Returns:
        The decoded transcription with surrounding whitespace removed.
    """
    waveform = resample_audio(input_audio_path, target_sr=ASR_SAMPLE_RATE)
    # Declare the sample rate explicitly so the processor can validate it.
    input_values = asr_processor(
        waveform, sampling_rate=ASR_SAMPLE_RATE, return_tensors="pt"
    ).input_values
    with torch.no_grad():
        logits = asr_model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = asr_processor.batch_decode(predicted_ids)[0]
    return transcription.strip()


def text_to_speech(
    text,
    speaker_wav_path,
    output_path="output_audio/output.wav",
    language="en",
):
    """Synthesize ``text`` in the voice of ``speaker_wav_path``.

    Args:
        text: Text to speak.
        speaker_wav_path: Path of the reference recording for voice cloning.
        output_path: Where the synthesized WAV file is written.
        language: Language code forwarded to the TTS model; the multilingual
            XTTS v2 model requires one.

    Returns:
        ``output_path`` on success, or the string ``"Empty text input."`` when
        ``text`` is ``None`` or blank (no file is written in that case).
    """
    if text is None or not text.strip():
        return "Empty text input."
    tts.tts_to_file(
        text=text,
        file_path=output_path,
        speaker_wav=speaker_wav_path,
        language=language,
    )
    return output_path


def speech_to_speech(input_audio, text_input=None):
    """Gradio handler: speak the given or transcribed text in the input voice.

    Args:
        input_audio: File path of the uploaded/recorded audio; used as the
            speaker reference and, when no text is supplied, as ASR input.
        text_input: Text to speak. When ``None`` or blank, the text is
            obtained by transcribing ``input_audio``.

    Returns:
        Path of the synthesized audio file.

    Raises:
        gr.Error: No audio was supplied, or there is no text to synthesize.
    """
    if not input_audio:
        raise gr.Error("Please provide an input audio recording.")

    # An untouched Gradio textbox delivers "" rather than None, so treat any
    # blank value as "no text given" and fall back to transcription.
    if text_input is None or not text_input.strip():
        text_input = speech_to_text(input_audio)

    # Never hand text_to_speech's "Empty text input." sentinel to gr.Audio,
    # which would try to open it as a file path.
    if not text_input.strip():
        raise gr.Error(
            "No speech could be transcribed from the audio; "
            "please type the text to speak."
        )

    return text_to_speech(text_input, input_audio)


iface = gr.Interface(
    fn=speech_to_speech,
    # Textboxes are optional by default; the handler treats blank text as
    # "transcribe the audio instead".
    inputs=[gr.Audio(type="filepath"), gr.Textbox()],
    outputs=gr.Audio(),
)
iface.launch()