import os
import sys
import subprocess
import gradio as gr
import torch
import soundfile as sf
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import librosa
from TTS.api import TTS
from TTS.utils.manage import ModelManager

def install_sentencepiece():
    """Make sure the ``sentencepiece`` package is available, best effort.

    Nothing is done when ``sentencepiece`` can already be imported.
    Otherwise it is installed with pip for the running interpreter. If pip
    fails on a POSIX system, the protobuf/SentencePiece development packages
    are requested through ``sudo apt-get``; a failure of that fallback is
    reported on stderr but does not raise.

    Raises:
        OSError: If pip fails and the operating system is not POSIX.
    """
    import importlib.util

    try:
        already_installed = importlib.util.find_spec("sentencepiece") is not None
    except ValueError:
        # find_spec raises ValueError for an already-loaded module whose
        # __spec__ is None; such a module is importable.
        already_installed = True
    if already_installed:
        # Avoid spawning pip on every start-up once the package is present.
        return

    try:
        # Attempting to install sentencepiece via pip
        subprocess.check_call([sys.executable, "-m", "pip", "install", "sentencepiece"])
    except subprocess.CalledProcessError as pip_error:
        # Attempt to install sentencepiece via system package manager if pip install fails
        if os.name != "posix":
            raise OSError(
                "Automatic installation of SentencePiece is not supported on this OS"
            ) from pip_error
        # NOTE(review): these apt packages look like native libraries/headers
        # only -- confirm they are enough for the Python package to work.
        apt_package_groups = (
            ("libprotobuf10", "protobuf-compiler", "libprotobuf-dev"),
            ("libsentencepiece-dev",),
        )
        for packages in apt_package_groups:
            # Argument list instead of a shell string: no shell is involved.
            command = ["sudo", "apt-get", "install", "-y", *packages]
            try:
                status = subprocess.call(command)
            except OSError as exc:
                # e.g. sudo is not installed; keep the fallback best effort.
                print(f"Could not run {' '.join(command)}: {exc}", file=sys.stderr)
                continue
            if status != 0:
                print(
                    f"Command {' '.join(command)} exited with status {status}",
                    file=sys.stderr,
                )

# Try to make SentencePiece available before the models below are loaded.
# NOTE(review): the imports at the top of this file have already run by this
# point, so this call cannot repair an import that failed earlier.
install_sentencepiece()

# Agree to the Coqui TTS terms of service through the environment, and create
# the directory that text_to_speech writes to by default.
os.environ["COQUI_TOS_AGREED"] = "1"
os.makedirs("output_audio", exist_ok=True)

# Initialize the ASR model: the processor and the CTC model are both loaded
# from the same "facebook/wav2vec2-large-960h" checkpoint.
asr_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
asr_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
# Evaluation mode: the model is only used for inference (see speech_to_text).
asr_model.eval()

# Load the TTS model. The model name is fixed (XTTS v2) and gpu=False is
# passed to the TTS constructor.
# NOTE(review): tts_manager is created but not referenced by any other code
# visible in this file -- confirm whether it is still needed.
tts_manager = ModelManager()
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
tts = TTS(model_name, gpu=False)

def resample_audio(input_audio_path, target_sr=16000):
    """Load an audio file as a mono waveform sampled at ``target_sr`` Hz.

    Args:
        input_audio_path: Path of the audio file, read with ``soundfile.read``.
        target_sr: Sampling rate in Hz of the returned waveform.

    Returns:
        A one-dimensional array of samples at ``target_sr``. Mono input that
        is already at ``target_sr`` is returned exactly as read.
    """
    waveform, sr = sf.read(input_audio_path)
    if waveform.ndim > 1:
        # soundfile returns multi-channel audio as (frames, channels), while
        # librosa.resample works along the last axis by default. Average the
        # channels first so resampling runs over time and the caller gets a
        # single utterance rather than a 2-D array.
        waveform = waveform.mean(axis=1)
    if sr != target_sr:
        waveform = librosa.resample(waveform, orig_sr=sr, target_sr=target_sr)
    return waveform

def speech_to_text(input_audio_path):
    """Transcribe an audio file with the module-level Wav2Vec2 CTC model.

    Args:
        input_audio_path: Path of the audio file to transcribe.

    Returns:
        The decoded transcription with surrounding whitespace removed.
    """
    sampling_rate = 16000
    waveform = resample_audio(input_audio_path, target_sr=sampling_rate)
    # Passing the rate explicitly lets the feature extractor check it against
    # the rate it was configured for, instead of warning that it is unknown.
    input_values = asr_processor(
        waveform, sampling_rate=sampling_rate, return_tensors="pt"
    ).input_values
    with torch.no_grad():
        logits = asr_model(input_values).logits
    # Greedy decoding: take the highest-scoring token id for every frame and
    # let the processor turn the ids into text.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = asr_processor.batch_decode(predicted_ids)[0]
    return transcription.strip()

def text_to_speech(text, speaker_wav_path, output_path="output_audio/output.wav", language="en"):
    """Synthesize ``text`` to an audio file in the voice of a reference clip.

    Args:
        text: Text to speak.
        speaker_wav_path: Path of the recording passed to the TTS model as
            the speaker sample.
        output_path: File the synthesized audio is written to.
        language: Language code passed to the TTS model. The model loaded by
            this file is a multilingual one, and the Coqui API rejects
            multilingual synthesis without a language. Defaults to English.

    Returns:
        ``output_path`` after synthesis, or the message
        ``"Empty text input."`` when ``text`` is ``None``, empty, or
        whitespace-only (nothing is synthesized in that case).
    """
    if text is None or not text.strip():
        return "Empty text input."
    tts.tts_to_file(
        text=text,
        file_path=output_path,
        speaker_wav=speaker_wav_path,
        language=language,
    )
    return output_path

def speech_to_speech(input_audio, text_input=None):
    """Speak some text in the voice of ``input_audio``.

    Args:
        input_audio: Path of the reference recording. It is used as the
            speaker sample and, when no text is given, as the audio to
            transcribe.
        text_input: Text to speak. ``None``, an empty string, or
            whitespace-only text means "transcribe ``input_audio`` and speak
            that instead".

    Returns:
        Whatever ``text_to_speech`` returns for the chosen text.
    """
    speaker_wav_path = input_audio
    # A text box left blank in the UI arrives as "" rather than None, so
    # blank text must trigger the transcription fallback as well.
    if text_input is None or not text_input.strip():
        text_input = speech_to_text(input_audio)
    return text_to_speech(text_input, speaker_wav_path)

# Web UI: an audio clip and a text box in, synthesized audio out.
# The text box is created without the old ``optional`` keyword: Gradio 3.x
# only accepted it as a deprecated no-op and later releases reject unknown
# keywords, while inputs may be left blank without it.
iface = gr.Interface(
    fn=speech_to_speech,
    inputs=[gr.Audio(type="filepath"), gr.Textbox()],
    outputs=gr.Audio(),
)
iface.launch()