import os
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import spaces
import gc
from functools import partial
import gradio as gr
import torch
from speechbrain.inference.interfaces import Pretrained, foreign_class
from transformers import T5Tokenizer, T5ForConditionalGeneration
import librosa
import whisper_timestamped as whisper
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, Wav2Vec2ForCTC, AutoProcessor
    

# Run on CUDA when torch reports it as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Allow TF32 for CUDA matrix multiplications.
torch.backends.cuda.matmul.allow_tf32 = True


def clean_up_memory():
    """Run Python's garbage collector, then call ``torch.cuda.empty_cache()``."""
    # Collect first so that tensors which just became unreachable are
    # released before the CUDA cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()

@spaces.GPU(duration=15)
def recap_sentence(string):
    """Restore capitalization and punctuation of ``string``.

    The text is prefixed with the task prompt, tokenized with
    ``recap_tokenizer``, passed through ``recap_model.generate`` (beam
    search with 5 beams, at most 768 tokens) and decoded back to a string
    with special tokens stripped.
    """
    prompt = "restore capitalization and punctuation: " + string
    encoded = recap_tokenizer([prompt], return_tensors="pt", padding=True).to(device)
    generated = recap_model.generate(
        **encoded,
        max_length=768,
        num_beams=5,
        early_stopping=True,
    )
    # A single prompt was submitted, so drop the leading batch dimension.
    token_ids = generated.squeeze(0)
    return recap_tokenizer.decode(token_ids, skip_special_tokens=True)


_MISSING_INPUT_MESSAGE = "You must either provide a mic recording or a file"
_SENTENCE_ENDINGS = (".", "!", "?")


def _capitalize_after_punctuation(text):
    """Upper-case every lowercase character located two positions after '.', '!' or '?'."""
    chars = list(text)
    for i in range(2, len(chars)):
        if chars[i - 2] in _SENTENCE_ENDINGS and chars[i].islower():
            chars[i] = chars[i].upper()
    return "".join(chars)


def _transcribe_and_restore(audio_path, progress, device):
    """Transcribe ``audio_path`` and restore capitalization/punctuation.

    Shared implementation behind the microphone and file handlers.

    Args:
        audio_path: Path of the audio file to transcribe, or ``None``.
        progress: Callable used to report progress (a ``gr.Progress``).
        device: Device handed to ``w2v2_classifier.classify_file_w2v2``.

    Returns:
        A ``(transcript, download_path)`` tuple. When ``audio_path`` is
        ``None`` the tuple is ``(error message, None)`` so that the number of
        returned values always matches the two output components the
        handlers are wired to.
    """
    progress(0, desc="Транскриптот се генерира")
    if audio_path is None:
        return _MISSING_INPUT_MESSAGE, None

    # splitext only strips the final extension; splitting on the first "."
    # truncates any path that has a dot in a directory or in the file stem.
    download_path = os.path.splitext(audio_path)[0] + ".txt"
    waveform, _ = librosa.load(audio_path, sr=16000)
    w2v2_result = w2v2_classifier.classify_file_w2v2(waveform, device)

    restored_pieces = []
    prev_segment = ""

    progress(0.75, desc=" Пост-процесирање на транскриптот")
    for segment in w2v2_result:
        progress(0.75, desc=" Пост-процесирање на транскриптот")

        if not segment:
            # Nothing to restore; indexing an empty segment below would raise.
            continue

        if prev_segment == "":
            context_len = 0
            restored = recap_sentence(segment)
        else:
            context_len = len(prev_segment.split())
            restored = recap_sentence(prev_segment + " " + segment)

        # Drop the words that belong to the context prefix.
        restored_pieces.append(" ".join(restored.split()[context_len:]))

        # NOTE(review): `segment[0]` keeps only the first element of the
        # segment (the first character if segments are plain strings) as the
        # context for the next call. Confirm against `classify_file_w2v2`
        # whether the whole previous segment was intended.
        prev_segment = segment[0]

        clean_up_memory()

    recap_result = "".join(piece + " " for piece in restored_pieces)
    # One pass over the final text gives the same result as re-scanning the
    # accumulated text after every segment.
    recap_result = _capitalize_after_punctuation(recap_result)

    progress(1.0, desc=" Крај на транскрипцијата")
    # The transcript is Cyrillic; do not depend on the platform default encoding.
    with open(download_path, "w", encoding="utf-8") as f:
        f.write(recap_result)

    return recap_result, download_path


@spaces.GPU(duration=30)
def return_prediction_w2v2_mic(mic=None, progress=gr.Progress(), device=device):
    """Transcribe a microphone recording.

    Args:
        mic: Path of the recorded audio file, or ``None``.
        progress: Progress reporter.
        device: Device used for the ASR model call.

    Returns:
        ``(transcript, path of the written .txt file)``, or
        ``(error message, None)`` when ``mic`` is ``None``.
    """
    return _transcribe_and_restore(mic, progress, device)


@spaces.GPU(duration=60)
def return_prediction_w2v2_file(file=None, progress=gr.Progress(), device=device):
    """Transcribe an uploaded audio file.

    Args:
        file: Path of the uploaded audio file, or ``None``.
        progress: Progress reporter.
        device: Device used for the ASR model call.

    Returns:
        ``(transcript, path of the written .txt file)``, or
        ``(error message, None)`` when ``file`` is ``None``.
    """
    return _transcribe_and_restore(file, progress, device)


# Pre-bind the module-level `device` on both transcription handlers.
return_prediction_w2v2_mic_with_device = partial(
    return_prediction_w2v2_mic,
    device=device,
)
return_prediction_w2v2_file_with_device = partial(
    return_prediction_w2v2_file,
    device=device,
)


# Load the wav2vec2 ASR model through its custom interface class, move it to
# the selected device and switch it to evaluation mode.
w2v2_classifier = foreign_class(
    source="Macedonian-ASR/buki-wav2vec2-2.0",
    pymodule_file="custom_interface_app.py",
    classname="ASR",
).to(device)
w2v2_classifier.eval()


# Load the T5 tokenizer and model for restoring capitalization
recap_model_name = "Macedonian-ASR/mt5-restore-capitalization-macedonian"
recap_tokenizer = T5Tokenizer.from_pretrained(recap_model_name)
# Half precision is only requested on CUDA. When `device` falls back to the
# CPU the weights are loaded as float32, since float16 CPU kernels may be
# missing or slow depending on the installed torch build.
recap_model = T5ForConditionalGeneration.from_pretrained(
    recap_model_name,
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
)
recap_model.to(device)
recap_model.eval()


# Microphone tab: record audio, transcribe it, show and offer the transcript.
with gr.Blocks() as mic_transcribe_wav2vec2:
    with gr.Row():
        audio_input = gr.Audio(
            sources="microphone",
            type="filepath",
            label="Record Audio",
        )
    with gr.Row():
        transcribe_button = gr.Button("Транскрибирај")
        clear_button = gr.Button("Исчисти ги резултатите")
    with gr.Row():
        output_text = gr.Textbox(label="Транскрипција")
    with gr.Row():
        download_file = gr.File(
            label="Зачувај го транскриптот",
            file_count="single",
        )

    def clear_outputs():
        """Return empty values for the audio input, the text box and the file."""
        return None, "", None

    transcribe_button.click(
        fn=return_prediction_w2v2_mic_with_device,
        inputs=[audio_input],
        outputs=[output_text, download_file],
    )
    clear_button.click(
        fn=clear_outputs,
        inputs=[],
        outputs=[audio_input, output_text, download_file],
    )


# File tab: upload audio, transcribe it, show and offer the transcript.
with gr.Blocks() as file_transcribe_wav2vec2:
    def clear_outputs():
        """Return empty values for the audio input, the text box and the file.

        Values are returned positionally, in the order of the `outputs` list
        of the clear button, so the function does not depend on the
        module-level component names at call time.
        """
        return None, "", None

    with gr.Row():
        # This widget accepts uploads, so it is labelled accordingly.
        audio_input = gr.Audio(sources="upload", type="filepath", label="Upload Audio")
    with gr.Row():
        transcribe_button = gr.Button("Транскрибирај")
        clear_button = gr.Button("Исчисти ги резултатите")
    with gr.Row():
        output_text = gr.Textbox(label="Транскрипција")
    with gr.Row():
        download_file = gr.File(label="Зачувај го транскриптот", file_count="single")

    transcribe_button.click(
        fn=return_prediction_w2v2_file_with_device,
        inputs=[audio_input],
        outputs=[output_text, download_file],
    )
    clear_button.click(
        fn=clear_outputs,
        inputs=[],
        outputs=[audio_input, output_text, download_file],
    )
    

# Markdown/HTML header rendered at the top of the app: logo plus author list.
project_description = """
<img src="https://i.imghippo.com/files/JXadQ1728417387.png"
     alt="Bookie logo"
     style="float: right; width: 130px; height: 110px; margin-left: 10px;" />
     
## Автори:
1. **Дејан Порјазовски**
2. **Илина Јакимовска**
3. **Ордан Чукалиев**
4. **Никола Стиков**

Оваа колаборација е дел од активностите на **Центарот за напредни интердисциплинарни истражувања ([ЦеНИИс](https://ukim.edu.mk/en/centri/centar-za-napredni-interdisciplinarni-istrazhuvanja-ceniis))** при УКИМ.
"""

# Custom CSS for the app. The page background is declared once; a second,
# conflicting `.gradio-container` rule would be dead code because the
# `!important` declaration always wins.
css = """
.gradio-container {
    background-color: #f3f3f3 !important;
}
.custom-markdown p, .custom-markdown li, .custom-markdown h2, .custom-markdown a, .custom-markdown strong {
    font-size: 15px !important;
    font-family: Arial, sans-serif !important;
    color: black !important;
}
button {
    color: orange !important;
}
"""

# Top-level app: project description header followed by one tab per input
# source (microphone / uploaded file).
transcriber_app = gr.Blocks(css=css, delete_cache=(60, 120))

with transcriber_app:
    gr.Markdown(project_description, elem_classes="custom-markdown")

    gr.TabbedInterface(
        [mic_transcribe_wav2vec2, file_transcribe_wav2vec2],
        ["Буки-w2v2 транскрипција од микрофон", "Буки-w2v2 транскрипција од фајл"],
    )
    state = gr.State(value=[], delete_callback=lambda v: print("STATE DELETED"))

    # `unload` callbacks run without inputs when a user session ends. The
    # GPU-decorated transcription handlers do not belong here; free memory
    # instead.
    transcriber_app.unload(clean_up_memory)


# transcriber_app.launch(debug=True, share=True, ssl_verify=False)
if __name__ == "__main__":
    # Enable the request queue, then start the server with default settings.
    transcriber_app.queue().launch()