Spaces:

Macedonian-ASR
/

Bookie-w2v2-Macedonian-ASR

Running on Zero

File size: 11,031 Bytes

14c8a1d
 
 
deb4ddc
14c8a1d
 
 
 
 
 
 
 
 
7d2b26f
14c8a1d
 
 
 
 
7d2b26f
 
 
 
 
 
14c8a1d
 
 
 
 
6f5add8
14c8a1d
 
 
 
 
 
 
 
6f5add8
7d2b26f
14c8a1d
19502c3
7d2b26f
62777f8
08c3820
 
1258887
 
 
 
14c8a1d
19502c3
 
 
14c8a1d
19502c3
 
c1c46ce
19502c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51b25d9
 
 
 
14c8a1d
 
78caa12
7d2b26f
8218447
 
d47fadb
62777f8
08c3820
 
1258887
 
 
 
14c8a1d
8218447
 
 
14c8a1d
8218447
 
 
 
 
 
 
 
 
 
 
 
14c8a1d
8218447
 
 
 
 
 
 
51b25d9
 
 
 
14c8a1d
 
 
7d2b26f
 
14c8a1d
 
 
6f5add8
14c8a1d
 
 
 
 
 
 
 
 
 
 
 
ad905f8
51cb5d9
3414dcf
43891b2
ad905f8
43891b2
ad905f8
51cb5d9
 
 
3414dcf
43891b2
51cb5d9
43891b2
51cb5d9
14c8a1d
43891b2
758db3d
 
 
 
 
 
 
da61510
 
 
 
 
 
2dc0fa6
758db3d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9ad9e87
 
 
 
 
 
8b6de2d
9ad9e87
 
8b6de2d
9ad9e87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2dc0fa6
8b6de2d
 
 
2dc0fa6
8b6de2d
14488a5
9ad9e87
 
3414dcf
14c8a1d
 
 
 
758db3d
 
e963d01
14c8a1d
8218447
 
14c8a1d
758db3d
 
 
14c8a1d
 
58233d9
 
14c8a1d

import os
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import spaces
import gc
from functools import partial
import gradio as gr
import torch
from speechbrain.inference.interfaces import Pretrained, foreign_class
from transformers import T5Tokenizer, T5ForConditionalGeneration
import librosa
import whisper_timestamped as whisper
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, Wav2Vec2ForCTC, AutoProcessor
from speechbrain.inference.VAD import VAD
    

# Use CUDA when PyTorch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Allow TF32 for CUDA matrix multiplications.
torch.backends.cuda.matmul.allow_tf32 = True

# Load the VAD model
# (the "speechbrain/vad-crdnn-libriparty" source, with "vad_model" as its save
# directory). The instance is passed to the ASR classifier by the
# transcription callbacks below.
vad_model = VAD.from_hparams(
    source="speechbrain/vad-crdnn-libriparty",
    savedir="vad_model",
    )


def clean_up_memory():
    """Run Python's garbage collector, then empty PyTorch's CUDA cache.

    Returns:
        None. The function is called purely for its side effects.
    """
    # Collect unreachable Python objects before asking PyTorch to release
    # the memory blocks it is holding in its CUDA cache.
    gc.collect()
    torch.cuda.empty_cache()

@spaces.GPU(duration=15)
def recap_sentence(string):
    """Restore capitalization and punctuation of ``string`` with the recap model.

    The text is prefixed with the task prompt, tokenized by the module-level
    ``recap_tokenizer``, moved to ``device`` and passed to
    ``recap_model.generate`` (beam search with 5 beams, ``max_length=768``).

    Args:
        string: Text to restore; it is concatenated to the prompt with ``+``.

    Returns:
        The decoded generation output with special tokens skipped.
    """
    prompt = "restore capitalization and punctuation: " + string
    encoded = recap_tokenizer([prompt], return_tensors="pt", padding=True)
    encoded = encoded.to(device)
    generated = recap_model.generate(
        **encoded,
        max_length=768,
        num_beams=5,
        early_stopping=True,
    )
    # A single prompt was submitted, so drop the leading batch dimension
    # before decoding.
    token_ids = generated.squeeze(0)
    return recap_tokenizer.decode(token_ids, skip_special_tokens=True)


def _capitalize_after_punctuation(text):
    """Upper-case each lowercase character located two positions after '.', '!' or '?'.

    The character directly following the punctuation mark (normally a space)
    is left untouched.

    Args:
        text: The transcript accumulated so far.

    Returns:
        A new string with the qualifying characters upper-cased.
    """
    chars = list(text)
    for i in range(2, len(chars)):
        if chars[i - 2] in ".!?" and chars[i].islower():
            chars[i] = chars[i].upper()
    return "".join(chars)


def _stream_transcription(audio_path, vad_model, device):
    """Transcribe ``audio_path`` segment by segment, yielding the growing transcript.

    Shared implementation behind the microphone and file callbacks. After every
    segment the transcript so far is written to a ``.txt`` file next to the
    audio file and ``(transcript, txt_path)`` is yielded.

    Args:
        audio_path: Path of the audio file, or ``None`` when no audio was given.
        vad_model: VAD model forwarded to ``w2v2_classifier.classify_file_w2v2``.
        device: Device forwarded to ``w2v2_classifier.classify_file_w2v2``.

    Yields:
        Tuples ``(transcript, download_path)``. Without audio, a single
        ``("", "empty.txt")`` pair is yielded after writing an empty file.
    """
    if audio_path is None:
        download_path = "empty.txt"
        with open(download_path, "w", encoding="utf-8") as f:
            f.write("")
        yield "", download_path
        return

    # splitext only strips the final extension, so dots elsewhere in the path
    # (directory names, "my.recording.wav") do not truncate the output name.
    download_path = os.path.splitext(audio_path)[0] + ".txt"

    recap_result = ""
    prev_segment = ""

    for segment in w2v2_classifier.classify_file_w2v2(audio_path, vad_model, device):
        if prev_segment == "":
            context_len = 0
            recap_segment = recap_sentence(segment)
        else:
            context_len = len(prev_segment.split())
            recap_segment = recap_sentence(prev_segment + " " + segment)

        # Drop the leading words that correspond to the context prefix.
        recap_segment = " ".join(recap_segment.split()[context_len:])

        # NOTE(review): only the first character of the segment is carried
        # over as context (the original code used ``segment[0]``). This looks
        # like it may have been meant to be the whole segment -- confirm
        # against what ``classify_file_w2v2`` yields before changing it.
        # The slice keeps that behaviour but does not raise on an empty segment.
        prev_segment = segment[:1]

        recap_result = _capitalize_after_punctuation(recap_result + recap_segment + " ")

        clean_up_memory()

        # The transcript is Macedonian (Cyrillic) text, so write it as UTF-8
        # instead of relying on the platform's default encoding.
        with open(download_path, "w", encoding="utf-8") as f:
            f.write(recap_result)

        yield recap_result, download_path


@spaces.GPU(duration=30)
def return_prediction_w2v2_mic(mic=None, vad_model=vad_model, device=device):
    """Stream the transcript of a microphone recording.

    Args:
        mic: Path of the recorded audio file, or ``None`` if nothing was recorded.
        vad_model: VAD model passed on to the ASR classifier.
        device: Device passed on to the ASR classifier.

    Yields:
        ``(transcript, download_path)`` after each transcribed segment.
    """
    yield from _stream_transcription(mic, vad_model, device)


@spaces.GPU(duration=30)
def return_prediction_w2v2_file(file=None, vad_model=vad_model, device=device):
    """Stream the transcript of an uploaded audio file.

    Args:
        file: Path of the uploaded audio file, or ``None`` if nothing was uploaded.
        vad_model: VAD model passed on to the ASR classifier.
        device: Device passed on to the ASR classifier.

    Yields:
        ``(transcript, download_path)`` after each transcribed segment.
    """
    yield from _stream_transcription(file, vad_model, device)


# Gradio callbacks with the shared VAD model and device bound as keyword arguments
_bound_kwargs = {"vad_model": vad_model, "device": device}
return_prediction_w2v2_mic_with_device = partial(return_prediction_w2v2_mic, **_bound_kwargs)
return_prediction_w2v2_file_with_device = partial(return_prediction_w2v2_file, **_bound_kwargs)


# Load the wav2vec2 ASR model through its custom interface class, move it to
# the selected device and switch it to evaluation mode
w2v2_classifier = foreign_class(
    source="Macedonian-ASR/buki-wav2vec2-2.0",
    pymodule_file="custom_interface_app.py",
    classname="ASR",
).to(device)
w2v2_classifier.eval()


# Load the T5 tokenizer and model (half precision) used for restoring
# capitalization and punctuation
recap_model_name = "Macedonian-ASR/mt5-restore-capitalization-macedonian"
recap_tokenizer = T5Tokenizer.from_pretrained(recap_model_name)
recap_model = T5ForConditionalGeneration.from_pretrained(
    recap_model_name,
    torch_dtype=torch.float16,
)
recap_model.to(device).eval()


def _build_transcription_interface(fn, source):
    """Create a live Gradio interface that streams a transcript and its text file.

    Args:
        fn: Callback that receives the audio file path.
        source: Value for the ``sources`` argument of the audio input.

    Returns:
        A ``gr.Interface`` with a transcript textbox and a single-file download.
    """
    return gr.Interface(
        fn=fn,
        inputs=gr.Audio(sources=source, type="filepath"),
        outputs=[
            gr.Textbox(label="Транскрипција"),
            gr.File(label="Зачувај го транскриптот", file_count="single"),
        ],
        allow_flagging="never",
        live=True,
    )


# One interface per audio source; they differ only in callback and source.
mic_transcribe_wav2vec2 = _build_transcription_interface(
    return_prediction_w2v2_mic_with_device, "microphone"
)

file_transcribe_wav2vec2 = _build_transcription_interface(
    return_prediction_w2v2_file_with_device, "upload"
)


# HTML shown above the transcription tabs: logo, author list and a note about
# the collaborating organisations. The author list is explicitly closed with
# </ol> so that the <h4> note is not nested inside the list.
project_description_header = '''
<div class="header">
    <img src="https://i.ibb.co/hYhkkhg/Buki-logo-1.jpg"
         alt="Bookie logo"
         style="float: right; width: 150px; height: 150px; margin-left: 10px;" />
    
    <h2>Автори:</h2>
    <ol>
        <li>Дејан Порјазовски</li>
        <li>Илина Јакимовска</li>
        <li>Ордан Чукалиев</li>
        <li>Никола Стиков</li>
    </ol>
    <h4>Оваа колаборација е дел од активностите на Фондација <a href="https://qantarot.substack.com/about"><strong>КАНТАРОТ</strong></a> и <strong>Центарот за напредни интердисциплинарни истражувања (<a href="https://ukim.edu.mk/en/centri/centar-za-napredni-interdisciplinarni-istrazhuvanja-ceniis">ЦеНИИс</a>)</strong> при УКИМ.</h4>
</div>
'''

# HTML shown below the transcription tabs: an ordered list of the data sources
# and contributors used for training the model. The text is user-facing and
# rendered as-is through gr.HTML.
project_description_footer = '''
<div class="footer">
    <h2>Во тренирањето на овој модел се употребени податоци од:</h2>
    <ol>
        <li>Дигитален архив за етнолошки и антрополошки ресурси (<a href="https://iea.pmf.ukim.edu.mk/tabs/view/61f236ed7d95176b747c20566ddbda1a">ДАЕАР</a>) при Институтот за етнологија и антропологија, Природно-математички факултет при УКИМ.</li>
        <li>Аудио верзија на меѓународното списание <a href="https://etno.pmf.ukim.mk/index.php/eaz/issue/archive">„ЕтноАнтропоЗум"</a> на Институтот за етнологија и антропологија, Природно-математички факултет при УКИМ.</li>
        <li>Аудио подкастот <a href="https://obicniluge.mk/episodes/">„Обични луѓе"</a> на Илина Јакимовска</li>
        <li>Научните видеа од серијалот <a href="http://naukazadeca.mk">„Наука за деца"</a>, фондација <a href="https://qantarot.substack.com/">КАНТАРОТ</a></li>
        <li>Македонска верзија на <a href="https://commonvoice.mozilla.org/en/datasets">Mozilla Common Voice</a> (верзија 19.0)</li>
        <li>Наставничката Валентина Степановска-Андонова од училиштето Даме Груев во Битола и нејзините ученици Ана Ванчевска, Драган Трајковски и Леона Аземовска.</li>
        <li>Учениците од Меѓународното училиште НОВА</li>
        <li>Радиолозите од болницата 8 Септември, предводени од Димитар Вељановски</li>
        <li>Дамјан Божиноски</li>
        <li>Иван Митревски</li>
        <li>Илија Глигоров</li>
        <li><a href="https://mirovnaakcija.org">Мировна Акција</a></li>
        <li><a href="https://sdk.mk/index.php/category/sakam_da_kazam/">Сакам да кажам</a></li>
        <li><a href="https://vidivaka.mk">Види Вака</a></li>
        <li><a href="https://www.tiktakaudio.com">ТикТак аудио</a></li>
    </ol>
</div>
'''

# Stylesheet passed to gr.Blocks(css=...): light grey page background, black
# 15px Arial text for the custom markdown, orange button text, and a
# header / main-content / footer ordering inside the flex column container.
# NOTE(review): the CSS comment inside the `.header` rule says "footer"; it
# appears to be copied from the `.footer` rule above it. It is left as-is
# because it is part of the runtime string.
css = """
.gradio-container {
    background-color: #f3f3f3 !important;
    display: flex;
    flex-direction: column;
}
.custom-markdown p, .custom-markdown li, .custom-markdown h2, .custom-markdown a, .custom-markdown strong {
    font-size: 15px !important;
    font-family: Arial, sans-serif !important;
    color: black !important;  /* Ensure text is black */
}
button {
    color: orange !important;
}
.header {
    order: 1;
    margin-bottom: 20px;
}
.main-content {
    order: 2;
}
.footer {
    order: 3;
    margin-top: 20px;
}
.footer h2, .footer li, strong {
    color: black !important;  /* Ensure footer text is also black */
}

.header h2, .header h4, .header li, strong {
    color: black !important;  /* Ensure footer text is also black */
}
"""


# Assemble the page: header HTML, the two transcription tabs, footer HTML.
transcriber_app = gr.Blocks(css=css, delete_cache=(60, 120))

with transcriber_app:
    gr.HTML(project_description_header)

    gr.TabbedInterface(
        [mic_transcribe_wav2vec2, file_transcribe_wav2vec2],
        ["Буки-w2v2 транскрипција од микрофон", "Буки-w2v2 транскрипција од фајл"],
    )

    gr.HTML(project_description_footer)

    # Per-session state; the callback logs when Gradio deletes it.
    state = gr.State(value=[], delete_callback=lambda v: print("STATE DELETED"))

    # `unload` listeners run without inputs when a user session ends, so they
    # should only release resources. The transcription generators were
    # previously registered here, which ran their "no audio" path (writing
    # empty.txt through a GPU-decorated function) on every tab close.
    transcriber_app.unload(clean_up_memory)


# For local debugging, launch(debug=True, share=True, ssl_verify=False) can be
# used instead of the plain launch() below.
if __name__ == "__main__":
    transcriber_app.queue()
    transcriber_app.launch()