"""Gradio demo: Ukrainian automatic speech recognition.

Loads a wav2vec2 XLS-R model with an LM-boosted decoder and serves a
microphone-in / transcript-out web interface.
"""

from transformers import pipeline, Wav2Vec2ProcessorWithLM
from librosa import to_mono, resample
import numpy as np
import gradio as gr

DESC = """\
Ukrainian speech recognition app/ Розпізнавання голосу для української мови
"""

# wav2vec2 XLS-R 1B fine-tuned for Ukrainian; the processor bundles the
# feature extractor and a kenLM beam-search decoder.
model_id = "arampacha/wav2vec2-xls-r-1b-uk"
processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_id)
asr = pipeline(
    "automatic-speech-recognition",
    model=model_id,
    device=-1,  # CPU inference
    feature_extractor=processor.feature_extractor,
    decoder=processor.decoder,  # LM-boosted decoding instead of greedy CTC
)

TARGET_SR = 16_000  # sampling rate the wav2vec2 model was trained on


def run_asr(audio):
    """Transcribe a (sample_rate, waveform) pair from the Gradio microphone.

    Parameters
    ----------
    audio : tuple[int, np.ndarray] | None
        Sample rate and PCM samples as delivered by the ``"microphone"``
        input component; ``None`` when the user submits without recording.

    Returns
    -------
    str
        The decoded transcript (empty string when no audio was recorded).

    Raises
    ------
    ValueError
        If the recording has more than two channels.
    """
    if audio is None:  # submit pressed with no recording — avoid unpacking None
        return ""
    sr, audio_array = audio
    # Gradio delivers integer PCM (typically int16). Scale to the [-1, 1]
    # float range the HF audio pipeline documents for raw waveforms; the
    # feature extractor re-normalizes, so the transcript is unchanged.
    if np.issubdtype(audio_array.dtype, np.integer):
        audio_array = audio_array.astype(np.float32) / np.iinfo(audio_array.dtype).max
    else:
        audio_array = audio_array.astype(np.float32)
    # Down-mix to mono: squeeze a trivial channel axis, average stereo.
    if audio_array.ndim > 1:
        if audio_array.shape[1] == 1:
            audio_array = audio_array.squeeze()
        elif audio_array.shape[1] == 2:
            audio_array = to_mono(audio_array.T)
        else:
            raise ValueError("Audio with > 2 channels not supported")
    if sr != TARGET_SR:
        audio_array = resample(audio_array, orig_sr=sr, target_sr=TARGET_SR)
    # Chunked inference bounds memory on long recordings; the 2 s stride
    # overlaps chunks so words at chunk seams are not cut off.
    res = asr(audio_array, chunk_length_s=20, stride_length_s=2)
    return res["text"]


# NOTE(review): gr.outputs.*, layout= and theme= are the legacy Gradio
# 2.x/3.x API; migrate to gr.Textbox(...) and drop layout/theme if the
# gradio dependency is ever upgraded.
text_out = gr.outputs.Textbox(label="transcript")
interface = gr.Interface(
    run_asr,
    "microphone",
    text_out,
    layout="horizontal",
    theme="huggingface",
    title="Speech-to-text Ukrainian",
    description=DESC,
    flagging_options=["incorrect"],
    examples=["examples/dobryi_ranok.wav"],
)

# Guard the launch so importing this module (e.g. for testing) does not
# start a blocking web server.
if __name__ == "__main__":
    interface.launch(debug=True)