"""Gradio demo: Ukrainian automatic speech recognition.

Loads a wav2vec2 XLS-R model with an LM-boosted decoder and serves a
microphone-in / transcript-out web interface.
"""

from transformers import pipeline, Wav2Vec2ProcessorWithLM
from librosa import to_mono, resample
import numpy as np
import gradio as gr

DESC = """\
Ukrainian speech recognition app/ Розпізнавання голосу для української мови
"""

# wav2vec2 XLS-R 1B fine-tuned for Ukrainian; the processor bundles the
# feature extractor and a kenLM beam-search decoder.
model_id = "arampacha/wav2vec2-xls-r-1b-uk"
processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_id)
asr = pipeline(
    "automatic-speech-recognition",
    model=model_id,
    device=-1,  # CPU inference
    feature_extractor=processor.feature_extractor,
    decoder=processor.decoder,  # LM-boosted decoding instead of greedy CTC
)

TARGET_SR = 16_000  # sampling rate the wav2vec2 model was trained on


def run_asr(audio):
    """Transcribe a (sample_rate, waveform) pair from the Gradio microphone.

    Parameters
    ----------
    audio : tuple[int, np.ndarray] | None
        Sample rate and PCM samples as delivered by the ``"microphone"``
        input component; ``None`` when the user submits without recording.

    Returns
    -------
    str
        The decoded transcript (empty string when no audio was recorded).

    Raises
    ------
    ValueError
        If the recording has more than two channels.
    """
    if audio is None:  # submit pressed with no recording — avoid unpacking None
        return ""
    sr, audio_array = audio
    # Gradio delivers integer PCM (typically int16). Scale to the [-1, 1]
    # float range the HF audio pipeline documents for raw waveforms; the
    # feature extractor re-normalizes, so the transcript is unchanged.
    if np.issubdtype(audio_array.dtype, np.integer):
        audio_array = audio_array.astype(np.float32) / np.iinfo(audio_array.dtype).max
    else:
        audio_array = audio_array.astype(np.float32)
    # Down-mix to mono: squeeze a trivial channel axis, average stereo.
    if audio_array.ndim > 1:
        if audio_array.shape[1] == 1:
            audio_array = audio_array.squeeze()
        elif audio_array.shape[1] == 2:
            audio_array = to_mono(audio_array.T)
        else:
            raise ValueError("Audio with > 2 channels not supported")
    if sr != TARGET_SR:
        audio_array = resample(audio_array, orig_sr=sr, target_sr=TARGET_SR)
    # Chunked inference bounds memory on long recordings; the 2 s stride
    # overlaps chunks so words at chunk seams are not cut off.
    res = asr(audio_array, chunk_length_s=20, stride_length_s=2)
    return res["text"]


# NOTE(review): gr.outputs.*, layout= and theme= are the legacy Gradio
# 2.x/3.x API; migrate to gr.Textbox(...) and drop layout/theme if the
# gradio dependency is ever upgraded.
text_out = gr.outputs.Textbox(label="transcript")
interface = gr.Interface(
    run_asr,
    "microphone",
    text_out,
    layout="horizontal",
    theme="huggingface",
    title="Speech-to-text Ukrainian",
    description=DESC,
    flagging_options=["incorrect"],
    examples=["examples/dobryi_ranok.wav"],
)

# Guard the launch so importing this module (e.g. for testing) does not
# start a blocking web server.
if __name__ == "__main__":
    interface.launch(debug=True)