"""Gradio demo: Albanian speech recognition with a fine-tuned Whisper model."""

from typing import Optional

import gradio as gr
import librosa
import numpy as np
from transformers import (
    WhisperForConditionalGeneration,
    WhisperProcessor,
    pipeline,
)

# Hugging Face Hub id of the checkpoint used for both processor and model.
MODEL_ID = "kadriu/whisper-turbo-sq"

# Audio is resampled to this rate on load and the same value is passed to the
# processor, so the two always agree.
SAMPLE_RATE = 16000

# Load the processor (feature extractor + tokenizer) and the model once at
# import time so every request reuses them.
processor = WhisperProcessor.from_pretrained(MODEL_ID)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID)


def transcribe(audio: Optional[str]) -> str:
    """Transcribe a recorded audio file to text.

    Args:
        audio: Path to the audio file handed over by the Gradio ``Audio``
            component (``type="filepath"``). ``None`` or an empty string
            means nothing was recorded.

    Returns:
        The decoded transcription, or an empty string when no audio was
        provided.
    """
    # Submitting without a recording yields no file path; loading it would
    # raise, so answer with an empty transcription instead.
    if not audio:
        return ""

    # Decode the file and resample it to SAMPLE_RATE.
    waveform, _ = librosa.load(audio, sr=SAMPLE_RATE)

    input_features = processor(
        waveform,
        sampling_rate=SAMPLE_RATE,
        return_tensors="pt",
    ).input_features

    # Generate token ids, then decode them to text without special tokens.
    predicted_ids = model.generate(input_features)
    transcriptions = processor.batch_decode(
        predicted_ids,
        skip_special_tokens=True,
    )

    # batch_decode returns one string per batch item; a single clip was fed
    # in, so hand back that one string rather than the enclosing list.
    return transcriptions[0] if transcriptions else ""


iface = gr.Interface(
    fn=transcribe,
    # NOTE(review): `source=` is the Gradio 3.x keyword; newer releases appear
    # to expect `sources=["microphone"]` -- confirm against the pinned version.
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs="text",
    title="ASR Albanian",
    description="Realtime demo for Sq speech recognition",
)

iface.launch(share=True)