asr-sq / app.py
kadriu's picture
Update app.py
c43790f verified
from transformers import pipeline
import gradio as gr
import librosa
import numpy as np
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
# Hub checkpoint shared by the processor and the model; kept in one place so
# the two can never be loaded from different repositories by accident.
MODEL_ID = "kadriu/whisper-turbo-sq"

# The processor turns raw audio into model input features and decodes token
# ids back to text; the model generates the token ids.
processor = WhisperProcessor.from_pretrained(MODEL_ID)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID)
# NOTE: the high-level `pipeline` helper is intentionally not used here;
# transcription calls `processor` and `model` directly.
def transcribe(audio):
    """Transcribe an audio file to text with the loaded Whisper model.

    Args:
        audio: Path to the audio file to transcribe. A falsy value
            (``None`` or an empty string) is treated as "no recording".

    Returns:
        The transcription as a single string, or an empty string when no
        audio was supplied.
    """
    # Guard against a submit without a recording instead of letting
    # librosa fail on a missing path.
    if not audio:
        return ""

    sample_rate = 16000
    # Load (and resample) at the same rate that is declared to the processor.
    waveform, _ = librosa.load(audio, sr=sample_rate)
    input_features = processor(
        waveform, sampling_rate=sample_rate, return_tensors="pt"
    ).input_features

    # Generate token ids, then decode them to text without special tokens.
    predicted_ids = model.generate(input_features)
    decoded = processor.batch_decode(predicted_ids, skip_special_tokens=True)

    # batch_decode yields one string per batch item; a single clip was passed,
    # so return that string rather than the list (a list would be rendered
    # as "['...']" by a text output).
    return decoded[0] if decoded else ""
# Web UI: one microphone recording in (handed to `transcribe` as a file
# path), one text field out.
iface = gr.Interface(
    title="ASR Albanian",
    description="Realtime demo for Sq speech recognition",
    fn=transcribe,
    inputs=gr.Audio(type="filepath", source="microphone"),
    outputs="text",
)

# Start the app; share=True asks Gradio for a public share link as well.
iface.launch(share=True)