# Page-header residue from where this file was captured (not program code): "Spaces: Runtime error".
import os

import gradio as gr
import torchaudio
from transformers import (
    WhisperFeatureExtractor,
    WhisperForConditionalGeneration,
    WhisperProcessor,
)
# Checkpoint to serve. The value is used as written when it names a directory
# relative to the working directory. Otherwise it has to be a Hub repo id, and
# those take the form "namespace/name", so the leading "models/" segment is
# dropped instead of letting from_pretrained() fail on an invalid id.
mdl = "models/amithm3/whisper-medium"
if not os.path.isdir(mdl):
    mdl = mdl.split("/", 1)[1]

# Processor used by transcribe() for both feature extraction and decoding.
processor = WhisperProcessor.from_pretrained(mdl, task="transcribe")
# Not referenced by the code visible in this file; kept so the module-level
# name stays available to anything else that relies on it.
feature_extractor = WhisperFeatureExtractor.from_pretrained(mdl, task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained(mdl)

# Sample rate (Hz) that transcribe() resamples every clip to before feature
# extraction.
sampling_rate = 16000
def transcribe(audio, language):
    """Transcribe one audio clip with the loaded Whisper model.

    Args:
        audio: Path to the audio file to transcribe (the interface is
            configured to pass a filepath). None is rejected with an error;
            presumably that is what the UI sends when no clip was recorded
            or uploaded.
        language: Language name to decode in (for example "kannada"), or
            None to leave the language unset on the generation config.

    Returns:
        The decoded text of the clip, with special tokens stripped.

    Raises:
        gr.Error: If ``audio`` is None.
    """
    if audio is None:
        # Fail with a readable message instead of an opaque loader exception.
        raise gr.Error("No audio provided. Please record or upload a clip first.")

    waveform, orig_freq = torchaudio.load(audio)

    # Average the channels down to a single 1-D signal. squeeze() only removes
    # a channel axis of size one, so a stereo clip would stay 2-D and be read
    # by the feature extractor as a batch of separate clips, of which only the
    # first would be returned below.
    waveform = waveform.mean(dim=0)
    waveform = torchaudio.functional.resample(
        waveform, orig_freq=orig_freq, new_freq=sampling_rate
    )

    input_features = processor(
        waveform.numpy(), sampling_rate=sampling_rate, return_tensors="pt"
    ).input_features

    # The language is selected by writing it onto the model's generation
    # config, which generate() reads; this is module-level state shared by
    # every call.
    model.generation_config.language = language
    predicted_ids = model.generate(input_features)

    # A single clip was submitted, so the first decoded string is the result.
    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
# Web UI: one audio input (handed to transcribe() as a file path) plus a
# language selector, with the transcription shown as plain text.
iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(type="filepath"),
        # The None choice is passed through to transcribe() as the language.
        gr.Dropdown(["kannada", "english", None], label="Language", value="kannada"),
    ],
    outputs="text",
    title="Whisper Medium Indic",
    description="Realtime demo for Indic speech recognition using a fine-tuned Whisper Medium model.",
)

# Start serving the interface as soon as the script runs.
iface.launch()