from transformers import WhisperProcessor, WhisperForConditionalGeneration
import gradio as gr
import torchaudio

mdl = "models/amithm3/whisper-medium"

# The processor bundles the tokenizer and feature extractor, so a separate
# WhisperFeatureExtractor is not needed.
processor = WhisperProcessor.from_pretrained(mdl, task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained(mdl)

# Whisper expects 16 kHz mono audio.
sampling_rate = 16000


def transcribe(audio, language):
    if audio is None:
        return ""
    # Load the uploaded file and resample to the rate Whisper was trained on.
    waveform, orig_freq = torchaudio.load(audio)
    waveform = torchaudio.functional.resample(waveform, orig_freq=orig_freq, new_freq=sampling_rate)
    # Downmix multichannel recordings to mono (squeeze alone would leave a
    # stereo tensor two-dimensional).
    waveform = waveform.mean(dim=0).numpy()
    # Convert raw audio to log-mel input features.
    input_features = processor(waveform, sampling_rate=sampling_rate, return_tensors="pt").input_features
    # language=None lets Whisper auto-detect the spoken language.
    model.generation_config.language = language
    predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return transcription


iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(type="filepath"),
        gr.Dropdown(["kannada", "english", None], label="Language", value="kannada"),
    ],
    outputs="text",
    title="Whisper Medium Indic",
    description="Realtime demo for Indic speech recognition using a fine-tuned Whisper Medium model.",
)

iface.launch()
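
# A minimal sanity check you can run without the UI, assuming an audio file
# exists at the hypothetical path "sample.wav" (not part of this repo).
# Place it above iface.launch() so it runs before the server blocks:
#
#     print(transcribe("sample.wav", "kannada"))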