import gradio as gr
import torch
from transformers import pipeline, WhisperForConditionalGeneration, WhisperProcessor

# Check if a GPU is available and set the device
device = 0 if torch.cuda.is_available() else -1

# Load the ASR model and processor
model_id = "riteshkr/quantized-whisper-large-v3"
model = WhisperForConditionalGeneration.from_pretrained(model_id)
processor = WhisperProcessor.from_pretrained(model_id)

# Set the language to English using forced_decoder_ids
forced_decoder_ids = processor.get_decoder_prompt_ids(language="english", task="transcribe")

# Create the pipeline, explicitly setting the tokenizer and feature extractor
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,  # Use the processor's tokenizer
    feature_extractor=processor.feature_extractor,  # Use the processor's feature extractor
    device=device,
)

# Define the transcription function
def transcribe_speech(filepath):
    # Use a larger batch size when a GPU is available
    batch_size = 16 if torch.cuda.is_available() else 4
    output = pipe(
        filepath,
        max_new_tokens=256,
        generate_kwargs={
            "forced_decoder_ids": forced_decoder_ids,  # Set the language via forced_decoder_ids
        },
        chunk_length_s=30,  # Process long audio in 30-second chunks
        batch_size=batch_size,
    )
    return output["text"]

# Define the Gradio interface for microphone input
mic_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs=gr.Textbox(),
)

# Define the Gradio interface for file upload input
file_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs=gr.Textbox(),
)

# Create the tabbed layout using Blocks
demo = gr.Blocks()

with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Transcribe Microphone", "Transcribe Audio File"],
    )

# Launch the app with debugging enabled
if __name__ == "__main__":
    demo.launch(debug=True, share=True)