import gradio as gr
import torch
from transformers import pipeline

# Check if a GPU is available and set the device
device = 0 if torch.cuda.is_available() else -1

# Load the ASR model using the Hugging Face pipeline
model_id = "riteshkr/quantized-whisper-large-v3"  # Update with your model path or ID
pipe = pipeline("automatic-speech-recognition", model=model_id, device=device)


# Define the transcription function with batching support
def transcribe_speech(filepath):
    # Adjust batch size based on device (smaller batch for CPU)
    batch_size = 16 if torch.cuda.is_available() else 4
    output = pipe(
        filepath,
        max_new_tokens=256,
        generate_kwargs={
            "task": "transcribe",
            "language": "english",
        },
        chunk_length_s=30,
        batch_size=batch_size,  # Dynamic batch size
    )
    return output["text"]


# Define the Gradio interface for microphone input
mic_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs=gr.Textbox(),
)

# Define the Gradio interface for file upload input
file_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs=gr.Textbox(),
)

# Create the tabbed layout using Blocks
demo = gr.Blocks()

with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Transcribe Microphone", "Transcribe Audio File"],
    )

# Launch the app with debugging enabled
if __name__ == "__main__":
    demo.launch(debug=True, share=True)