import gradio as gr
import torch
from transformers import pipeline, WhisperForConditionalGeneration, WhisperProcessor

# Check if a GPU is available and set the device (0 = first CUDA GPU, -1 = CPU)
device = 0 if torch.cuda.is_available() else -1

# Load the ASR model and processor
model_id = "riteshkr/quantized-whisper-large-v3"
model = WhisperForConditionalGeneration.from_pretrained(model_id)
processor = WhisperProcessor.from_pretrained(model_id)

# Set the language to English using forced_decoder_ids
forced_decoder_ids = processor.get_decoder_prompt_ids(language="english", task="transcribe")
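
# Note: on recent transformers releases, forced_decoder_ids is deprecated for
# Whisper generation; the same effect can be achieved by passing
# generate_kwargs={"language": "english", "task": "transcribe"} at call time.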

# Create the pipeline, explicitly setting the tokenizer and feature extractor
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,  # Use the processor's tokenizer
    feature_extractor=processor.feature_extractor,  # Use the processor's feature extractor
    device=device,
)
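
# Quick sanity check of the raw pipeline (hypothetical audio path, not part of
# the app itself; uncomment to try locally):
# print(pipe("sample.wav")["text"])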

# Define the transcription function
def transcribe_speech(filepath):
    batch_size = 16 if torch.cuda.is_available() else 4
    output = pipe(
        filepath,
        max_new_tokens=256,
        generate_kwargs={
            "forced_decoder_ids": forced_decoder_ids,  # Set language through forced_decoder_ids
        },
        chunk_length_s=30,
        batch_size=batch_size,  # Dynamic batch size
    )
    return output["text"]
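
# Example usage outside Gradio (hypothetical path; gr.Audio with type="filepath"
# hands this function a similar temporary file path):
# text = transcribe_speech("speech.wav")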

# Define the Gradio interface for microphone input
mic_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=gr.Textbox(),
)

# Define the Gradio interface for file upload input
file_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources=["upload"], type="filepath"),
    outputs=gr.Textbox(),
)

# Create the tabbed layout using Blocks
demo = gr.Blocks()
with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Transcribe Microphone", "Transcribe Audio File"],
    )

# Launch the app with debugging enabled; share=True also requests a temporary public link
if __name__ == "__main__":
    demo.launch(debug=True, share=True)