# Gradio demo: English speech-to-text using a quantized Whisper large-v3 model.
import gradio as gr
import torch
from transformers import pipeline, WhisperForConditionalGeneration, WhisperProcessor
# Check if a GPU is available and set the device
# (transformers pipelines expect 0 for the first CUDA device, -1 for CPU).
device = 0 if torch.cuda.is_available() else -1
# Load the ASR model and processor
# NOTE(review): from_pretrained downloads weights on first run — network required.
model_id = "riteshkr/quantized-whisper-large-v3"
model = WhisperForConditionalGeneration.from_pretrained(model_id)
processor = WhisperProcessor.from_pretrained(model_id)
# Set the language to English using forced_decoder_ids
# so decoding is pinned to English transcription instead of auto-detection.
forced_decoder_ids = processor.get_decoder_prompt_ids(language="english", task="transcribe")
# Create the pipeline, explicitly setting the tokenizer and feature extractor
# (required when passing a model object rather than a model id string).
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,  # Use the processor's tokenizer
    feature_extractor=processor.feature_extractor,  # Use the processor's feature extractor
    device=device
)
def transcribe_speech(filepath):
    """Transcribe the audio file at *filepath* and return the English text.

    Runs the module-level ASR pipeline with English forced via
    ``forced_decoder_ids``; long audio is processed in 30-second chunks.
    """
    # Larger batches only pay off when a CUDA device is present.
    if torch.cuda.is_available():
        chunk_batch = 16
    else:
        chunk_batch = 4
    result = pipe(
        filepath,
        max_new_tokens=256,
        generate_kwargs={
            "forced_decoder_ids": forced_decoder_ids,  # Set language through forced_decoder_ids
        },
        chunk_length_s=30,
        batch_size=chunk_batch,  # Dynamic batch size
    )
    return result["text"]
def _build_transcriber_ui(audio_source):
    """Return a Gradio interface that feeds *audio_source* audio to transcribe_speech."""
    return gr.Interface(
        fn=transcribe_speech,
        inputs=gr.Audio(sources=audio_source, type="filepath"),
        outputs=gr.Textbox(),
    )

# One UI fed by the microphone, one by uploaded audio files.
mic_transcribe = _build_transcriber_ui("microphone")
file_transcribe = _build_transcriber_ui("upload")
# Assemble both interfaces into a single tabbed app.
with gr.Blocks() as demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Transcribe Microphone", "Transcribe Audio File"],
    )

# Launch with debugging enabled and a public share link when run as a script.
if __name__ == "__main__":
    demo.launch(debug=True, share=True)