File size: 2,045 Bytes
7754ed5
8c140fb
912008d
6422215
8c140fb
 
 
09b9573
912008d
 
 
7d4a692
912008d
 
 
09b9573
 
 
 
 
 
 
 
912008d
 
7d4a692
8c140fb
912008d
7d4a692
 
 
 
912008d
8c140fb
7d4a692
8c140fb
7d4a692
 
7754ed5
7d4a692
6422215
7d4a692
 
 
6422215
 
7d4a692
6422215
7d4a692
 
 
6422215
 
7d4a692
42de01f
 
6422215
 
42de01f
7d4a692
6422215
 
7d4a692
42de01f
7d4a692
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import gradio as gr
import torch
from transformers import pipeline, WhisperForConditionalGeneration, WhisperProcessor

# Select GPU device index 0 when CUDA is present, otherwise CPU (-1),
# matching the integer convention used by transformers pipelines.
device = 0 if torch.cuda.is_available() else -1

# Pull down the quantized Whisper checkpoint together with its processor.
model_id = "riteshkr/quantized-whisper-large-v3"
model = WhisperForConditionalGeneration.from_pretrained(model_id)
processor = WhisperProcessor.from_pretrained(model_id)

# Force English transcription by priming the decoder prompt ids.
forced_decoder_ids = processor.get_decoder_prompt_ids(language="english", task="transcribe")

# Assemble the ASR pipeline, wiring in the processor's tokenizer and
# feature extractor explicitly rather than letting the pipeline guess them.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=device,
)

# Define the transcription function
def transcribe_speech(filepath):
    """Transcribe the audio file at *filepath* to English text.

    Parameters
    ----------
    filepath : str | None
        Path to an audio file. Gradio's Audio component passes None when
        the user submits without recording or uploading anything.

    Returns
    -------
    str
        The transcription text, or an explanatory message when no audio
        was provided.
    """
    # Guard first: passing None through to the pipeline raises an opaque
    # error deep inside transformers instead of telling the user what's wrong.
    if filepath is None:
        return "No audio provided. Please record or upload an audio file."

    # Larger batches are affordable on GPU; stay conservative on CPU.
    batch_size = 16 if torch.cuda.is_available() else 4

    output = pipe(
        filepath,
        max_new_tokens=256,
        generate_kwargs={
            "forced_decoder_ids": forced_decoder_ids,  # Set language through forced_decoder_ids
        },
        chunk_length_s=30,  # split long audio into 30-second windows
        batch_size=batch_size,  # Dynamic batch size
    )
    return output["text"]

def _build_interface(audio_source):
    """Return a Gradio interface that transcribes audio from *audio_source*."""
    return gr.Interface(
        fn=transcribe_speech,
        inputs=gr.Audio(sources=audio_source, type="filepath"),
        outputs=gr.Textbox(),
    )

# One interface per input modality: live microphone and file upload.
mic_transcribe = _build_interface("microphone")
file_transcribe = _build_interface("upload")

# Lay both interfaces out as tabs inside a single Blocks app.
with gr.Blocks() as demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Transcribe Microphone", "Transcribe Audio File"],
    )

if __name__ == "__main__":
    # debug=True surfaces tracebacks in the console; share=True opens a public link.
    demo.launch(debug=True, share=True)