File size: 3,555 Bytes
d23e0cd
6422215
d23e0cd
 
6422215
d23e0cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6422215
d23e0cd
6422215
d23e0cd
 
2a603ad
241d467
04dc82f
d23e0cd
 
 
 
 
 
 
 
6422215
 
d23e0cd
6422215
d23e0cd
 
2a603ad
241d467
04dc82f
d23e0cd
 
 
 
 
 
 
 
 
 
 
 
6422215
 
d23e0cd
6422215
 
 
 
 
d23e0cd
6422215
 
d23e0cd
6422215
d23e0cd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import torch
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
import gradio as gr

# Define model details
MODEL_NAME = "riteshkr/whisper-large-v3-quantized"  # Update with your actual model ID
BATCH_SIZE = 8  # Number of audio chunks the pipeline processes per forward pass

# Select device based on availability of CUDA (GPU) or fallback to CPU.
# transformers' pipeline() accepts an int GPU index (0 = first CUDA device)
# or the string "cpu".
device = 0 if torch.cuda.is_available() else "cpu"

# Load the ASR model pipeline once at import time so every request reuses it.
# NOTE(review): this downloads the model on first run — confirm network/disk
# access is acceptable in the deployment environment.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,  # Adjust as needed for your application
    device=device,
)

# Utility function to format timestamps
def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = "."):
    if seconds is not None:
        milliseconds = round(seconds * 1000.0)
        hours = milliseconds // 3_600_000
        milliseconds -= hours * 3_600_000
        minutes = milliseconds // 60_000
        milliseconds -= minutes * 60_000
        seconds = milliseconds // 1_000
        milliseconds -= seconds * 1_000
        hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
        return f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
    else:
        return seconds

# Transcription function for batch processing
def transcribe(files, task, return_timestamps):
    """Transcribe (or translate) one or more audio files with the ASR pipeline.

    Args:
        files: A single audio file path, or a list of paths. Both Gradio
            ``gr.Audio(type="filepath")`` inputs deliver a single path string
            (or ``None`` when nothing was recorded/uploaded), so bare strings
            are normalized here — iterating a raw string would otherwise call
            the pipeline once per character.
        task: "transcribe" or "translate", forwarded to the model via
            ``generate_kwargs``.
        return_timestamps: When truthy, each output line is prefixed with a
            "[start -> end]" range built by ``format_timestamp``.

    Returns:
        All transcriptions joined with blank lines; "" when no input is given.
    """
    if not files:
        # Nothing recorded/uploaded yet — return an empty result instead of crashing.
        return ""
    if isinstance(files, str):
        files = [files]

    transcriptions = []
    for file in files:  # Process each file in the batch
        outputs = pipe(file, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=return_timestamps)
        text = outputs["text"]
        if return_timestamps:
            timestamps = outputs["chunks"]
            formatted_chunks = [
                f"[{format_timestamp(chunk['timestamp'][0])} -> {format_timestamp(chunk['timestamp'][1])}] {chunk['text']}"
                for chunk in timestamps
            ]
            text = "\n".join(formatted_chunks)
        transcriptions.append(text)
    return "\n\n".join(transcriptions)  # Return all transcriptions combined

# Define Gradio interface for microphone input
mic_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources="microphone", type="filepath"),
        gr.Radio(["transcribe", "translate"], label="Task"),
        gr.Checkbox(label="Return timestamps"),
    ],
    outputs="text",
    layout="horizontal",
    title="Whisper Demo: Transcribe Audio",
    description=(
        f"Transcribe long-form microphone inputs with the {MODEL_NAME} model. Supports transcription and translation."
    ),
    allow_flagging="never",
)

# Define Gradio interface for file upload
file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources="upload", type="filepath", label="Upload Audio File"),
        gr.Radio(["transcribe", "translate"], label="Task"),
        gr.Checkbox(label="Return timestamps"),
    ],
    outputs="text",
    layout="horizontal",
    title="Whisper Demo: Transcribe Audio",
    description=(
        f"Upload audio files to transcribe or translate them using the {MODEL_NAME} model."
    ),
    allow_flagging="never",
    examples=[
        ["./example.flac", "transcribe", False],
        ["./example.flac", "transcribe", True],
    ],
)

# Assemble the app: a Blocks container hosting a tabbed view that switches
# between the microphone and file-upload interfaces.
with gr.Blocks() as demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Transcribe Microphone", "Transcribe Audio File"],
    )

# Launch the app only when run as a script (not on import).
# NOTE(review): `enable_queue` is version-sensitive in Gradio (removed in 4.x,
# replaced by .queue()) — confirm against the pinned Gradio version.
if __name__ == "__main__":
    demo.launch(debug=True, enable_queue=True, share=True)