import torch
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
import gradio as gr
# Define model details
MODEL_NAME = "riteshkr/whisper-large-v3-quantized" # Update with your actual model ID
BATCH_SIZE = 8
# Select GPU (device 0) if CUDA is available, otherwise fall back to CPU
device = 0 if torch.cuda.is_available() else "cpu"
# Load the ASR model pipeline
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,  # Adjust as needed for your application
    device=device,
)
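# chunk_length_s=30 makes the pipeline split long recordings into 30-second windows
# (Whisper's native context length) and merge the chunk outputs, which is what
# enables long-form transcription in this demo.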
# Utility function to format timestamps
def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = "."):
    if seconds is not None:
        milliseconds = round(seconds * 1000.0)
        hours = milliseconds // 3_600_000
        milliseconds -= hours * 3_600_000
        minutes = milliseconds // 60_000
        milliseconds -= minutes * 60_000
        seconds = milliseconds // 1_000
        milliseconds -= seconds * 1_000
        hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
        return f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
    else:
        return seconds
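# Example (for illustration): format_timestamp(3723.5) returns "01:02:03.500".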
# Transcription function for batch processing
def transcribe(files, task, return_timestamps):
    # gr.Audio with type="filepath" passes a single path string, so wrap it in a
    # list to keep the loop below working for both single and batched inputs.
    if isinstance(files, str):
        files = [files]
    transcriptions = []
    for file in files:  # Process each file in the batch
        outputs = pipe(file, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=return_timestamps)
        text = outputs["text"]
        if return_timestamps:
            timestamps = outputs["chunks"]
            formatted_chunks = [
                f"[{format_timestamp(chunk['timestamp'][0])} -> {format_timestamp(chunk['timestamp'][1])}] {chunk['text']}"
                for chunk in timestamps
            ]
            text = "\n".join(formatted_chunks)
        transcriptions.append(text)
    return "\n\n".join(transcriptions)  # Return all transcriptions combined
# Define Gradio interface for microphone input
mic_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources="microphone", type="filepath"),
        gr.Radio(["transcribe", "translate"], label="Task"),
        gr.Checkbox(label="Return timestamps"),
    ],
    outputs="text",
    layout="horizontal",
    title="Whisper Demo: Transcribe Audio",
    description=(
        f"Transcribe long-form microphone inputs with the {MODEL_NAME} model. Supports transcription and translation."
    ),
    allow_flagging="never",
)
# Define Gradio interface for file upload
file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources="upload", type="filepath", label="Upload Audio File"),
        gr.Radio(["transcribe", "translate"], label="Task"),
        gr.Checkbox(label="Return timestamps"),
    ],
    outputs="text",
    layout="horizontal",
    title="Whisper Demo: Transcribe Audio",
    description=(
        f"Upload audio files to transcribe or translate them using the {MODEL_NAME} model."
    ),
    allow_flagging="never",
    examples=[
        ["./example.flac", "transcribe", False],
        ["./example.flac", "transcribe", True],
    ],
)
# Create the Gradio tabbed interface for switching between modes
demo = gr.Blocks()

with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Transcribe Microphone", "Transcribe Audio File"],
    )
# Launch the app
if __name__ == "__main__":
    demo.launch(debug=True, enable_queue=True, share=True)
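# Running `python app.py` serves the demo locally; share=True additionally requests a
# temporary public gradio.live URL so the app can be tested remotely.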