# Hugging Face Space app (runs on ZeroGPU): Whisper large-v3-turbo transcription demo.
# (Scraped page chrome — Space status, commit hashes, and line-number gutter — removed.)
import spaces
import torch
import gradio as gr
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
import tempfile
import os
import json
from pydub import AudioSegment
import math
#===============
# Define main parameters
#===============
MODEL_NAME = "openai/whisper-large-v3-turbo"  # HF hub checkpoint used by the ASR pipeline
BATCH_SIZE = 8  # batch size passed to the pipeline per call
FILE_LIMIT_MB = 1000  # upload size cap (MB); NOTE(review): not enforced anywhere visible here
YT_LENGTH_LIMIT_S = 3600 # limit to 1 hour YouTube files
# Use GPU 0 when CUDA is available, otherwise fall back to CPU.
device = 0 if torch.cuda.is_available() else "cpu"
# Module-level pipeline: the model is downloaded/loaded once at import time.
# chunk_length_s=30 enables transformers' internal 30 s chunked long-form decoding.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)
#===============
# Main functions
#===============
# Function to split the audio into chunks
def split_audio(audio_file, chunk_length_ms):
    """Split an audio file into fixed-length WAV chunks on disk.

    Parameters:
        audio_file: path to the input audio (any format pydub/ffmpeg can read).
        chunk_length_ms: target length of each chunk, in milliseconds.

    Returns:
        List of ``(chunk_wav_path, start_time_ms)`` pairs, in order. The
        caller is responsible for deleting the temp files when done.
    """
    audio = AudioSegment.from_file(audio_file)
    duration_ms = len(audio)
    num_chunks = math.ceil(duration_ms / chunk_length_ms)
    chunks = []
    for i in range(num_chunks):
        start_time = i * chunk_length_ms
        end_time = min((i + 1) * chunk_length_ms, duration_ms)
        # mkstemp + close instead of NamedTemporaryFile(delete=False): the
        # original never closed the handle, leaking one open fd per chunk
        # (and re-opening an open file fails on Windows).
        fd, chunk_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        audio[start_time:end_time].export(chunk_path, format="wav")
        chunks.append((chunk_path, start_time))  # chunk path and its start offset
    return chunks
def _shifted_timestamps(segments, offset_ms=0):
    """Convert pipeline timestamp segments to dicts, shifted by offset_ms.

    Parameters:
        segments: the ``result["chunks"]`` list from the ASR pipeline; each
            item carries a ``"timestamp"`` (start, end) pair in seconds and
            a ``"text"`` string.
        offset_ms: position of this audio chunk within the full file, in
            milliseconds (0 for an unchunked file).

    Returns:
        List of ``{"start", "end", "text"}`` dicts in full-file seconds.
    """
    offset_s = offset_ms / 1000
    shifted = []
    for segment in segments:
        start_s, end_s = segment["timestamp"]
        shifted.append({
            "start": start_s + offset_s,
            # Whisper can emit None as the final end timestamp; the original
            # arithmetic would raise TypeError in the chunked branch.
            "end": None if end_s is None else end_s + offset_s,
            "text": segment["text"],
        })
    return shifted


@spaces.GPU
def transcribe(audio_file, task, language, keywords, chunk_length_s=30):
    """Transcribe or translate an audio file and export the results.

    Parameters:
        audio_file: path to the uploaded/recorded audio file.
        task: ``"transcribe"`` or ``"translate"`` (Whisper generate kwarg).
        language: spoken-language name (Whisper generate kwarg).
        keywords: free-text keywords from the UI — currently unused; kept so
            the Gradio input signature stays stable.
        chunk_length_s: chunk size in seconds used only when the file exceeds
            the 40-minute threshold and must be split.

    Returns:
        ``(preview_text, json_file_path, txt_file_path)``: a <=200-character
        preview, plus paths to the JSON (text + timestamps) and TXT exports.

    Raises:
        gr.Error: if no audio file was submitted.
    """
    if audio_file is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    # Load via pydub only to measure the length in milliseconds.
    audio_length_ms = len(AudioSegment.from_file(audio_file))

    # Files longer than 40 minutes are split into chunks to bound GPU memory.
    chunk_threshold_ms = 40 * 60 * 1000

    if audio_length_ms > chunk_threshold_ms:
        pieces = []
        all_timestamps = []
        for chunk_path, chunk_start_ms in split_audio(audio_file, chunk_length_s * 1000):
            try:
                result = pipe(
                    chunk_path,
                    batch_size=BATCH_SIZE,
                    generate_kwargs={"task": task, "language": language},
                    return_timestamps=True,
                )
            finally:
                # The original leaked one temp .wav per chunk; always clean up.
                os.remove(chunk_path)
            pieces.append(result["text"])
            # Shift each segment by the chunk's position in the full audio.
            all_timestamps.extend(_shifted_timestamps(result["chunks"], chunk_start_ms))
        all_text = " ".join(pieces)
    else:
        # Short enough to process in a single pipeline call.
        result = pipe(
            audio_file,
            batch_size=BATCH_SIZE,
            generate_kwargs={"task": task, "language": language},
            return_timestamps=True,
        )
        all_text = result["text"]
        all_timestamps = _shifted_timestamps(result["chunks"])

    # First 200 characters for on-screen preview.
    preview_text = all_text[:200] + "..." if len(all_text) > 200 else all_text

    full_transcription = {
        "text": all_text,
        "timestamps": all_timestamps,
    }

    base_name = os.path.splitext(os.path.basename(audio_file))[0]

    # JSON export: full text plus per-segment timestamps. ensure_ascii=False
    # keeps accented characters (e.g. Spanish) readable in the download.
    json_file_path = os.path.join(tempfile.gettempdir(), f"{base_name}_transcription.json")
    with open(json_file_path, "w", encoding="utf-8") as json_file:
        json.dump(full_transcription, json_file, ensure_ascii=False, indent=2)

    # Plain-text export.
    txt_file_path = os.path.join(tempfile.gettempdir(), f"{base_name}_transcription.txt")
    with open(txt_file_path, "w", encoding="utf-8") as txt_file:
        txt_file.write(all_text)

    return preview_text, json_file_path, txt_file_path
#===============
# Build the frontend
#===============
# Single-tab transcription UI. Inputs map positionally onto transcribe():
# (audio_file, task, language, keywords) — chunk_length_s keeps its default.
file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources="upload", type="filepath", label="Audio file"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
        gr.Dropdown(["spanish", "english"], label="Language", info="Will add more later!", value="spanish"),
        # NOTE(review): keywords are collected here but transcribe() does not
        # use them yet — presumably reserved for prompt biasing; confirm.
        gr.Textbox(lines=10, label="Keywords"),
    ],
    outputs=[
        gr.Textbox(label="Preview (first 200 characters)"),
        gr.File(label="Download full transcription as JSON"),
        gr.File(label="Download transcription as TXT")
    ],
    title="Transcribe Audio",
    description=(
        "Transcribe audio inputs with the click of a button! Demo uses the"
        f" checkpoint [{MODEL_NAME}](https://huggingface.co./{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
        " of **at most 40 minutes**. Support for longer audio files will soon come."
    ),
    allow_flagging="never",
)
#===============
# Launch
#===============
# Wrap the interface in a tabbed Blocks app; extra tabs (e.g. microphone
# input) can be appended later without changing the launch code.
demo = gr.Blocks(theme=gr.themes.Ocean())
with demo:
    gr.TabbedInterface([file_transcribe], ["Audio file"])
# Stray trailing "|" (scrape artifact) removed — it was a syntax error.
demo.queue().launch(ssr_mode=False)