Spaces:
Running
on
Zero
Running
on
Zero
File size: 6,788 Bytes
5d52c32 6c226f9 8e787d3 6c226f9 d790c0b 88183ad 6c226f9 a8d52d3 9d6fa91 66efbc3 d790c0b 6c226f9 3d0dfdb abba8cd 4c37d8b abba8cd 857ff61 abba8cd 3c0cd8e 6c226f9 3d0dfdb 6c226f9 abba8cd 6722e17 abba8cd 56f07a2 a3abe32 abba8cd 6722e17 abba8cd 6c226f9 d790c0b 3c0cd8e d790c0b 3c0cd8e d790c0b 3c0cd8e d790c0b 3c0cd8e d790c0b 3c0cd8e d790c0b 3c0cd8e d790c0b abba8cd 29f2174 66efbc3 d790c0b abba8cd 6c226f9 de528b8 6c226f9 47407ef 6c226f9 29f2174 3c0cd8e 03ca7c2 77c67ca 29f2174 3c0cd8e 29f2174 6c226f9 de528b8 29f2174 6c226f9 b95b5ca 6c226f9 7097513 29f2174 7097513 de528b8 29f2174 6c226f9 b95b5ca 3d0dfdb 6c226f9 3c0cd8e 6c226f9 47407ef 7097513 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 |
import html
import os
import tempfile
import time

# Third-party imports; original relative order preserved.
import spaces
import torch
import gradio as gr
import yt_dlp as youtube_dl
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
# Hub id of the checkpoint loaded into the ASR pipeline below.
MODEL_NAME = "TalTechNLP/whisper-large-v3-turbo-et-subs"
# Passed as ``batch_size`` on every pipeline call (see do_transcribe).
BATCH_SIZE = 8
# NOTE(review): not referenced anywhere in the visible code.
FILE_LIMIT_MB = 1000
YT_LENGTH_LIMIT_S = 3600  # limit to 1 hour YouTube files

# Use CUDA device 0 when torch reports CUDA as available, otherwise the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

# Single shared speech-recognition pipeline, created once at import time.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)
def convert_to_vtt(whisper_output):
    """
    Convert Whisper ASR output to WebVTT subtitle text.

    Args:
        whisper_output (dict): Pipeline result containing a 'chunks' list. Each
            chunk provides a 'timestamp' pair ``(start, end)`` in seconds
            (either value may be ``None``) and a 'text' string.

    Returns:
        str: The ``WEBVTT`` header followed by one numbered cue per chunk,
        each cue terminated by a blank line.
    """

    def format_timestamp(seconds):
        """Convert seconds to VTT timestamp format (HH:MM:SS.mmm)."""
        if seconds is None:
            return "99:59:59.999"  # Use max time for None values
        # Work in whole milliseconds so rounding carries into the next unit
        # instead of producing an invalid "60.000" seconds field.
        total_ms = int(round(seconds * 1000))
        hours, rest = divmod(total_ms, 3_600_000)
        minutes, rest = divmod(rest, 60_000)
        secs, millis = divmod(rest, 1000)
        # WebVTT separates the fraction with '.'; ',' is the SRT convention.
        return f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}"

    # Start with the VTT header, then append one cue per chunk.
    pieces = ["WEBVTT\n\n"]
    for index, chunk in enumerate(whisper_output["chunks"], 1):
        start_time, end_time = chunk["timestamp"]
        cue_text = chunk["text"].strip()
        pieces.append(
            f"{index}\n"
            f"{format_timestamp(start_time)} --> {format_timestamp(end_time)}\n"
            f"{cue_text}\n\n"
        )
    return "".join(pieces)
def dynamic_gpu_duration(func, duration, *args):
    """Call ``func(*args)`` through a ``spaces.GPU`` wrapper built per call.

    Args:
        func: Callable to execute.
        duration: Value forwarded to ``spaces.GPU(duration=...)``.
        *args: Positional arguments passed to ``func``.

    Returns:
        Whatever the wrapped call returns.
    """

    def wrapped_func():
        return func(*args)

    # Equivalent to decorating wrapped_func with @spaces.GPU(duration=duration).
    gpu_decorator = spaces.GPU(duration=duration)
    return gpu_decorator(wrapped_func)()
@spaces.GPU
def dummy_gpu():
    """No-op function decorated with ``spaces.GPU``; always returns ``None``.

    NOTE(review): nothing in the visible code calls this. Presumably it exists
    so the module contains a statically decorated GPU function at import
    time - confirm before removing.
    """
    return
def do_transcribe(inputs):
    """Run the shared ASR pipeline on ``inputs`` and return VTT subtitles.

    Args:
        inputs: Audio input handed straight to ``pipe``; ``None`` is rejected.

    Returns:
        str: Subtitle text produced by ``convert_to_vtt``.

    Raises:
        gr.Error: If ``inputs`` is ``None``.
    """
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    # Transcription task with the language code "et"; timestamps are requested
    # because convert_to_vtt reads each chunk's 'timestamp'.
    generation_options = {"task": "transcribe", "language": "et"}
    asr_result = pipe(
        inputs,
        batch_size=BATCH_SIZE,
        generate_kwargs=generation_options,
        return_timestamps=True,
    )
    return convert_to_vtt(asr_result)
def transcribe(file_path):
    """Transcribe the audio file at ``file_path`` and return VTT subtitles.

    Args:
        file_path: Path of the audio file to transcribe, or ``None`` when
            nothing was submitted.

    Returns:
        str: Subtitle text returned by ``do_transcribe``.

    Raises:
        gr.Error: If ``file_path`` is ``None``.
    """
    # Reject a missing submission up front with a readable message instead of
    # failing later on the path.
    if file_path is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    # The GPU request is a fixed 59 units (reported to the user as seconds).
    # The file is forwarded to the pipeline as a path, so it is not read or
    # decoded here.
    expected_transcribe_duration = 59
    gr.Info(f"Starting to transcribe, requesting a GPU for {expected_transcribe_duration} seconds")
    return dynamic_gpu_duration(do_transcribe, expected_transcribe_duration, file_path)
def _return_yt_html_embed(yt_url):
video_id = yt_url.split("?v=")[-1]
HTML_str = (
f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
" </center>"
)
return HTML_str
def download_yt_audio(yt_url, filename):
    """Download the media behind ``yt_url`` to ``filename`` using yt-dlp.

    The video's metadata is fetched first so that videos longer than
    ``YT_LENGTH_LIMIT_S`` are rejected before anything is downloaded.

    Args:
        yt_url (str): URL of the YouTube video.
        filename (str): Output path, passed to yt-dlp as ``outtmpl``.

    Raises:
        gr.Error: If metadata extraction or the download fails, or if the
            video is longer than ``YT_LENGTH_LIMIT_S`` seconds.
    """
    info_loader = youtube_dl.YoutubeDL()
    try:
        info = info_loader.extract_info(yt_url, download=False)
    except youtube_dl.utils.DownloadError as err:
        raise gr.Error(str(err)) from err

    # "duration_string" is colon-separated (seconds, M:SS or H:MM:SS); fold
    # the fields left to right into a total number of seconds.
    file_length_s = 0
    for field in info["duration_string"].split(":"):
        file_length_s = file_length_s * 60 + int(field)

    if file_length_s > YT_LENGTH_LIMIT_S:
        yt_length_limit_hms = time.strftime("%HH:%MM:%SS", time.gmtime(YT_LENGTH_LIMIT_S))
        file_length_hms = time.strftime("%HH:%MM:%SS", time.gmtime(file_length_s))
        raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.")

    ydl_opts = {"outtmpl": filename, "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"}
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        try:
            ydl.download([yt_url])
        # Surface download failures the same way as metadata failures above.
        except (youtube_dl.utils.DownloadError, youtube_dl.utils.ExtractorError) as err:
            raise gr.Error(str(err)) from err
def yt_transcribe(yt_url, max_filesize=75.0):
    """Download a YouTube video and return VTT subtitles for it.

    Args:
        yt_url (str): URL of the YouTube video.
        max_filesize (float): Not used by this function; kept so the
            signature stays unchanged.

    Returns:
        str: Subtitle text returned by ``transcribe``.
    """
    # The download lives in a temporary directory that is removed on exit, so
    # transcription has to finish inside the ``with`` block.
    with tempfile.TemporaryDirectory() as tmpdirname:
        filepath = os.path.join(tmpdirname, "video.mp4")
        download_yt_audio(yt_url, filepath)
        # transcribe() takes only the file path.
        return transcribe(filepath)
# Top-level container; the three interfaces below are shown as tabs inside it.
demo = gr.Blocks(theme=gr.themes.Ocean())

# Tab 1: record from the microphone; the recording is passed on as a file path.
mf_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources="microphone", type="filepath")
    ],
    outputs=gr.Textbox(label="VTT subtitles", elem_id="text", show_label=True, show_copy_button=True, autoscroll=False, interactive=True),
    title="Generate Estonian subtitles",
    description=(
        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
        " of arbitrary length."
    ),
    allow_flagging="never",
)

# Tab 2: upload an existing audio file.
file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources="upload", type="filepath", label="Audio file")
    ],
    outputs=gr.Textbox(label="VTT subtitles", elem_id="text", show_label=True, show_copy_button=True, autoscroll=False, interactive=True),
    title="Generate Estonian subtitles",
    description=(
        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
        " of arbitrary length."
    ),
    allow_flagging="never",
)

# Tab 3: transcribe a YouTube video by URL.
# Note: this rebinds the module-level name ``yt_transcribe`` from the function
# to the Interface; ``fn=yt_transcribe`` is evaluated before the rebinding, so
# the Interface still wraps the function.
yt_transcribe = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL")
    ],
    outputs=gr.Textbox(label="VTT subtitles", elem_id="text", show_label=True, show_copy_button=True, autoscroll=False, interactive=True),
    title="Generate Estonian subtitles",
    description=(
        "Transcribe long-form YouTube videos with the click of a button! Demo uses the checkpoint"
        f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
        " arbitrary length. NB! YouTube seems to often block download requests from Huggingface and there is nothing we can do about it."
    ),
    allow_flagging="never",
)

with demo:
    gr.TabbedInterface([mf_transcribe, file_transcribe, yt_transcribe], ["Microphone", "Audio file", "YouTube"])

# Enable request queuing and start the app with server-side rendering disabled.
demo.queue().launch(ssr_mode=False)
|