Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -24,13 +24,47 @@ pipe = pipeline(
|
|
24 |
)
|
25 |
|
26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
@spaces.GPU
|
28 |
def transcribe(inputs):
|
29 |
if inputs is None:
|
30 |
raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
|
31 |
|
32 |
-
|
33 |
-
return
|
34 |
|
35 |
|
36 |
def _return_yt_html_embed(yt_url):
|
@@ -85,7 +119,8 @@ def yt_transcribe(yt_url, max_filesize=75.0):
|
|
85 |
inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
|
86 |
inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
|
87 |
|
88 |
-
|
|
|
89 |
|
90 |
return html_embed_str, text
|
91 |
|
@@ -132,7 +167,7 @@ yt_transcribe = gr.Interface(
|
|
132 |
description=(
|
133 |
"Transcribe long-form YouTube videos with the click of a button! Demo uses the checkpoint"
|
134 |
f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
|
135 |
-
" arbitrary length."
|
136 |
),
|
137 |
allow_flagging="never",
|
138 |
)
|
|
|
24 |
)
|
25 |
|
26 |
|
27 |
+
def convert_to_vtt(whisper_output):
    """
    Convert Whisper ASR output to VTT subtitle format.

    Args:
        whisper_output (dict): Dictionary containing Whisper ASR output. Only the
            'chunks' list is read; every chunk must provide a 'timestamp' pair
            ``(start, end)`` in seconds and a 'text' string.

    Returns:
        str: VTT formatted subtitles as a string. The output always starts with
        the ``WEBVTT`` header, followed by one numbered cue per chunk.

    Raises:
        KeyError: If 'chunks', or a chunk's 'timestamp' / 'text', is missing.
    """
    def format_timestamp(seconds):
        """Convert seconds to VTT timestamp format (HH:MM:SS.mmm)"""
        if seconds is None:
            return "99:59:59.999"  # Use max time for None values

        # Round once to whole milliseconds and split with integer arithmetic so
        # that a value such as 59.9996 carries into the minutes field instead
        # of being rendered as the invalid "00:00:60.000".
        total_ms = int(round(seconds * 1000))
        hours, remainder_ms = divmod(total_ms, 3_600_000)
        minutes, remainder_ms = divmod(remainder_ms, 60_000)
        secs, millis = divmod(remainder_ms, 1000)

        # WebVTT requires a period before the milliseconds; a comma is the
        # SubRip (.srt) convention and makes the cue timing line invalid.
        return f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}"

    # Start with VTT header
    parts = ["WEBVTT\n\n"]

    # Process each chunk; cue identifiers are 1-based
    for i, chunk in enumerate(whisper_output['chunks'], 1):
        start_time, end_time = chunk['timestamp']

        # Format the subtitle entry: identifier, timing line, text, blank line
        parts.append(
            f"{i}\n"
            f"{format_timestamp(start_time)} --> {format_timestamp(end_time)}\n"
            f"{chunk['text'].strip()}\n\n"
        )

    return "".join(parts)
|
60 |
+
|
61 |
@spaces.GPU
def transcribe(inputs):
    """Run the ASR pipeline on the submitted audio and return it as VTT text.

    Args:
        inputs: The audio input handed to ``pipe``; ``None`` means nothing was
            submitted.

    Returns:
        The string produced by ``convert_to_vtt`` for the pipeline result.

    Raises:
        gr.Error: If ``inputs`` is ``None``.
    """
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    # Timestamps are requested because the VTT conversion reads the per-chunk timings.
    decoding_options = {"task": "transcribe", "language": "et"}
    asr_output = pipe(
        inputs,
        batch_size=BATCH_SIZE,
        generate_kwargs=decoding_options,
        return_timestamps=True,
    )
    return convert_to_vtt(asr_output)
|
68 |
|
69 |
|
70 |
def _return_yt_html_embed(yt_url):
|
|
|
119 |
inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
|
120 |
inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
|
121 |
|
122 |
+
result = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": "transcribe", "language": "et"}, return_timestamps=True)
|
123 |
+
text = convert_to_vtt(result)
|
124 |
|
125 |
return html_embed_str, text
|
126 |
|
|
|
167 |
description=(
|
168 |
"Transcribe long-form YouTube videos with the click of a button! Demo uses the checkpoint"
|
169 |
f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
|
170 |
+
" arbitrary length. NB! YouTube seems to often block download requests from Huggingface and there is nothing we can do about it."
|
171 |
),
|
172 |
allow_flagging="never",
|
173 |
)
|