whisper-large-v3-et-subs

Running on Zero

App Files Files Community

Tanel committed on Nov 25, 2024

Commit

3d0dfdb

verified ·

1 Parent(s): 29f2174

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -4

app.py CHANGED Viewed

@@ -24,13 +24,47 @@ pipe = pipeline(
 )
 @spaces.GPU
 def transcribe(inputs):
     if inputs is None:
         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
-    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": "transcribe", "language": "et"}, return_timestamps=True)["text"]
-    return  text
 def _return_yt_html_embed(yt_url):
@@ -85,7 +119,8 @@ def yt_transcribe(yt_url, max_filesize=75.0):
     inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
     inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
-    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": "transcribe",  "language": "et"}, return_timestamps=True)["text"]
     return html_embed_str, text
@@ -132,7 +167,7 @@ yt_transcribe = gr.Interface(
     description=(
         "Transcribe long-form YouTube videos with the click of a button! Demo uses the checkpoint"
         f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
-        " arbitrary length."
     ),
     allow_flagging="never",
 )

 )
def convert_to_vtt(whisper_output):
    """
    Convert Whisper ASR output to WebVTT subtitle format.

    Args:
        whisper_output (dict): Dictionary containing Whisper ASR output. Its
            'chunks' entry is iterated; every chunk must provide a 'timestamp'
            (start, end) pair in seconds (either value may be None) and a
            'text' string.
    Returns:
        str: VTT formatted subtitles as a string: the "WEBVTT" header followed
            by one numbered cue per chunk.
    Raises:
        KeyError: If 'chunks' (or a chunk's 'timestamp' / 'text') is missing.
    """
    def format_timestamp(seconds):
        """Convert seconds to VTT timestamp format (HH:MM:SS.mmm)"""
        if seconds is None:
            return "99:59:59.999"  # Use max time for None values
        # Work in whole milliseconds so that rounding carries into the
        # minutes/hours fields instead of printing "60.000" seconds.
        total_ms = int(round(seconds * 1000))
        hours, remainder_ms = divmod(total_ms, 3600000)
        minutes, remainder_ms = divmod(remainder_ms, 60000)
        whole_seconds, millis = divmod(remainder_ms, 1000)
        # WebVTT separates the fraction with a period; a comma is SRT syntax
        # and makes the cue timing line invalid under a WEBVTT header.
        return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d}.{millis:03d}"

    # Start with VTT header, then append one cue per chunk (numbered from 1)
    parts = ["WEBVTT\n\n"]
    for i, chunk in enumerate(whisper_output['chunks'], 1):
        start_time, end_time = chunk['timestamp']
        parts.append(
            f"{i}\n"
            f"{format_timestamp(start_time)} --> {format_timestamp(end_time)}\n"
            f"{chunk['text'].strip()}\n\n"
        )
    return "".join(parts)
 # Transcribe the submitted audio with the module-level `pipe` (task "transcribe",
 # language fixed to "et") and return the result rendered as VTT subtitle text
 # by convert_to_vtt. Raises gr.Error when no input was submitted.
 @spaces.GPU
 def transcribe(inputs):
     # Guard clause: fail with a user-facing error before calling the pipeline.
     if inputs is None:
         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
     # The whole pipeline result is kept (not just its "text" entry) because
     # convert_to_vtt reads a timestamp pair and text from every chunk in it.
+    result = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": "transcribe", "language": "et"}, return_timestamps=True)
+    return convert_to_vtt(result)
 def _return_yt_html_embed(yt_url):
     inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
     inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
+    result = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": "transcribe",  "language": "et"}, return_timestamps=True)
+    text = convert_to_vtt(result)
     return html_embed_str, text
     description=(
         "Transcribe long-form YouTube videos with the click of a button! Demo uses the checkpoint"
         f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
+        " arbitrary length. NB! YouTube seems to often block download requests from Huggingface and there is nothing we can do about it."
     ),
     allow_flagging="never",
 )