Tanel committed on
Commit
3d0dfdb
·
verified ·
1 Parent(s): 29f2174

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -4
app.py CHANGED
@@ -24,13 +24,47 @@ pipe = pipeline(
24
  )
25
 
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  @spaces.GPU
28
  def transcribe(inputs):
29
  if inputs is None:
30
  raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
31
 
32
- text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": "transcribe", "language": "et"}, return_timestamps=True)["text"]
33
- return text
34
 
35
 
36
  def _return_yt_html_embed(yt_url):
@@ -85,7 +119,8 @@ def yt_transcribe(yt_url, max_filesize=75.0):
85
  inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
86
  inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
87
 
88
- text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": "transcribe", "language": "et"}, return_timestamps=True)["text"]
 
89
 
90
  return html_embed_str, text
91
 
@@ -132,7 +167,7 @@ yt_transcribe = gr.Interface(
132
  description=(
133
  "Transcribe long-form YouTube videos with the click of a button! Demo uses the checkpoint"
134
  f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
135
- " arbitrary length."
136
  ),
137
  allow_flagging="never",
138
  )
 
24
  )
25
 
26
 
27
def convert_to_vtt(whisper_output):
    """
    Convert Whisper ASR output to VTT subtitle format.

    Args:
        whisper_output (dict): Dictionary containing Whisper ASR output. Only the
            'chunks' entry is read; each chunk must provide a 'timestamp' pair
            (start, end) in seconds and a 'text' string.

    Returns:
        str: VTT formatted subtitles as a string. The output always starts with the
            "WEBVTT" header; each chunk becomes one numbered cue.

    Raises:
        KeyError: If 'chunks' (or a chunk's 'timestamp' / 'text') is missing.
    """
    def format_timestamp(seconds):
        """Convert seconds to VTT timestamp format (HH:MM:SS.mmm)"""
        if seconds is None:
            return "99:59:59.999"  # Use max time for None values

        # Round to whole milliseconds *before* splitting into fields, so that a
        # value such as 59.9996 carries into the minutes instead of being
        # rendered as an invalid "60.000" seconds field.
        total_ms = int(round(seconds * 1000))
        hours, remainder_ms = divmod(total_ms, 3600 * 1000)
        minutes, remainder_ms = divmod(remainder_ms, 60 * 1000)
        whole_seconds, millis = divmod(remainder_ms, 1000)
        # WebVTT uses '.' as the fractional separator (a ',' is the SRT convention).
        return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d}.{millis:03d}"

    # Start with VTT header, then append one entry per chunk
    parts = ["WEBVTT\n\n"]

    for i, chunk in enumerate(whisper_output['chunks'], 1):
        start_time, end_time = chunk['timestamp']

        # Cue identifier, timing line, cue text, then a blank line as separator
        parts.append(
            f"{i}\n"
            f"{format_timestamp(start_time)} --> {format_timestamp(end_time)}\n"
            f"{chunk['text'].strip()}\n\n"
        )

    return "".join(parts)
60
+
61
  @spaces.GPU
62
  def transcribe(inputs):
63
  if inputs is None:
64
  raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
65
 
66
+ result = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": "transcribe", "language": "et"}, return_timestamps=True)
67
+ return convert_to_vtt(result)
68
 
69
 
70
  def _return_yt_html_embed(yt_url):
 
119
  inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
120
  inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
121
 
122
+ result = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": "transcribe", "language": "et"}, return_timestamps=True)
123
+ text = convert_to_vtt(result)
124
 
125
  return html_embed_str, text
126
 
 
167
  description=(
168
  "Transcribe long-form YouTube videos with the click of a button! Demo uses the checkpoint"
169
  f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
170
+ " arbitrary length. NB! YouTube seems to often block download requests from Huggingface and there is nothing we can do about it."
171
  ),
172
  allow_flagging="never",
173
  )