rafaaa2105 committed
Commit 63a13ce · verified · 1 Parent(s): 3fb91e0

Update app.py

Files changed (1): app.py (+89 -25)
app.py CHANGED
@@ -1,11 +1,18 @@
 import gradio as gr
-import whisper
 import moviepy.editor as mp
 from moviepy.video.tools.subtitles import SubtitlesClip
 from datetime import timedelta
 import os
-from transformers import MarianMTModel, MarianTokenizer
+from transformers import (
+    AutoModelForSpeechSeq2Seq,
+    AutoProcessor,
+    MarianMTModel,
+    MarianTokenizer,
+    pipeline
+)
 import torch
+import numpy as np
+from pydub import AudioSegment
 import spaces
 
 # Dictionary of supported languages and their codes for MarianMT
@@ -22,8 +29,8 @@ LANGUAGE_CODES = {
     "Korean": "ko"
 }
 
-# Mapping of language pairs to MarianMT model names
 def get_model_name(source_lang, target_lang):
+    """Get MarianMT model name for language pair"""
     return f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
 
 def format_timestamp(seconds):
@@ -45,7 +52,6 @@ def translate_text(text, source_lang, target_lang):
         tokenizer = MarianTokenizer.from_pretrained(model_name)
         model = MarianMTModel.from_pretrained(model_name)
 
-        # Tokenize and translate
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
         translated = model.generate(**inputs)
         translated_text = tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
@@ -53,10 +59,29 @@ def translate_text(text, source_lang, target_lang):
         return translated_text
     except Exception as e:
         print(f"Translation error: {e}")
-        return text # Return original text if translation fails
+        return text
+
+def load_audio(video_path):
+    """Extract and load audio from video file"""
+    video = mp.VideoFileClip(video_path)
+    temp_audio_path = "temp_audio.wav"
+    video.audio.write_audiofile(temp_audio_path)
+
+    # Load audio using pydub
+    audio = AudioSegment.from_wav(temp_audio_path)
+    audio_array = np.array(audio.get_array_of_samples())
+
+    # Convert to float32 and normalize
+    audio_array = audio_array.astype(np.float32) / np.iinfo(np.int16).max
+
+    # If stereo, convert to mono
+    if len(audio_array.shape) > 1:
+        audio_array = audio_array.mean(axis=1)
+
+    return audio_array, audio.frame_rate, video, temp_audio_path
 
 def create_srt(segments, target_lang="en"):
-    """Convert whisper segments to SRT format with optional translation"""
+    """Convert transcribed segments to SRT format with optional translation"""
     srt_content = ""
     for i, segment in enumerate(segments, start=1):
         start_time = format_timestamp(segment['start'])
@@ -64,7 +89,7 @@ def create_srt(segments, target_lang="en"):
         text = segment['text'].strip()
 
         # Translate if target language is different
-        if 'language' in segment and segment['language'] != target_lang:
+        if segment.get('language') and segment['language'] != target_lang:
             text = translate_text(text, segment['language'], target_lang)
 
         srt_content += f"{i}\n{start_time} --> {end_time}\n{text}\n\n"
@@ -81,7 +106,7 @@ def create_subtitle_clips(segments, videosize, target_lang="en"):
         text = segment['text'].strip()
 
         # Translate if target language is different
-        if 'language' in segment and segment['language'] != target_lang:
+        if segment.get('language') and segment['language'] != target_lang:
             text = translate_text(text, segment['language'], target_lang)
 
         text_clip = mp.TextClip(
@@ -103,26 +128,65 @@ def create_subtitle_clips(segments, videosize, target_lang="en"):
 @spaces.GPU
 def process_video(video_path, target_lang="en"):
     """Main function to process video and add subtitles with translation"""
-    # Load Whisper model
-    model = whisper.load_model("base")
+    # Load CrisperWhisper model and processor
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model_id = "nyrahealth/CrisperWhisper"
 
-    # Extract audio from video
-    video = mp.VideoFileClip(video_path)
-    audio = video.audio
+    model = AutoModelForSpeechSeq2Seq.from_pretrained(
+        model_id,
+        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
+        low_cpu_mem_usage=True,
+        use_safetensors=True
+    ).to(device)
 
-    # Save audio temporarily
-    temp_audio_path = "temp_audio.wav"
-    audio.write_audiofile(temp_audio_path)
+    processor = AutoProcessor.from_pretrained(model_id)
+
+    # Load audio and video
+    audio_array, sampling_rate, video, temp_audio_path = load_audio(video_path)
+
+    # Create pipeline
+    pipe = pipeline(
+        "automatic-speech-recognition",
+        model=model,
+        tokenizer=processor.tokenizer,
+        feature_extractor=processor.feature_extractor,
+        max_new_tokens=128,
+        chunk_length_s=30,
+        batch_size=16,
+        return_timestamps=True,
+        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
+        device=device,
+    )
 
     # Transcribe audio
-    result = model.transcribe(temp_audio_path)
+    result = pipe(audio_array, return_timestamps="word")
+
+    # Convert word-level timestamps to segments
+    segments = []
+    current_segment = {"text": "", "start": result["chunks"][0]["timestamp"][0]}
+
+    for chunk in result["chunks"]:
+        current_segment["text"] += " " + chunk["text"]
+        current_segment["end"] = chunk["timestamp"][1]
+
+        # Start new segment if text is long enough or enough time has passed
+        if len(current_segment["text"].split()) > 10 or \
+           (current_segment["end"] - current_segment["start"]) > 5.0:
+            segments.append(current_segment)
+            if chunk != result["chunks"][-1]: # If not the last chunk
+                current_segment = {"text": "", "start": chunk["timestamp"][1]}
+
+    # Add last segment if not empty
+    if current_segment["text"]:
+        segments.append(current_segment)
 
     # Add detected language to segments
-    for segment in result["segments"]:
-        segment['language'] = result["language"]
+    detected_language = "en" # CrisperWhisper is English-focused
+    for segment in segments:
+        segment['language'] = detected_language
 
-    # Create SRT content with translation
-    srt_content = create_srt(result["segments"], target_lang)
+    # Create SRT content
+    srt_content = create_srt(segments, target_lang)
 
     # Save SRT file
     video_name = os.path.splitext(os.path.basename(video_path))[0]
@@ -130,8 +194,8 @@ def process_video(video_path, target_lang="en"):
     with open(srt_path, "w", encoding="utf-8") as f:
         f.write(srt_content)
 
-    # Create subtitle clips with translation
-    subtitle_clips = create_subtitle_clips(result["segments"], video.size, target_lang)
+    # Create subtitle clips
+    subtitle_clips = create_subtitle_clips(segments, video.size, target_lang)
 
     # Combine video with subtitles
     final_video = mp.CompositeVideoClip([video] + subtitle_clips)
@@ -172,8 +236,8 @@ iface = gr.Interface(
         gr.Video(label="Video with Subtitles"),
         gr.File(label="SRT Subtitle File")
     ],
-    title="Video Subtitler with Translation",
-    description="Upload a video to generate subtitles, translate them to your chosen language, and embed them directly in the video."
+    title="Video Subtitler with CrisperWhisper",
+    description="Upload a video to generate subtitles using CrisperWhisper, translate them to your chosen language, and embed them directly in the video."
 )
 
 if __name__ == "__main__":
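
Reviewer note on the new load_audio() path: pydub's get_array_of_samples() returns a flat, interleaved array even for stereo input, so the len(audio_array.shape) > 1 check never fires, and process_video() hands the pipeline a bare array, which transformers treats as already being at the model's sampling rate (16 kHz for Whisper-family models) even though moviepy writes the WAV at the source rate (typically 44.1 or 48 kHz). A minimal sketch of both fixes; the helper name to_mono_float32 is ours, and it assumes the pipeline's dict input form ({"raw": ..., "sampling_rate": ...}), which recent transformers versions accept and resample via torchaudio:

from pydub import AudioSegment
import numpy as np

def to_mono_float32(audio: AudioSegment) -> np.ndarray:
    """Convert a pydub AudioSegment to a mono float32 array in [-1, 1]."""
    samples = np.array(audio.get_array_of_samples(), dtype=np.float32)
    if audio.channels > 1:
        # pydub interleaves channels (L, R, L, R, ...): reshape, then average
        samples = samples.reshape((-1, audio.channels)).mean(axis=1)
    # Scale by the sample width (2 bytes -> int16 -> divide by 32768)
    return samples / np.float32(1 << (8 * audio.sample_width - 1))

# Usage with the pipeline built in process_video(); passing the real rate
# lets the pipeline resample instead of misreading 44.1 kHz audio as 16 kHz:
#   audio = AudioSegment.from_wav(temp_audio_path)
#   result = pipe({"raw": to_mono_float32(audio), "sampling_rate": audio.frame_rate},
#                 return_timestamps="word")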
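
One edge case in the word-to-segment grouping: the buffer is only reset for non-final chunks, so when the last chunk itself triggers a flush, the trailing "add last segment" check appends the same segment twice. A self-contained sketch of the same grouping with an unconditional reset, runnable on made-up chunks shaped like the pipeline's return_timestamps="word" output (the thresholds mirror the diff's 10-word / 5-second limits):

def group_chunks(chunks, max_words=10, max_seconds=5.0):
    """Group word-level ASR chunks into subtitle-sized segments."""
    segments = []
    current = None
    for chunk in chunks:
        start, end = chunk["timestamp"]
        if current is None:
            current = {"text": "", "start": start}
        current["text"] += " " + chunk["text"]
        current["end"] = end
        # Flush once the segment is long enough in words or duration
        if len(current["text"].split()) > max_words or (end - current["start"]) > max_seconds:
            current["text"] = current["text"].strip()
            segments.append(current)
            current = None  # always reset, so the final segment is never double-appended
    if current is not None and current["text"].strip():
        current["text"] = current["text"].strip()
        segments.append(current)
    return segments

if __name__ == "__main__":
    # Hypothetical word chunks for a quick local check
    words = "testing that the grouping flushes on the word limit and never duplicates the final segment".split()
    chunks = [{"text": w, "timestamp": (i * 0.4, i * 0.4 + 0.35)} for i, w in enumerate(words)]
    for seg in group_chunks(chunks):
        print(f"{seg['start']:.2f} --> {seg['end']:.2f}: {seg['text']}")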