rafaaa2105 committed · verified
Commit 9dc238a · 1 Parent(s): 0c33be9

Update app.py

Files changed (1)
  1. app.py +178 -112
app.py CHANGED
@@ -3,6 +3,7 @@ import moviepy.editor as mp
 from moviepy.video.tools.subtitles import SubtitlesClip
 from datetime import timedelta
 import os
+import logging
 from transformers import (
     AutoModelForSpeechSeq2Seq,
     AutoProcessor,
@@ -15,6 +16,17 @@ import numpy as np
 from pydub import AudioSegment
 import spaces

+# Set up logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.FileHandler('video_subtitler.log'),
+        logging.StreamHandler()
+    ]
+)
+logger = logging.getLogger(__name__)
+
 # Dictionary of supported languages and their codes for MarianMT
 LANGUAGE_CODES = {
     "English": "en",
@@ -31,6 +43,7 @@ LANGUAGE_CODES = {

 def get_model_name(source_lang, target_lang):
     """Get MarianMT model name for language pair"""
+    logger.info(f"Getting model name for translation from {source_lang} to {target_lang}")
     return f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"

 def format_timestamp(seconds):
@@ -45,51 +58,67 @@ def format_timestamp(seconds):
 def translate_text(text, source_lang, target_lang):
     """Translate text using MarianMT"""
     if source_lang == target_lang:
+        logger.info("Source and target languages are the same, skipping translation")
         return text

     try:
+        logger.info(f"Translating text from {source_lang} to {target_lang}")
         model_name = get_model_name(source_lang, target_lang)
+        logger.info(f"Loading translation model: {model_name}")
         tokenizer = MarianTokenizer.from_pretrained(model_name)
         model = MarianMTModel.from_pretrained(model_name)

+        logger.debug(f"Input text: {text}")
         inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
         translated = model.generate(**inputs)
         translated_text = tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
+        logger.debug(f"Translated text: {translated_text}")

         return translated_text
     except Exception as e:
-        print(f"Translation error: {e}")
+        logger.error(f"Translation error: {str(e)}", exc_info=True)
         return text

 def load_audio(video_path):
     """Extract and load audio from video file"""
-    video = mp.VideoFileClip(video_path)
-    temp_audio_path = "temp_audio.wav"
-    video.audio.write_audiofile(temp_audio_path)
-
-    # Load audio using pydub
-    audio = AudioSegment.from_wav(temp_audio_path)
-    audio_array = np.array(audio.get_array_of_samples())
-
-    # Convert to float32 and normalize
-    audio_array = audio_array.astype(np.float32) / np.iinfo(np.int16).max
-
-    # If stereo, convert to mono
-    if len(audio_array.shape) > 1:
-        audio_array = audio_array.mean(axis=1)
-
-    return audio_array, audio.frame_rate, video, temp_audio_path
+    logger.info(f"Loading audio from video: {video_path}")
+    try:
+        video = mp.VideoFileClip(video_path)
+        logger.info(f"Video loaded. Duration: {video.duration} seconds")
+
+        temp_audio_path = "temp_audio.wav"
+        logger.info(f"Extracting audio to temporary file: {temp_audio_path}")
+        video.audio.write_audiofile(temp_audio_path)
+
+        logger.info("Loading audio file with pydub")
+        audio = AudioSegment.from_wav(temp_audio_path)
+        audio_array = np.array(audio.get_array_of_samples())
+
+        logger.info("Converting audio to float32 and normalizing")
+        audio_array = audio_array.astype(np.float32) / np.iinfo(np.int16).max
+
+        if len(audio_array.shape) > 1:
+            logger.info("Converting stereo to mono")
+            audio_array = audio_array.mean(axis=1)
+
+        logger.info(f"Audio loaded successfully. Shape: {audio_array.shape}, Sample rate: {audio.frame_rate}")
+        return audio_array, audio.frame_rate, video, temp_audio_path
+    except Exception as e:
+        logger.error(f"Error loading audio: {str(e)}", exc_info=True)
+        raise

 def create_srt(segments, target_lang="en"):
     """Convert transcribed segments to SRT format with optional translation"""
+    logger.info(f"Creating SRT content for {len(segments)} segments")
     srt_content = ""
     for i, segment in enumerate(segments, start=1):
         start_time = format_timestamp(segment['start'])
         end_time = format_timestamp(segment['end'])
         text = segment['text'].strip()

-        # Translate if target language is different
+        logger.debug(f"Processing segment {i}: {start_time} --> {end_time}")
         if segment.get('language') and segment['language'] != target_lang:
+            logger.info(f"Translating segment {i}")
             text = translate_text(text, segment['language'], target_lang)

         srt_content += f"{i}\n{start_time} --> {end_time}\n{text}\n\n"
@@ -97,128 +126,164 @@ def create_srt(segments, target_lang="en"):

 def create_subtitle_clips(segments, videosize, target_lang="en"):
     """Create subtitle clips for moviepy with translation support"""
+    logger.info(f"Creating subtitle clips for {len(segments)} segments")
     subtitle_clips = []

-    for segment in segments:
+    for i, segment in enumerate(segments):
+        logger.debug(f"Processing subtitle clip {i}")
         start_time = segment['start']
         end_time = segment['end']
         duration = end_time - start_time
         text = segment['text'].strip()

-        # Translate if target language is different
         if segment.get('language') and segment['language'] != target_lang:
+            logger.info(f"Translating subtitle {i}")
             text = translate_text(text, segment['language'], target_lang)

-        text_clip = mp.TextClip(
-            text,
-            font='Arial',
-            fontsize=24,
-            color='white',
-            stroke_color='black',
-            stroke_width=1,
-            size=videosize,
-            method='caption'
-        ).set_position(('center', 'bottom'))
-
-        text_clip = text_clip.set_start(start_time).set_duration(duration)
-        subtitle_clips.append(text_clip)
+        try:
+            text_clip = mp.TextClip(
+                text,
+                font='Arial',
+                fontsize=24,
+                color='white',
+                stroke_color='black',
+                stroke_width=1,
+                size=videosize,
+                method='caption'
+            ).set_position(('center', 'bottom'))
+
+            text_clip = text_clip.set_start(start_time).set_duration(duration)
+            subtitle_clips.append(text_clip)
+        except Exception as e:
+            logger.error(f"Error creating subtitle clip {i}: {str(e)}", exc_info=True)

     return subtitle_clips

 @spaces.GPU
 def process_video(video_path, target_lang="en"):
     """Main function to process video and add subtitles with translation"""
-    # Load CrisperWhisper model and processor
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    model_id = "nyrahealth/CrisperWhisper"
+    logger.info(f"Starting video processing: {video_path}")

-    model = AutoModelForSpeechSeq2Seq.from_pretrained(
-        model_id,
-        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
-        low_cpu_mem_usage=True,
-        use_safetensors=True
-    ).to(device)
-
-    processor = AutoProcessor.from_pretrained(model_id)
-
-    # Load audio and video
-    audio_array, sampling_rate, video, temp_audio_path = load_audio(video_path)
-
-    # Create pipeline
-    pipe = pipeline(
-        "automatic-speech-recognition",
-        model=model,
-        tokenizer=processor.tokenizer,
-        feature_extractor=processor.feature_extractor,
-        max_new_tokens=128,
-        chunk_length_s=30,
-        batch_size=16,
-        return_timestamps=True,
-        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
-        device=device,
-    )
-
-    # Transcribe audio
-    result = pipe(audio_array, return_timestamps="word")
-
-    # Convert word-level timestamps to segments
-    segments = []
-    current_segment = {"text": "", "start": result["chunks"][0]["timestamp"][0]}
-
-    for chunk in result["chunks"]:
-        current_segment["text"] += " " + chunk["text"]
-        current_segment["end"] = chunk["timestamp"][1]
+    try:
+        # Set up device
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        logger.info(f"Using device: {device}")
+
+        # Load CrisperWhisper model
+        model_id = "nyrahealth/CrisperWhisper"
+        logger.info(f"Loading CrisperWhisper model: {model_id}")
+
+        model = AutoModelForSpeechSeq2Seq.from_pretrained(
+            model_id,
+            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
+            low_cpu_mem_usage=True,
+            use_safetensors=True
+        ).to(device)
+
+        logger.info("Loading processor")
+        processor = AutoProcessor.from_pretrained(model_id)
+
+        # Load audio and video
+        logger.info("Loading audio from video")
+        audio_array, sampling_rate, video, temp_audio_path = load_audio(video_path)
+
+        # Create pipeline
+        logger.info("Creating ASR pipeline")
+        pipe = pipeline(
+            "automatic-speech-recognition",
+            model=model,
+            tokenizer=processor.tokenizer,
+            feature_extractor=processor.feature_extractor,
+            max_new_tokens=128,
+            chunk_length_s=30,
+            batch_size=16,
+            return_timestamps=True,
+            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
+            device=device,
+        )

-        # Start new segment if text is long enough or enough time has passed
-        if len(current_segment["text"].split()) > 10 or \
-            (current_segment["end"] - current_segment["start"]) > 5.0:
+        # Transcribe audio
+        logger.info("Starting transcription")
+        result = pipe(audio_array, return_timestamps="word")
+        logger.info("Transcription completed")
+        logger.debug(f"Transcription result: {result}")
+
+        # Convert word-level timestamps to segments
+        logger.info("Converting word-level timestamps to segments")
+        segments = []
+        current_segment = {"text": "", "start": result["chunks"][0]["timestamp"][0]}
+
+        for chunk in result["chunks"]:
+            current_segment["text"] += " " + chunk["text"]
+            current_segment["end"] = chunk["timestamp"][1]
+
+            if len(current_segment["text"].split()) > 10 or \
+                (current_segment["end"] - current_segment["start"]) > 5.0:
+                segments.append(current_segment)
+                if chunk != result["chunks"][-1]:
+                    current_segment = {"text": "", "start": chunk["timestamp"][1]}
+
+        if current_segment["text"]:
             segments.append(current_segment)
-            if chunk != result["chunks"][-1]: # If not the last chunk
-                current_segment = {"text": "", "start": chunk["timestamp"][1]}
-
-    # Add last segment if not empty
-    if current_segment["text"]:
-        segments.append(current_segment)
-
-    # Add detected language to segments
-    detected_language = "en" # CrisperWhisper is English-focused
-    for segment in segments:
-        segment['language'] = detected_language
-
-    # Create SRT content
-    srt_content = create_srt(segments, target_lang)
-
-    # Save SRT file
-    video_name = os.path.splitext(os.path.basename(video_path))[0]
-    srt_path = f"{video_name}_subtitles_{target_lang}.srt"
-    with open(srt_path, "w", encoding="utf-8") as f:
-        f.write(srt_content)
-
-    # Create subtitle clips
-    subtitle_clips = create_subtitle_clips(segments, video.size, target_lang)
-
-    # Combine video with subtitles
-    final_video = mp.CompositeVideoClip([video] + subtitle_clips)
-
-    # Save final video
-    output_video_path = f"{video_name}_with_subtitles_{target_lang}.mp4"
-    final_video.write_videofile(output_video_path)
-
-    # Clean up
-    os.remove(temp_audio_path)
-    video.close()
-    final_video.close()
-
-    return output_video_path, srt_path
+
+        logger.info(f"Created {len(segments)} segments")
+
+        # Add detected language
+        detected_language = "en"
+        for segment in segments:
+            segment['language'] = detected_language
+
+        # Create SRT content
+        logger.info("Creating SRT content")
+        srt_content = create_srt(segments, target_lang)
+
+        # Save SRT file
+        video_name = os.path.splitext(os.path.basename(video_path))[0]
+        srt_path = f"{video_name}_subtitles_{target_lang}.srt"
+        logger.info(f"Saving SRT file: {srt_path}")
+        with open(srt_path, "w", encoding="utf-8") as f:
+            f.write(srt_content)
+
+        # Create subtitle clips
+        logger.info("Creating subtitle clips")
+        subtitle_clips = create_subtitle_clips(segments, video.size, target_lang)
+
+        # Combine video with subtitles
+        logger.info("Combining video with subtitles")
+        final_video = mp.CompositeVideoClip([video] + subtitle_clips)
+
+        # Save final video
+        output_video_path = f"{video_name}_with_subtitles_{target_lang}.mp4"
+        logger.info(f"Saving final video: {output_video_path}")
+        final_video.write_videofile(output_video_path)
+
+        # Clean up
+        logger.info("Cleaning up temporary files")
+        os.remove(temp_audio_path)
+        video.close()
+        final_video.close()
+
+        logger.info("Video processing completed successfully")
+        return output_video_path, srt_path
+
+    except Exception as e:
+        logger.error(f"Error in video processing: {str(e)}", exc_info=True)
+        raise

 def gradio_interface(video_file, target_language):
     """Gradio interface function with language selection"""
     try:
+        logger.info(f"Processing new video request: {video_file.name}")
+        logger.info(f"Target language: {target_language}")
+
         video_path = video_file.name
         target_lang = LANGUAGE_CODES[target_language]
         output_video, srt_file = process_video(video_path, target_lang)
+
+        logger.info("Processing completed successfully")
        return output_video, srt_file
     except Exception as e:
+        logger.error(f"Error in Gradio interface: {str(e)}", exc_info=True)
         return str(e), None

 # Create Gradio interface
@@ -241,4 +306,5 @@ iface = gr.Interface(
 )

 if __name__ == "__main__":
+    logger.info("Starting Video Subtitler application")
     iface.launch()