Spaces:

rafaaa2105
/

subtitles-translation

Running on Zero

App Files Files Community

rafaaa2105 committed 25 days ago

Commit

63a13ce

verified ·

1 Parent(s): 3fb91e0

Update app.py

Browse files

Files changed (1) hide show

app.py +89 -25

app.py CHANGED Viewed

@@ -1,11 +1,18 @@
 import gradio as gr
-import whisper
 import moviepy.editor as mp
 from moviepy.video.tools.subtitles import SubtitlesClip
 from datetime import timedelta
 import os
-from transformers import MarianMTModel, MarianTokenizer
 import torch
 import spaces
 # Dictionary of supported languages and their codes for MarianMT
@@ -22,8 +29,8 @@ LANGUAGE_CODES = {
     "Korean": "ko"
 }
-# Mapping of language pairs to MarianMT model names
 def get_model_name(source_lang, target_lang):
     return f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
 def format_timestamp(seconds):
@@ -45,7 +52,6 @@ def translate_text(text, source_lang, target_lang):
         tokenizer = MarianTokenizer.from_pretrained(model_name)
         model = MarianMTModel.from_pretrained(model_name)
-        # Tokenize and translate
         inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
         translated = model.generate(**inputs)
         translated_text = tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
@@ -53,10 +59,29 @@ def translate_text(text, source_lang, target_lang):
         return translated_text
     except Exception as e:
         print(f"Translation error: {e}")
-        return text  # Return original text if translation fails
 def create_srt(segments, target_lang="en"):
-    """Convert whisper segments to SRT format with optional translation"""
     srt_content = ""
     for i, segment in enumerate(segments, start=1):
         start_time = format_timestamp(segment['start'])
@@ -64,7 +89,7 @@ def create_srt(segments, target_lang="en"):
         text = segment['text'].strip()
         # Translate if target language is different
-        if 'language' in segment and segment['language'] != target_lang:
             text = translate_text(text, segment['language'], target_lang)
         srt_content += f"{i}\n{start_time} --> {end_time}\n{text}\n\n"
@@ -81,7 +106,7 @@ def create_subtitle_clips(segments, videosize, target_lang="en"):
         text = segment['text'].strip()
         # Translate if target language is different
-        if 'language' in segment and segment['language'] != target_lang:
             text = translate_text(text, segment['language'], target_lang)
         text_clip = mp.TextClip(
@@ -103,26 +128,65 @@ def create_subtitle_clips(segments, videosize, target_lang="en"):
 @spaces.GPU
 def process_video(video_path, target_lang="en"):
     """Main function to process video and add subtitles with translation"""
-    # Load Whisper model
-    model = whisper.load_model("base")
-    # Extract audio from video
-    video = mp.VideoFileClip(video_path)
-    audio = video.audio
-    # Save audio temporarily
-    temp_audio_path = "temp_audio.wav"
-    audio.write_audiofile(temp_audio_path)
     # Transcribe audio
-    result = model.transcribe(temp_audio_path)
     # Add detected language to segments
-    for segment in result["segments"]:
-        segment['language'] = result["language"]
-    # Create SRT content with translation
-    srt_content = create_srt(result["segments"], target_lang)
     # Save SRT file
     video_name = os.path.splitext(os.path.basename(video_path))[0]
@@ -130,8 +194,8 @@ def process_video(video_path, target_lang="en"):
     with open(srt_path, "w", encoding="utf-8") as f:
         f.write(srt_content)
-    # Create subtitle clips with translation
-    subtitle_clips = create_subtitle_clips(result["segments"], video.size, target_lang)
     # Combine video with subtitles
     final_video = mp.CompositeVideoClip([video] + subtitle_clips)
@@ -172,8 +236,8 @@ iface = gr.Interface(
         gr.Video(label="Video with Subtitles"),
         gr.File(label="SRT Subtitle File")
     ],
-    title="Video Subtitler with Translation",
-    description="Upload a video to generate subtitles, translate them to your chosen language, and embed them directly in the video."
 )
 if __name__ == "__main__":

 import gradio as gr
 import moviepy.editor as mp
 from moviepy.video.tools.subtitles import SubtitlesClip
 from datetime import timedelta
 import os
+from transformers import (
+    AutoModelForSpeechSeq2Seq,
+    AutoProcessor,
+    MarianMTModel,
+    MarianTokenizer,
+    pipeline
+)
 import torch
+import numpy as np
+from pydub import AudioSegment
 import spaces
 # Dictionary of supported languages and their codes for MarianMT
     "Korean": "ko"
 }
 def get_model_name(source_lang, target_lang):
+    """Get MarianMT model name for language pair"""
     return f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
 def format_timestamp(seconds):
         tokenizer = MarianTokenizer.from_pretrained(model_name)
         model = MarianMTModel.from_pretrained(model_name)
         inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
         translated = model.generate(**inputs)
         translated_text = tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
         return translated_text
     except Exception as e:
         print(f"Translation error: {e}")
+        return text
+def load_audio(video_path):
+    """Extract the audio track of a video and load it as mono float32 samples.
+
+    Writes the audio track to "temp_audio.wav" in the working directory and
+    reads it back with pydub.
+
+    Args:
+        video_path: Path of the video file to open.
+
+    Returns:
+        Tuple of (mono sample array as float32, sample rate in Hz reported by
+        pydub, the still-open VideoFileClip, path of the temporary WAV file).
+
+    Raises:
+        ValueError: If the video has no audio track.
+    """
+    video = mp.VideoFileClip(video_path)
+    if video.audio is None:
+        # Fail with a clear message instead of an AttributeError on None
+        video.close()
+        raise ValueError(f"No audio track found in video: {video_path}")
+    temp_audio_path = "temp_audio.wav"
+    video.audio.write_audiofile(temp_audio_path)
+    # Load audio using pydub
+    audio = AudioSegment.from_wav(temp_audio_path)
+    audio_array = np.array(audio.get_array_of_samples())
+    # Convert to float32 and normalize (assumes 16-bit samples — TODO confirm
+    # write_audiofile's output sample width)
+    audio_array = audio_array.astype(np.float32) / np.iinfo(np.int16).max
+    # pydub returns multi-channel samples interleaved in one flat array, so the
+    # array is always 1-D; de-interleave by channel count and average to mono
+    if audio.channels > 1:
+        audio_array = audio_array.reshape(-1, audio.channels).mean(axis=1)
+    return audio_array, audio.frame_rate, video, temp_audio_path
 def create_srt(segments, target_lang="en"):
+    """Convert transcribed segments to SRT format with optional translation"""
     srt_content = ""
     for i, segment in enumerate(segments, start=1):
         start_time = format_timestamp(segment['start'])
         text = segment['text'].strip()
         # Translate if target language is different
+        if segment.get('language') and segment['language'] != target_lang:
             text = translate_text(text, segment['language'], target_lang)
         srt_content += f"{i}\n{start_time} --> {end_time}\n{text}\n\n"
         text = segment['text'].strip()
         # Translate if target language is different
+        if segment.get('language') and segment['language'] != target_lang:
             text = translate_text(text, segment['language'], target_lang)
         text_clip = mp.TextClip(
 @spaces.GPU
 def process_video(video_path, target_lang="en"):
     """Main function to process video and add subtitles with translation"""
+    # Load CrisperWhisper model and processor
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model_id = "nyrahealth/CrisperWhisper"
+    model = AutoModelForSpeechSeq2Seq.from_pretrained(
+        model_id,
+        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
+        low_cpu_mem_usage=True,
+        use_safetensors=True
+    ).to(device)
+    processor = AutoProcessor.from_pretrained(model_id)
+    # Load audio and video
+    audio_array, sampling_rate, video, temp_audio_path = load_audio(video_path)
+    # Create pipeline
+    pipe = pipeline(
+        "automatic-speech-recognition",
+        model=model,
+        tokenizer=processor.tokenizer,
+        feature_extractor=processor.feature_extractor,
+        max_new_tokens=128,
+        chunk_length_s=30,
+        batch_size=16,
+        return_timestamps=True,
+        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
+        device=device,
+    )
     # Transcribe audio
+    result = pipe(audio_array, return_timestamps="word")
+    # Convert word-level timestamps to segments
+    segments = []
+    current_segment = {"text": "", "start": result["chunks"][0]["timestamp"][0]}
+    for chunk in result["chunks"]:
+        current_segment["text"] += " " + chunk["text"]
+        current_segment["end"] = chunk["timestamp"][1]
+        # Start new segment if text is long enough or enough time has passed
+        if len(current_segment["text"].split()) > 10 or \
+           (current_segment["end"] - current_segment["start"]) > 5.0:
+            segments.append(current_segment)
+            if chunk != result["chunks"][-1]:  # If not the last chunk
+                current_segment = {"text": "", "start": chunk["timestamp"][1]}
+    # Add last segment if not empty
+    if current_segment["text"]:
+        segments.append(current_segment)
     # Add detected language to segments
+    detected_language = "en"  # CrisperWhisper is English-focused
+    for segment in segments:
+        segment['language'] = detected_language
+    # Create SRT content
+    srt_content = create_srt(segments, target_lang)
     # Save SRT file
     video_name = os.path.splitext(os.path.basename(video_path))[0]
     with open(srt_path, "w", encoding="utf-8") as f:
         f.write(srt_content)
+    # Create subtitle clips
+    subtitle_clips = create_subtitle_clips(segments, video.size, target_lang)
     # Combine video with subtitles
     final_video = mp.CompositeVideoClip([video] + subtitle_clips)
         gr.Video(label="Video with Subtitles"),
         gr.File(label="SRT Subtitle File")
     ],
+    title="Video Subtitler with CrisperWhisper",
+    description="Upload a video to generate subtitles using CrisperWhisper, translate them to your chosen language, and embed them directly in the video."
 )
 if __name__ == "__main__":