rafaaa2105 committed on
Commit
2986f68
1 Parent(s): 73c2589

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -4
app.py CHANGED
@@ -1,10 +1,11 @@
1
  import gradio as gr
2
- import whisper
3
  import moviepy.editor as mp
4
  from deep_translator import GoogleTranslator
5
  from pydub import AudioSegment
6
  import os
7
  import tempfile
 
 
8
 
9
  def extract_audio(video_path):
10
  video = mp.VideoFileClip(video_path)
@@ -14,9 +15,23 @@ def extract_audio(video_path):
14
  return audio_path
15
 
16
  def generate_subtitles(audio_path):
17
- model = whisper.load_model("base")
18
- result = model.transcribe(audio_path)
19
- return result["segments"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  def translate_subtitles(subtitles, target_language):
22
  translator = GoogleTranslator(source='auto', target=target_language)
 
1
  import gradio as gr
 
2
  import moviepy.editor as mp
3
  from deep_translator import GoogleTranslator
4
  from pydub import AudioSegment
5
  import os
6
  import tempfile
7
+ import torch
8
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
9
 
10
  def extract_audio(video_path):
11
  video = mp.VideoFileClip(video_path)
 
15
  return audio_path
16
 
17
def generate_subtitles(audio_path):
    """Transcribe an audio file with Whisper and return subtitle segments.

    Args:
        audio_path: Path to an audio file readable by librosa
            (e.g. the WAV produced by extract_audio).

    Returns:
        A list containing a single segment dict with keys
        ``"start"`` (seconds, always 0), ``"end"`` (clip duration in
        seconds) and ``"text"`` (the full transcription).
    """
    # BUG FIX: librosa was used below but never imported at the top of the
    # file, so this function raised NameError at runtime. Import it locally
    # to keep the fix self-contained.
    import librosa

    device = "cuda" if torch.cuda.is_available() else "cpu"
    processor = WhisperProcessor.from_pretrained("openai/whisper-base")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base").to(device)

    # Whisper models are trained on 16 kHz mono audio, so resample on load.
    audio_input, _ = librosa.load(audio_path, sr=16000)
    input_features = processor(
        audio_input, sampling_rate=16000, return_tensors="pt"
    ).input_features.to(device)

    # Generate token ids, then decode them to text.
    predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

    # For simplicity, return a single segment spanning the whole clip.
    # A more advanced implementation would split this into timed segments.
    return [{"start": 0, "end": len(audio_input) / 16000, "text": transcription[0]}]
35
 
36
  def translate_subtitles(subtitles, target_language):
37
  translator = GoogleTranslator(source='auto', target=target_language)