flocolombari committed on
Commit 351301f · 1 Parent(s): ea6335a

Update app.py

Files changed (1)
  1. app.py +29 -6
app.py CHANGED
@@ -1,20 +1,38 @@
 import gradio as gr
-from transformers import pipeline
+from transformers import pipeline, AutoProcessor, BarkModel
 import cv2
 from PIL import Image
 import io
 import scipy
+import torch
+

 #Commit
 def video_to_descriptions(video, target_language="en"):
+
+    modelname="suno/bark-small"
+    processor = AutoProcessor.from_pretrained(hgmodelname)
+    model = BarkModel.from_pretrained(modelname, torch_dtype=torch.float16).to(device)
+    model = BetterTransformer.transform(model, keep_original_model=False)
+
+    # enable CPU offload
+    model.enable_cpu_offload()
+
     # Load the image-to-text and summarization pipelines
     ImgToText = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
     Summarize = pipeline("summarization", model="tuner007/pegasus_summarizer")

     # Load the translation pipeline for the target language
     translator = pipeline("translation", model=f"Helsinki-NLP/opus-mt-en-{target_language}")
-    audio = pipeline("text-to-speech", model="suno/bark-small")
-    voice_preset = f"v2/{target_language}_speaker_2"
+    voice_preset = f"v2/{target_language}_speaker_1"
+

     # Open the video
     cap = cv2.VideoCapture(video)
@@ -52,11 +70,16 @@ def video_to_descriptions(video, target_language="en"):
     translated_text = translator(summarized_description)[0]["translation_text"]
     print("TRANSLATION : " + translated_text)

-    audio_file = audio("translated_text", voice_preset=voice_preset)
-    print(audio_file)
+    inputs = processor(translated_text, voice_preset=voice_preset)
+    #audio_file = audio("translated_text", voice_preset=voice_preset)
+
+    audio_array = model.generate(**inputs)
+    audio_array = audio_array.cpu().numpy().squeeze()
+    sample_rate = model.generation_config.sample_rate

     output_path = "./bark_out.wav"
-    scipy.io.wavfile.write(output_path, rate=audio_file["sampling_rate"], data=audio_file["audio"][0].squeeze())
+    scipy.io.wavfile.write(output_path, rate=sample_rate, data=audio_array)

     return output_path
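As committed, the new Bark setup references two names that app.py never defines, `hgmodelname` (presumably `modelname`) and `device`, and it calls `BetterTransformer` without importing it (it ships with the `optimum` package); `scipy.io.wavfile` is also imported explicitly below, since depending on the SciPy version a bare `import scipy` may not expose it. The following is a minimal, self-contained sketch of the text-to-speech path this commit appears to intend, not the committed code: the device and dtype fallback, the `.to(device)` on the processor output, the float32 cast before writing the WAV, and the placeholder text and preset are assumptions.

import torch
import scipy.io.wavfile
from transformers import AutoProcessor, BarkModel
from optimum.bettertransformer import BetterTransformer  # assumed source of BetterTransformer

# The commit uses `device` without defining it; pick one at runtime.
device = "cuda" if torch.cuda.is_available() else "cpu"

modelname = "suno/bark-small"  # presumably what `hgmodelname` was meant to reference
processor = AutoProcessor.from_pretrained(modelname)

# float16 weights only help on a GPU; fall back to float32 on CPU.
dtype = torch.float16 if device == "cuda" else torch.float32
model = BarkModel.from_pretrained(modelname, torch_dtype=dtype).to(device)

# Optional speed-ups used by the commit: BetterTransformer kernels and CPU offload.
model = BetterTransformer.transform(model, keep_original_model=False)
if device == "cuda":
    model.enable_cpu_offload()  # needs the accelerate package and a CUDA device

# Stand-ins for the translated summary and speaker preset built in app.py.
translated_text = "A person walks a dog through a sunny park."
voice_preset = "v2/en_speaker_1"  # same pattern as f"v2/{target_language}_speaker_1"

# Tokenize with the speaker preset and move the tensors onto the model's device.
inputs = processor(translated_text, voice_preset=voice_preset).to(device)
audio_array = model.generate(**inputs)

# Cast to float32 before saving; scipy's WAV writer does not accept float16 data.
audio_array = audio_array.cpu().float().numpy().squeeze()
sample_rate = model.generation_config.sample_rate

output_path = "./bark_out.wav"
scipy.io.wavfile.write(output_path, rate=sample_rate, data=audio_array)

In the Space itself, `video_to_descriptions` presumably returns this path to a Gradio audio output component, but that wiring is outside this diff.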