flocolombari committed
Commit e95b86d
Parent: af073f3

Update app.py

Files changed (1)
  1. app.py +39 -43
app.py CHANGED
@@ -8,60 +8,56 @@ import scipy
 #Commit
 def video_to_descriptions(video, target_language="en"):
     # Load the image-to-text and summarization pipelines
-    #ImgToText = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
-    #Summarize = pipeline("summarization", model="tuner007/pegasus_summarizer")
+    ImgToText = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
+    Summarize = pipeline("summarization", model="tuner007/pegasus_summarizer")

     # Load the translation pipeline for the target language
-    #translator = pipeline("translation", model=f"Helsinki-NLP/opus-mt-en-{target_language}")
-    audio = pipeline("text-to-speech", model="suno/bark-small")
-    #
-    ## Open the video
-    #cap = cv2.VideoCapture(video)
-    #fps = int(cap.get(cv2.CAP_PROP_FPS))
-    #
-    #descriptions = []
-    #frame_count = 0
-    #
-    #while True:
-    #    ret, frame = cap.read()
-    #    if not ret:
-    #        break
-    #
-    #    # Extract an image every 2 seconds
-    #    if frame_count % (fps * 2) == 0:
-    #        # Convert the image to RGB
-    #        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-    #        # Convert the numpy array to a PIL image
-    #        pil_img = Image.fromarray(frame_rgb)
-    #        # Get the image description
-    #        outputs = ImgToText(pil_img)
-    #        description = outputs[0]['generated_text']
-    #        descriptions.append(description)
-    #        print(str(frame_count) + " : " + outputs[0]['generated_text'])
-    #
-    #    frame_count += 1
-
-    ## Close the video reader
-    #cap.release()
-
-    ## Concatenate the descriptions
-    #concatenated_description = " ".join(descriptions)
-    #summarized_description = Summarize(concatenated_description, max_length=31)[0]["summary_text"]
-    #print("SUMMARIZATION : " + summarized_description)
+    translator = pipeline("translation", model=f"Helsinki-NLP/opus-mt-en-{target_language}")
+    audio = pipeline("text-to-speech", model="suno/bark")
+
+    # Open the video
+    cap = cv2.VideoCapture(video)
+    fps = int(cap.get(cv2.CAP_PROP_FPS))

-    #translated_text = translator(summarized_description)[0]["translation_text"]
-    #print("TRANSLATION : " + translated_text)
+    descriptions = []
+    frame_count = 0
+
+    while True:
+        ret, frame = cap.read()
+        if not ret:
+            break
+
+        # Extract an image every 2 seconds
+        if frame_count % (fps * 2) == 0:
+            # Convert the image to RGB
+            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            # Convert the numpy array to a PIL image
+            pil_img = Image.fromarray(frame_rgb)
+            # Get the image description
+            outputs = ImgToText(pil_img)
+            description = outputs[0]['generated_text']
+            descriptions.append(description)
+            print(str(frame_count) + " : " + outputs[0]['generated_text'])
+
+        frame_count += 1
+
+    # Close the video reader
+    cap.release()
+    # Concatenate the descriptions
+    concatenated_description = " ".join(descriptions)
+    summarized_description = Summarize(concatenated_description, max_length=31)[0]["summary_text"]
+    print("SUMMARIZATION : " + summarized_description)

-    print(audio("bonjour je m'appelle Florent et je fais un test"))
+    translated_text = translator(summarized_description)[0]["translation_text"]
+    print("TRANSLATION : " + translated_text)

-    audio_file = audio("bonjour je m'appelle Florent et je fais un test")
+    audio_file = audio(translated_text)
     print(audio_file)

     output_path = "./bark_out.wav"
     scipy.io.wavfile.write(output_path, rate=audio_file["sampling_rate"], data=audio_file["audio"][0].squeeze())

     return output_path
-    #return translated_text

 # Create a dropdown menu with language options
 language_dropdown = gr.Dropdown(
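
The hunk ends inside the gr.Dropdown( call, so the interface wiring itself is not visible in this diff. As a minimal sketch of how the updated video_to_descriptions might be hooked into the Gradio app: the dropdown choices, labels, and gr.Interface arguments below are illustrative assumptions, not taken from app.py.

import gradio as gr

# Hypothetical wiring, for illustration only; the real app.py continues
# past the end of this hunk.
language_dropdown = gr.Dropdown(
    choices=["fr", "de", "es"],   # assumed target languages for opus-mt-en-*
    value="fr",
    label="Target language",
)

demo = gr.Interface(
    fn=video_to_descriptions,              # the function updated in this commit
    inputs=[gr.Video(), language_dropdown],
    outputs=gr.Audio(type="filepath"),     # the function returns a .wav path
)

demo.launch()

Declaring the output as gr.Audio(type="filepath") lets Gradio serve the bark_out.wav file that video_to_descriptions returns.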
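
For reference, the scipy.io.wavfile.write call in the function assumes the text-to-speech pipeline returns a dict carrying the waveform and its sampling rate. A standalone sketch of that assumption, using the smaller suno/bark-small checkpoint the previous revision tested with:

from transformers import pipeline
import scipy.io.wavfile

tts = pipeline("text-to-speech", model="suno/bark-small")

out = tts("Hello, this is a short test.")
# Expected result shape: {"audio": array of shape (1, n_samples), "sampling_rate": int}
print(out["sampling_rate"], out["audio"].shape)

# Same write pattern as in the diff: drop the leading axis before saving.
scipy.io.wavfile.write("bark_out.wav", rate=out["sampling_rate"], data=out["audio"][0].squeeze())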