import gradio as gr
from transformers import pipeline
import cv2
from PIL import Image
import scipy.io.wavfile
import time
import numpy as np

def detect_scene_changes(video_path, threshold):
    """
    Detects scene changes (shot cuts) in a video.

    Parameters:
    - video_path: path to the video file
    - threshold: mean frame-difference threshold above which a cut is detected

    Returns:
    A list of frame numbers at which a scene change is detected.
    """
    
    cap = cv2.VideoCapture(video_path)
    
    if not cap.isOpened():
        print("Erreur lors de l'ouverture de la vidéo.")
        return []

    ret, prev_frame = cap.read()
    if not ret:
        print("Erreur lors de la lecture de la vidéo.")
        return []

    prev_frame_gray = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)
    
    scene_changes = []

    # Frame 0 was already read as prev_frame, so the first frame compared below is frame 1
    frame_number = 1
    while True:
        ret, current_frame = cap.read()
        if not ret:
            break
        
        current_frame_gray = cv2.cvtColor(current_frame, cv2.COLOR_BGR2GRAY)

        # Compute the absolute difference between the current and previous frames
        diff = cv2.absdiff(prev_frame_gray, current_frame_gray)
        mean_diff = np.mean(diff)

        if mean_diff > threshold:
            scene_changes.append(frame_number)

        prev_frame_gray = current_frame_gray
        frame_number += 1

    cap.release()
    return scene_changes
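
# Example usage (illustrative sketch; "sample.mp4" is a placeholder path and 45.0
# mirrors the threshold used in video_to_descriptions below):
#
#     changes = detect_scene_changes("sample.mp4", threshold=45.0)
#     print(f"{len(changes)} scene changes detected at frames: {changes}")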

def video_to_descriptions(video, target_language="en"):
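    """
    Generates a spoken description of a video.

    Frames are captioned roughly every 3 seconds and at each detected scene change,
    the captions are summarized per scene, translated into target_language (when it
    is not English), and synthesized to speech with Bark. Returns the path of the
    generated WAV file.
    """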

    threshold = 45.0

    scene_changes = detect_scene_changes(video, threshold)
    
    start_time = time.time()
    print("START TIME = ", start_time)

    ImgToText = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
    Summarize = pipeline("summarization", model="tuner007/pegasus_summarizer")
    translator = pipeline("translation", model=f"Helsinki-NLP/opus-mt-en-{target_language}")
    audio = pipeline("text-to-speech", model="suno/bark-small")

    voice_preset = f"v2/{target_language}_speaker_1"

    cap = cv2.VideoCapture(video)
    fps = int(cap.get(cv2.CAP_PROP_FPS)) or 30  # fall back to 30 fps if the frame rate cannot be read
    
    descriptions = []
    frame_count = 0
    
    while True:
        ret, frame = cap.read()
        if not ret:
            break
   
        # Caption one frame every 3 seconds, plus the first frame of every detected scene change
        if frame_count % (fps * 3) == 0 or frame_count in scene_changes:

            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            pil_img = Image.fromarray(frame_rgb)

            outputs = ImgToText(pil_img)
            description = outputs[0]['generated_text']

            if frame_count in scene_changes:
                descriptions.append(" There has been a scene change, now we can observe " + description)
                print(str(frame_count) + " | SCENE CHANGE | " + description)

            else:
                descriptions.append(" we can see that " + description)
                print(str(frame_count) + " | " + description)
            
        frame_count += 1

    cap.release()

    concatenated_description = " ".join(descriptions).split("There has been a scene change, now we can observe")
    plan_number = 1
    summarized_description = f"We can see the Scene number {plan_number}, where "
    
    for plan in concatenated_description:
        if not (summarized_description == "We can see the Scene number 1, where "):
            summarized_description += f"There has been a scene change, now we can observe the Scene number {plan_number}, where "
            summarized_description += Summarize(plan, max_length=20)[0]["summary_text"]
            plan_number += 1
        else:
            summarized_description += Summarize(plan, max_length=20)[0]["summary_text"]
            plan_number += 1
            
    print("SUMMARIZATION : " + summarized_description)

    if translator is not None:
        translated_text = translator(summarized_description)[0]["translation_text"]
    else:
        translated_text = summarized_description
    print("TRANSLATION : " + translated_text)
    
    audio_file = audio(translated_text)

    output_path = "./bark_out.wav"
    scipy.io.wavfile.write(output_path, data=audio_file["audio"][0], rate=audio_file["sampling_rate"])

    stop_time = time.time()

    print("EXECUTION TIME = ", stop_time - start_time)
    return output_path
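
# Direct-call sketch that bypasses the Gradio UI (illustrative; "sample.mp4" is a
# placeholder path, not a file shipped with this script):
#
#     wav_path = video_to_descriptions("sample.mp4", target_language="fr")
#     print(wav_path)  # -> ./bark_out.wav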

language_dropdown = gr.Dropdown(
    ["en", "fr", "de", "es"], label="[MANDATORY] Language", info="The voice's language"
)

iface = gr.Interface(
    fn=video_to_descriptions,
    inputs=[gr.Video(label="Video to Upload", info="The Video"), language_dropdown],
    outputs="audio",
    live=False
)

if __name__ == "__main__":
    iface.launch()