import gradio as gr
from transformers import pipeline
import cv2
from PIL import Image
import io
import scipy.io.wavfile  # explicit submodule import, needed for scipy.io.wavfile.write below
import torch
import time
import numpy as np
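
# Gradio app that turns a video into a spoken description: sampled frames are
# captioned with BLIP, the captions are summarized per scene with Pegasus,
# translated with Opus-MT, and synthesized to speech with Bark.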
def detect_scene_changes(video_path, threshold):
"""
Détecte les changements de plan dans une vidéo.
Parameters:
- video_path: chemin vers le fichier vidéo
- threshold: seuil de différence pour détecter un changement de plan
Returns:
Une liste des numéros d'images où un changement de plan est détecté.
"""
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
print("Erreur lors de l'ouverture de la vidéo.")
return []
ret, prev_frame = cap.read()
if not ret:
print("Erreur lors de la lecture de la vidéo.")
return []
prev_frame_gray = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)
scene_changes = []
frame_number = 0
while True:
ret, current_frame = cap.read()
if not ret:
break
current_frame_gray = cv2.cvtColor(current_frame, cv2.COLOR_BGR2GRAY)
        # Compute the absolute difference between the two frames
diff = cv2.absdiff(prev_frame_gray, current_frame_gray)
mean_diff = np.mean(diff)
if mean_diff > threshold:
scene_changes.append(frame_number)
prev_frame_gray = current_frame_gray
frame_number += 1
cap.release()
return scene_changes
def video_to_descriptions(video, target_language="en"):
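    """
    Turn a video into spoken narration.
    Captions one frame roughly every 3 seconds (plus every detected scene
    change), summarizes the captions scene by scene, translates the summary
    into target_language, synthesizes it with Bark, and returns the path to
    the generated WAV file.
    """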
    threshold = 25.0
scene_changes = detect_scene_changes(video, threshold)
start_time = time.time()
print("START TIME = ", start_time)
ImgToText = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
Summarize = pipeline("summarization", model="tuner007/pegasus_summarizer")
translator = pipeline("translation", model=f"Helsinki-NLP/opus-mt-en-{target_language}")
audio = pipeline("text-to-speech", model="suno/bark-small")
    voice_preset = f"v2/{target_language}_speaker_1"  # note: currently unused; not passed to the TTS call below
cap = cv2.VideoCapture(video)
fps = int(cap.get(cv2.CAP_PROP_FPS))
descriptions = []
frame_count = 0
while True:
ret, frame = cap.read()
if not ret:
break
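        # Caption one frame roughly every 3 seconds, plus every detected scene change.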
        if frame_count % (fps * 3) == 0 or frame_count in scene_changes:
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
pil_img = Image.fromarray(frame_rgb)
outputs = ImgToText(pil_img)
description = outputs[0]['generated_text']
            if frame_count in scene_changes:
                descriptions.append(" There has been a scene change, now we can observe " + description)
                print(str(frame_count) + " | SCENE CHANGE | " + description)
            else:
                descriptions.append(" we can see that " + description)
                print(str(frame_count) + " | " + description)
frame_count += 1
cap.release()
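    # Split the concatenated captions back into per-scene chunks and summarize each one.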
concatenated_description = " ".join(descriptions).split(" There has been a scene change, now we can observe")
plan_number = 1
summarized_description = f"We can see the Scene number {plan_number}, where "
for plan in concatenated_description:
        if summarized_description != "We can see the Scene number 1, where ":
summarized_description += f"There has been a scene change, now we can observe the Scene number {plan_number}, where "
summarized_description += Summarize(plan, max_length=20)[0]["summary_text"]
plan_number += 1
else:
summarized_description += Summarize(plan, max_length=20)[0]["summary_text"]
plan_number += 1
print("SUMMARIZATION : " + summarized_description)
translated_text = translator(summarized_description, max_length=2560)[0]["translation_text"]
print("TRANSLATION : " + translated_text)
audio_file = audio(translated_text)
output_path = "./bark_out.wav"
scipy.io.wavfile.write(output_path, data=audio_file["audio"][0], rate=audio_file["sampling_rate"])
stop_time = time.time()
print("EXECUTION TIME = ", stop_time - start_time)
return output_path
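
# Gradio UI: upload a video, choose the narration language, get the audio back.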
language_dropdown = gr.Dropdown(
["en", "fr", "de", "es"], label="[MANDATORY] Language", info="The Voice's Language"
)
iface = gr.Interface(
fn=video_to_descriptions,
    inputs=[gr.Video(label="Video to Upload"), language_dropdown],
outputs="audio",
live=False
)
if __name__ == "__main__":
    iface.launch()