import cv2
import gradio as gr
from PIL import Image
from transformers import pipeline

# Load the captioning model once at import time rather than on every call
captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")


def video_to_descriptions(video):
    # Open the video (Gradio passes the uploaded file as a filepath)
    cap = cv2.VideoCapture(video)
    fps = int(cap.get(cv2.CAP_PROP_FPS))

    descriptions = []
    frame_count = 0

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Sample one frame every half second
        if frame_count % max(1, fps // 2) == 0:
            # Convert the frame from BGR (OpenCV) to RGB and wrap it as a PIL image
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            image = Image.fromarray(frame_rgb)

            # Get the caption for this frame
            outputs = captioner(image)
            description = outputs[0]["generated_text"]
            descriptions.append(description)

        frame_count += 1

    # Close the video reader
    cap.release()

    # Concatenate the descriptions
    return " ".join(descriptions)


iface = gr.Interface(
    fn=video_to_descriptions,
    inputs=gr.Video(label="Import a video"),
    outputs="text",
    live=False,
)

if __name__ == "__main__":
    iface.launch()
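
# Quick sanity check without launching the Gradio UI (a sketch: "sample_clip.mp4"
# is a hypothetical path, assumed to point to a short local video file):
#
#   print(video_to_descriptions("sample_clip.mp4"))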