import os
# os.system("pip freeze")
import cv2
from PIL import Image
import clip
import torch
import math
import numpy as np
import datetime
import gradio as gr

# Load the open CLIP model once at import time so every request reuses it.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Number of frames encoded per forward pass. Lower this if the device runs
# out of memory on high-resolution videos.
_BATCH_SIZE = 256


def _iter_frame_batches(capture, batch_size):
    """Yield successive lists of at most *batch_size* RGB PIL frames from *capture*."""
    batch = []
    while True:
        ret, frame = capture.read()
        if not ret:
            break
        # OpenCV decodes to BGR; reverse the channel axis to get RGB for PIL.
        batch.append(Image.fromarray(frame[:, :, ::-1]))
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        yield batch


def inference(video, text):
    """Find the video frame that best matches a text query using CLIP.

    Frames are decoded and encoded batch by batch, and only the best match
    seen so far is retained, so memory use does not grow with video length.

    Args:
        video: Path of the video file to search (as passed by the Gradio
            "video" input). May be ``None``/empty when nothing was uploaded.
        text: The search query.

    Returns:
        A ``(frame, message)`` tuple. ``frame`` is the best-matching frame as
        a PIL image, or ``None`` when the video is missing, cannot be opened,
        or yields no frames. ``message`` is ``"Found at H:MM:SS"`` on success,
        or a short explanation otherwise.
    """
    if not video:
        return None, "No video provided"

    capture = cv2.VideoCapture(video)
    try:
        if not capture.isOpened():
            return None, "Could not open the video"

        fps = capture.get(cv2.CAP_PROP_FPS)

        # Encode and normalize the search query using CLIP.
        with torch.no_grad():
            text_features = model.encode_text(clip.tokenize(text).to(device))
            text_features /= text_features.norm(dim=-1, keepdim=True)

        best_score = None
        best_index = -1
        best_frame = None
        frames_seen = 0

        for batch_number, batch_frames in enumerate(
            _iter_frame_batches(capture, _BATCH_SIZE), start=1
        ):
            print(f"Processing batch {batch_number}")

            # Preprocess the images for the batch.
            batch_preprocessed = torch.stack(
                [preprocess(frame) for frame in batch_frames]
            ).to(device)

            # Encode with CLIP, normalize, and score against the query using
            # cosine similarity.
            with torch.no_grad():
                batch_features = model.encode_image(batch_preprocessed)
                batch_features /= batch_features.norm(dim=-1, keepdim=True)
                similarities = (100.0 * batch_features @ text_features.T).squeeze(1)

            score, offset = similarities.max(dim=0)
            score = score.item()
            offset = offset.item()
            if best_score is None or score > best_score:
                best_score = score
                # Index 0 is the very first frame of the video.
                best_index = frames_seen + offset
                best_frame = batch_frames[offset]

            frames_seen += len(batch_frames)
    finally:
        # Always free the decoder/file handle, even if encoding fails.
        capture.release()

    # Print some statistics.
    print(f"Frames processed: {frames_seen}")

    if best_frame is None:
        return None, "No frames could be read from the video"

    # Some containers report no frame rate; fall back to the frame index
    # rather than dividing by zero.
    if not fps or not fps > 0:
        return best_frame, f"Found at frame {best_index}"

    # Find the timestamp in the video.
    seconds = round(best_index / fps)
    return best_frame, f"Found at {datetime.timedelta(seconds=seconds)}"


title = "Video Search"
description = "Gradio demo for using OpenAI's CLIP to search inside videos. To use it, simply upload your video and add your text. Tororo's demo for Video Analytics."
# examples=[['test.mp4',"gas station"]]

# NOTE(review): `gr.outputs.Image` and `launch(enable_queue=...)` look like
# legacy Gradio APIs -- confirm against the pinned Gradio version before
# upgrading.
gr.Interface(
    inference,
    ["video", "text"],
    [gr.outputs.Image(type="pil", label="Output"), "text"],
    title=title,
    description=description,
).launch(debug=True, enable_queue=True)