Spaces:
Runtime error
Runtime error
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
# Runs `pip freeze` in a shell as soon as this module is loaded.
# NOTE(review): presumably a debugging aid to get the installed package
# versions into the process output; confirm it is still wanted.
os.system("pip freeze")
|
3 |
+
import cv2
|
4 |
+
from PIL import Image
|
5 |
+
import clip
|
6 |
+
import torch
|
7 |
+
import math
|
8 |
+
import numpy as np
|
9 |
+
import torch
|
10 |
+
import datetime
|
11 |
+
import gradio as gr
|
12 |
+
|
13 |
+
|
14 |
+
# Run on the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the CLIP "ViT-B/32" model on that device; clip.load also returns the
# image preprocessing callable that is applied to every frame later on.
model, preprocess = clip.load("ViT-B/32", device=device)
|
17 |
+
|
18 |
+
|
19 |
+
|
20 |
+
def _read_frames(video):
    """Decode every frame of *video* and return ``(frames, fps)``.

    ``frames`` is a list of RGB PIL images in playback order, so the list
    index of an image equals its frame number in the video. ``fps`` is the
    value OpenCV reports for ``CAP_PROP_FPS``.

    Raises:
        ValueError: if OpenCV cannot open *video*.
    """
    capture = cv2.VideoCapture(video)
    try:
        if not capture.isOpened():
            raise ValueError(f"Could not open video: {video!r}")

        fps = capture.get(cv2.CAP_PROP_FPS)
        frames = []
        while True:
            ret, frame = capture.read()
            if not ret:
                break
            # OpenCV hands back BGR pixel data; reorder to RGB before
            # wrapping it in a PIL image.
            frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
    finally:
        # Always give the file handle / decoder back, even on errors.
        capture.release()
    return frames, fps


def _encode_frames(frames, batch_size=256):
    """Return L2-normalised CLIP image features, one row per frame.

    *frames* must be non-empty. The frames are pushed through the module-level
    ``preprocess`` and ``model`` in chunks of *batch_size*; tune the batch
    size down if very large videos exhaust memory.
    """
    total_batches = math.ceil(len(frames) / batch_size)
    encoded = []
    for batch_no, start in enumerate(range(0, len(frames), batch_size), start=1):
        print(f"Processing batch {batch_no}/{total_batches}")

        batch_frames = frames[start:start + batch_size]
        batch = torch.stack([preprocess(frame) for frame in batch_frames]).to(device)

        with torch.no_grad():
            features = model.encode_image(batch)
            features = features / features.norm(dim=-1, keepdim=True)
        encoded.append(features)

    # Concatenating once keeps whatever dtype/width the model produced,
    # instead of assuming a float16 [*, 512] accumulator.
    return torch.cat(encoded)


def inference(video, text):
    """Find the frame of *video* that best matches the query *text*.

    Args:
        video: Value handed to ``cv2.VideoCapture`` (presumably the file path
            Gradio supplies for the uploaded video; confirm against the
            installed Gradio version).
        text: Search query, tokenised with ``clip.tokenize``.

    Returns:
        A ``(frame, message)`` tuple: the best-matching frame as a PIL image
        and a string of the form ``"Found at H:MM:SS"``. If the video reports
        no usable frame rate the message is ``"Found at frame N"`` instead.

    Raises:
        ValueError: if no video was supplied, it cannot be opened, or it
            contains no readable frames.
    """
    if not video:
        raise ValueError("No video was provided.")

    # The frame images are kept in playback order so an index maps to a time.
    video_frames, fps = _read_frames(video)
    print(f"Frames extracted: {len(video_frames)}")
    if not video_frames:
        raise ValueError("No frames could be read from the video.")

    # The encoded features will be stored in video_features.
    video_features = _encode_frames(video_frames)
    print(f"Features: {video_features.shape}")

    # Encode and normalize the search query using CLIP.
    with torch.no_grad():
        text_features = model.encode_text(clip.tokenize(text).to(device))
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

    # Both sides are unit length, so the dot product is the cosine similarity
    # between the query and each frame; the highest score wins.
    similarities = video_features @ text_features.T
    best_index = int(similarities[:, 0].argmax().item())

    # Convert the frame number to a timestamp. fps can be 0 (or NaN) for
    # streams without frame-rate metadata; `fps > 0` is False in both cases.
    if fps > 0:
        seconds = round(best_index / fps)
        message = f"Found at {datetime.timedelta(seconds=seconds)}"
    else:
        message = f"Found at frame {best_index}"
    return video_frames[best_index], message
|
90 |
+
|
91 |
+
# Text shown on the demo page.
title = "Video Search"
description = (
    "Gradio demo for using OpenAI's CLIP to search inside videos. "
    "To use it, simply upload your video and add your text. "
    "Read more at the links below."
)
article = (
    "<p style='text-align: center'>"
    "<a href='https://github.com/haltakov/natural-language-youtube-search' target='_blank'>"
    "Github Repo</a></p>"
)

# One pre-filled example row: a video file name and a query to go with it.
examples = [["test.mp4", "gas station"]]

# Wire `inference` to a video + text input and an image + text output,
# then start the app.
gr.Interface(
    fn=inference,
    inputs=["video", "text"],
    outputs=[gr.outputs.Image(type="pil", label="Output"), "text"],
    title=title,
    description=description,
    article=article,
    examples=examples,
).launch(debug=True, enable_queue=True)
|