h4d35 committed
Commit cab3a12 · 1 parent: ed0f956

Create app.py

Files changed (1)
  1. app.py +104 -0
app.py ADDED
import os

# Print the installed packages to the Space logs for debugging
os.system("pip freeze")

import datetime
import math

import cv2
import torch
import clip
import gradio as gr
from PIL import Image


# Load the OpenAI CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)


def inference(video, text):
    # The extracted frames will be stored in video_frames
    video_frames = []

    # Open the video file and read it frame by frame
    capture = cv2.VideoCapture(video)
    fps = capture.get(cv2.CAP_PROP_FPS)

    ret, frame = capture.read()
    while capture.isOpened() and ret:
        # OpenCV returns BGR frames; reverse the channels to get RGB for PIL
        video_frames.append(Image.fromarray(frame[:, :, ::-1]))
        ret, frame = capture.read()
    capture.release()

    # Print some statistics
    print(f"Frames extracted: {len(video_frames)}")

    # You can try tuning the batch size for very large videos, but it should usually be OK
    batch_size = 256
    batches = math.ceil(len(video_frames) / batch_size)

    # The encoded features of all batches will be collected in video_features
    video_features = []

    # Process each batch
    for i in range(batches):
        print(f"Processing batch {i+1}/{batches}")

        # Get the relevant frames
        batch_frames = video_frames[i * batch_size : (i + 1) * batch_size]

        # Preprocess the images for the batch
        batch_preprocessed = torch.stack([preprocess(frame) for frame in batch_frames]).to(device)

        # Encode with CLIP and normalize
        with torch.no_grad():
            batch_features = model.encode_image(batch_preprocessed)
            batch_features /= batch_features.norm(dim=-1, keepdim=True)

        # Append the batch to the list containing all features
        video_features.append(batch_features)

    # Concatenate the features of all batches into a single tensor
    video_features = torch.cat(video_features)

    # Print some stats
    print(f"Features: {video_features.shape}")

    search_query = text
    display_results_count = 1

    # Encode and normalize the search query using CLIP
    with torch.no_grad():
        text_features = model.encode_text(clip.tokenize(search_query).to(device))
        text_features /= text_features.norm(dim=-1, keepdim=True)

    # Compute the cosine similarity between the search query and each frame
    similarities = 100.0 * video_features @ text_features.T
    values, best_photo_idx = similarities.topk(display_results_count, dim=0)

    # Return the best matching frame together with its timestamp in the video
    for frame_id in best_photo_idx:
        frame = video_frames[frame_id]
        seconds = round(frame_id.cpu().numpy()[0] / fps)
        return frame, f"Found at {str(datetime.timedelta(seconds=seconds))}"


title = "Video Search"
description = "Gradio demo for using OpenAI's CLIP to search inside videos. To use it, simply upload your video and add your text. Read more at the links below."
article = "<p style='text-align: center'><a href='https://github.com/haltakov/natural-language-youtube-search' target='_blank'>Github Repo</a></p>"

examples = [["test.mp4", "gas station"]]

gr.Interface(
    inference,
    ["video", "text"],
    [gr.outputs.Image(type="pil", label="Output"), "text"],
    title=title,
    description=description,
    article=article,
    examples=examples,
).launch(debug=True, enable_queue=True)
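
For readers who want to try the ranking step without the video decoding or the Gradio UI, here is a minimal sketch (not part of the commit) of the same CLIP text-to-image scoring used in inference() above. The best_match helper and the example frame files are hypothetical; it assumes the openai/CLIP package (pip install git+https://github.com/openai/CLIP.git) and torch are installed, as in app.py.

# Standalone sketch of the CLIP ranking idea used above (hypothetical helper,
# not part of the commit): score a list of PIL images against a text query
# and return the index of the best-matching image.
import torch
import clip
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

def best_match(images, query):
    """Return the index of the PIL image that best matches the text query."""
    with torch.no_grad():
        # Encode and normalize the images
        image_input = torch.stack([preprocess(im) for im in images]).to(device)
        image_features = model.encode_image(image_input)
        image_features /= image_features.norm(dim=-1, keepdim=True)

        # Encode and normalize the query
        text_features = model.encode_text(clip.tokenize(query).to(device))
        text_features /= text_features.norm(dim=-1, keepdim=True)

        # Cosine similarity between the query and every image; pick the best one
        similarities = image_features @ text_features.T
        return int(similarities.argmax())

# Example usage (frame images are hypothetical):
# frames = [Image.open("frame_000.jpg"), Image.open("frame_001.jpg")]
# print(best_match(frames, "gas station"))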