Spaces:

marianna13
/

search-inside-a-video

Runtime error

App Files Files Community

marianna13 committed on Dec 28, 2022

Commit

0edd243

1 Parent(s): 7968d81

Create app.py

Browse files

Files changed (1) hide show

app.py +166 -0

app.py ADDED Viewed

	@@ -0,0 +1,166 @@

+import gradio as gr
+import yt_dlp
+import os
+import time
+import torch
+from multilingual_clip import pt_multilingual_clip
+import transformers
+import clip
+import numpy as np
+import cv2
+import random
+from PIL import Image
# Fetch the Multilingual-CLIP weights before anything tries to load them.
# The original command began with `%cd`, which is an IPython/Jupyter magic and
# not a shell command: the shell failed on it and, because of `&&`, never ran
# get-weights.sh. A plain `cd` is what the shell understands.
# Still best-effort: the exit status is ignored, exactly as before.
os.system('cd /Multilingual-CLIP && bash get-weights.sh')
class SearchVideo:
    """Find the moment in a video that is visually closest to a text query.

    The video is sampled at one frame per second, every frame is embedded with
    the image encoder, the query is embedded with the text encoder, and the
    frame with the highest cosine similarity wins. Because one frame is taken
    per second, the index of the winning frame is used as a timestamp in
    seconds.
    """

    def __init__(
            self,
            clip_model,
            text_model,
            tokenizer,
            compose,
            ) -> None:
        """
        clip_model: image encoder; must expose ``encode_image(batch)``
        text_model: text encoder; called as ``text_model(text, tokenizer)``
        tokenizer: tokenizer handed to ``text_model`` on every call
        compose: preprocessing callable applied to each PIL image before
            the results are stacked into a batch
        """
        self.text_model = text_model
        self.tokenizer = tokenizer
        self.clip_model = clip_model
        self.compose = compose
        # Image batches are moved to this device before being encoded.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def __call__(self, video: str, text: str) -> tuple:
        """Search ``video`` for ``text``.

        Returns a tuple ``(ind, seg_path, frame)``: the index of the best
        matching frame, the path of the clip cut around it, and the frame
        itself as a PIL image.

        Raises ValueError when no frame could be extracted from ``video``.
        """
        torch.cuda.empty_cache()
        frames = self.video2frames_ffmpeg(video)
        if not frames:
            raise ValueError(f'no frames could be extracted from {video!r}')
        img_embs = self.get_img_embs(frames)
        txt_emb = self.get_txt_embs(text)
        # No rescaling of the text embedding is needed: compare_embeddings
        # L2-normalises both sides, so any positive scale factor cancels out.
        logits_per_image = self.compare_embeddings(img_embs, txt_emb)
        scores = [logit.numpy()[0] for logit in logits_per_image]
        ind = int(np.argmax(scores))
        seg_path = self.extract_seg(video, ind)
        return ind, seg_path, frames[ind]

    @staticmethod
    def _segment_start(start: int) -> str:
        """Return the HH:MM:SS position at which the extracted clip begins."""
        seconds = int(start)
        # Matches in the first five seconds are played from the very start.
        # (The original computed ``start - 5`` here, which is negative and
        # made gmtime() wrap around to 23:59:5x.)
        if seconds <= 5:
            seconds = 0
        return time.strftime('%H:%M:%S', time.gmtime(seconds))

    def extract_seg(self, video: str, start: int):
        """Cut a 5-second clip out of ``video`` and return its file name.

        video: path of the source video
        start: position of the match, in seconds

        The clip is written to ``segment_<HH:MM:SS>.mp4`` in the current
        working directory with the streams copied rather than re-encoded.
        ffmpeg's exit status is not checked.
        """
        import subprocess

        stamp = self._segment_start(start)
        out_path = f'segment_{stamp}.mp4'
        # Argument list instead of a shell string: the path needs no quoting
        # and cannot be interpreted by a shell.
        subprocess.run(
            [
                'ffmpeg', '-ss', stamp, '-i', video, '-t', '00:00:05',
                '-vcodec', 'copy', '-acodec', 'copy', '-y', out_path,
            ],
            check=False,
        )
        return out_path

    def video2frames_ffmpeg(self, video: str) -> list:
        """Extract one frame per second from ``video`` using ffmpeg.

        Returns the frames as a list of PIL images in playback order; the
        list is empty when ffmpeg produced no output.
        """
        import subprocess
        import tempfile

        # A fresh temporary directory per call: no stale frames from an
        # earlier run and no clashes between concurrent requests.
        with tempfile.TemporaryDirectory(prefix='frames') as frames_dir:
            subprocess.run(
                [
                    'ffmpeg', '-i', video, '-r', '1',
                    os.path.join(frames_dir, 'output-%04d.jpg'),
                ],
                check=False,
            )
            images = []
            for name in sorted(os.listdir(frames_dir)):
                # Image.open is lazy; copy() forces the pixels into memory so
                # the file can be closed and deleted without leaking a handle.
                with Image.open(os.path.join(frames_dir, name)) as img:
                    images.append(img.copy())
        return images

    def video2frames(self, video: str) -> list:
        """Extract roughly one frame per second from ``video`` using OpenCV.

        Alternative to ``video2frames_ffmpeg``. Every ``round(fps)``-th frame
        is kept, starting with the first one; 24 is used when the container
        does not report a frame rate. Returns a list of RGB PIL images.
        """
        cap = cv2.VideoCapture(video)
        try:
            fps = cap.get(cv2.CAP_PROP_FPS)
            step = max(int(round(fps)), 1) if fps and fps > 0 else 24
            images = []
            frame_idx = 0
            while True:
                has_frame, frame = cap.read()
                if not has_frame:
                    break
                if frame_idx % step == 0:
                    # OpenCV decodes to BGR; PIL and CLIP expect RGB.
                    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    images.append(Image.fromarray(rgb))
                frame_idx += 1
        finally:
            cap.release()
        return images

    def get_img_embs(self, img_list: list, batch_size: int = 64) -> torch.Tensor:
        """
        takes a list of images and calculates their embeddings with clip_model

        img_list: PIL images; each one is passed through ``self.compose``
        batch_size: number of images encoded per forward pass, so that a long
            video does not have to fit on the device in a single batch

        Returns one float tensor on the CPU with one row per image.
        Raises ValueError when ``img_list`` is empty.
        """
        if not img_list:
            raise ValueError('img_list is empty')
        chunks = []
        with torch.no_grad():
            for offset in range(0, len(img_list), batch_size):
                batch = torch.stack(
                    [self.compose(img)
                     for img in img_list[offset:offset + batch_size]]
                ).to(self.device)
                chunks.append(
                    self.clip_model.encode_image(batch).float().cpu())
        return torch.cat(chunks)

    def get_txt_embs(self, text: str) -> torch.Tensor:
        "calculates the embedding of the text with text_model"
        with torch.no_grad():
            return self.text_model(text, self.tokenizer)

    def compare_embeddings(self, img_embs, txt_embs):
        """Cosine similarity between every image embedding and the text.

        img_embs: tensor with one embedding per row
        txt_embs: tensor with one text embedding per row

        Returns a list with one tensor per image; each tensor holds that
        image's similarity to every row of ``txt_embs``.
        """
        # normalized features
        image_features = img_embs / img_embs.norm(dim=-1, keepdim=True)
        text_features = txt_embs / txt_embs.norm(dim=-1, keepdim=True)
        # cosine similarity as logits, computed for all images in one matmul
        similarity = image_features @ text_features.t()
        return list(similarity.unbind(0))
def download_yt_video(url):
    """Download the video at ``url`` with yt-dlp and return the saved path.

    The file is written to the current working directory as
    ``<video id>.<ext>``; the format selector prefers mp4 at 360p or below and
    falls back to the worst available quality.

    The path is taken from what yt-dlp reports instead of being guessed from
    the URL, so URLs with extra query parameters (``&t=10s``), short links,
    and downloads that end up in a non-mp4 container all resolve to the file
    that was actually written.
    """
    ydl_opts = {
        'quiet': True,
        "outtmpl": "%(id)s.%(ext)s",
        'format': 'bv*[height<=360][ext=mp4]+ba/b[height<=360] / wv*+ba/w'
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=True)
        # After a download yt-dlp records the final (post-merge) file here.
        downloads = info.get('requested_downloads') or []
        if downloads and downloads[0].get('filepath'):
            return downloads[0]['filepath']
        # Fallback: expand the output template for this video.
        return ydl.prepare_filename(info)
# Checkpoint identifiers for the image encoder and the multilingual text
# encoder. Both are loaded once at import time and shared by every request.
_CLIP_CHECKPOINT = 'ViT-B/32'
_TEXT_CHECKPOINT = 'M-CLIP/XLM-Roberta-Large-Vit-B-32'

# clip.load yields the image model together with its preprocessing callable.
clip_model, compose = clip.load(_CLIP_CHECKPOINT)

# The tokenizer and the text model come from the same checkpoint name.
tokenizer = transformers.AutoTokenizer.from_pretrained(_TEXT_CHECKPOINT)
text_model = pt_multilingual_clip.MultilingualCLIP.from_pretrained(
    _TEXT_CHECKPOINT
)
def search_video(video_url, text, video=None):
    """Gradio handler: find ``text`` inside a video and cut a clip around it.

    video_url: URL of a video to download; ignored when ``video`` is given
    text: the search query
    video: path of an uploaded video file, or None

    Returns ``(message, seg_path)``: a sentence stating the timestamp of the
    best match, and the path of the extracted video segment.

    Raises ValueError when neither a URL nor an uploaded video is supplied.
    """
    # Validate before doing any work so the user gets a clear message instead
    # of a failure deep inside the frame extraction.
    if video is None and not video_url:
        raise ValueError('Provide a video URL or upload a video file.')

    search = SearchVideo(
        clip_model=clip_model,
        text_model=text_model,
        tokenizer=tokenizer,
        compose=compose
    )
    # An uploaded file takes precedence; the URL is only used without one.
    if video is None:
        video = download_yt_video(video_url)

    ind, seg_path, _ = search(video, text)
    start = time.strftime('%H:%M:%S', time.gmtime(ind))
    return f'"{text}" found at {start}',  seg_path
# Text shown at the top of the Gradio page.
title = '🔎🎞️🚀 Search inside a video'
description = '''Just enter a search query, a video URL or upload your video and get a 5-sec fragment from the video which is visually closest to your query.'''

# One example row: video URL and text query (no uploaded file).
examples = [["https://www.youtube.com/watch?v=M93w3TjzVUE", "A dog"]]

# The input components map positionally onto
# search_video(video_url, text, video).
iface = gr.Interface(
    fn=search_video,
    inputs=[
        gr.Textbox(
            value="https://www.youtube.com/watch?v=M93w3TjzVUE",
            label='Video URL',
        ),
        gr.Textbox(value="a dog", label='Text query'),
        gr.Video(),
    ],
    outputs=[
        gr.Textbox(label="Output"),
        gr.Video(label="Video segment"),
    ],
    allow_flagging="never",
    title=title,
    description=description,
    examples=examples,
)

if __name__ == "__main__":
    # show_error=True surfaces exceptions raised by search_video in the UI.
    iface.launch(show_error=True)