Spaces:

marianna13
/

search-inside-a-video

Runtime error

File size: 5,677 Bytes

0edd243

import os
import random
import subprocess
import tempfile
import time

import clip
import cv2
import gradio as gr
import numpy as np
import torch
import transformers
import yt_dlp
from multilingual_clip import pt_multilingual_clip
from PIL import Image

# Import-time attempt to run the Multilingual-CLIP weight download script.
# NOTE(review): `%cd` is IPython/Jupyter magic syntax, not a shell command.
# os.system() hands this string to the shell, where `%cd` presumably fails and
# the `&&` then skips get-weights.sh entirely. The exit status is ignored, so
# the failure is silent — confirm whether this step is still needed and, if it
# is, replace `%cd` with `cd`.
os.system('%cd /Multilingual-CLIP && bash get-weights.sh')

class SearchVideo:
    """Locate the moment of a video that best matches a text query.

    The video is sampled at one frame per second, every frame is embedded
    with the image encoder, the query is embedded with the text encoder, and
    the frame with the highest cosine similarity wins.
    """

    def __init__(
            self,
            clip_model,
            text_model,
            tokenizer,
            compose,
            ) -> None:
        """
        clip_model: image encoder; must provide ``encode_image(batch)``
        text_model: text encoder; called as ``text_model(text, tokenizer)``
        tokenizer: tokenizer handed to ``text_model`` on every call
        compose: per-image preprocessing callable returning a tensor
        """
        self.text_model = text_model
        self.tokenizer = tokenizer
        self.clip_model = clip_model
        self.compose = compose
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def __call__(self, video: str, text: str) -> tuple:
        """Search ``video`` for the frame closest to ``text``.

        Returns a tuple ``(ind, seg_path, frame)``: the index of the best
        frame (frames are sampled once per second, so it doubles as the
        offset in seconds), the path of the extracted clip and the frame.

        Raises RuntimeError when no frame could be extracted.
        """
        torch.cuda.empty_cache()
        frames = self.video2frames_ffmpeg(video)
        if not frames:
            # Fail with a readable message instead of an opaque torch.stack
            # error further down.
            raise RuntimeError(f'no frames could be extracted from {video!r}')

        img_embs = self.get_img_embs(frames)
        # The text embedding is used as-is: compare_embeddings() normalises
        # it, so scaling it by the number of frames had no effect.
        txt_emb = self.get_txt_embs(text)

        logits_per_image = self.compare_embeddings(img_embs, txt_emb)
        scores = [logit.numpy()[0] for logit in logits_per_image]
        ind = int(np.argmax(scores))
        seg_path = self.extract_seg(video, ind)
        return ind, seg_path, frames[ind]

    @staticmethod
    def _format_timestamp(seconds) -> str:
        """Render an offset in seconds as HH:MM:SS, clamping negatives to 0."""
        return time.strftime('%H:%M:%S', time.gmtime(max(int(seconds), 0)))

    @staticmethod
    def _run_ffmpeg(args: list) -> int:
        """Run ffmpeg with an argument list (no shell) and return its exit code."""
        try:
            return subprocess.run(['ffmpeg', *args], check=False).returncode
        except FileNotFoundError as err:
            raise RuntimeError('ffmpeg executable not found on PATH') from err

    def extract_seg(self, video: str, start: int):
        """Copy a 5-second clip of ``video`` beginning at ``start`` seconds.

        Returns the path of the written file, ``segment_<HH:MM:SS>.mp4``.
        The ffmpeg exit status is not checked, so the path is returned even
        if ffmpeg failed.
        """
        # Offsets of 5 s or less used to be shifted below zero, which gmtime
        # rendered as 23:59:5x; the offset is now only clamped at zero.
        stamp = self._format_timestamp(start)
        seg_path = f'segment_{stamp}.mp4'
        self._run_ffmpeg([
            '-ss', stamp,
            '-i', str(video),
            '-t', '00:00:05',
            '-vcodec', 'copy',
            '-acodec', 'copy',
            '-y', seg_path,
        ])
        return seg_path

    def video2frames_ffmpeg(self, video: str) -> list:
        """Sample ``video`` at one frame per second using ffmpeg.

        Returns the frames as PIL images in chronological order; the list is
        empty when ffmpeg produced nothing.
        """
        # A private temporary directory keeps concurrent requests and
        # leftovers of an earlier run from mixing their frames.
        with tempfile.TemporaryDirectory(prefix='frames_') as frames_dir:
            # Six digits keep the lexicographic sort below chronological
            # well past 10,000 frames.
            pattern = os.path.join(frames_dir, 'output-%06d.jpg')
            self._run_ffmpeg(['-i', str(video), '-r', '1', pattern])

            images = []
            for name in sorted(os.listdir(frames_dir)):
                # copy() loads the pixel data so the file handle can be
                # closed before the directory is removed.
                with Image.open(os.path.join(frames_dir, name)) as img:
                    images.append(img.copy())
        return images

    def video2frames(self, video: str) -> list:
        """Sample ``video`` at roughly one frame per second using OpenCV.

        The stride is the frame rate reported by the container, falling back
        to 24 when none is reported. Frame 0 is included. Frames are returned
        as RGB PIL images.
        """
        cap = cv2.VideoCapture(video)
        try:
            fps = cap.get(cv2.CAP_PROP_FPS)
            step = max(int(round(fps)), 1) if fps and fps > 0 else 24
            images = []
            frame_index = 0
            while True:
                has_frame, frame = cap.read()
                if not has_frame:
                    break
                if frame_index % step == 0:
                    # OpenCV decodes to BGR; PIL expects RGB.
                    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    images.append(Image.fromarray(rgb))
                frame_index += 1
        finally:
            cap.release()
        return images

    def get_img_embs(self, img_list: list, batch_size: int = 64) -> torch.Tensor:
        """
        takes a list of images and calculates their embeddings with clip_model

        Images are encoded ``batch_size`` at a time so that memory use does
        not grow with the length of the video. The result is one CPU float
        tensor with one row per image.

        Raises RuntimeError when ``img_list`` is empty.
        """
        if not img_list:
            raise RuntimeError('get_img_embs() needs at least one image')

        chunks = []
        with torch.no_grad():
            for begin in range(0, len(img_list), batch_size):
                img_input = torch.stack([
                    self.compose(img).to(self.device)
                    for img in img_list[begin:begin + batch_size]
                ])
                chunks.append(
                    self.clip_model.encode_image(img_input).float().cpu())
        return torch.cat(chunks)

    def get_txt_embs(self, text: str) -> torch.Tensor:
        "calculates the embedding of the text with text_model"
        with torch.no_grad():
            return self.text_model(text, self.tokenizer)

    def compare_embeddings(self, img_embs, txt_embs):
        """Return, per image, its cosine similarity to every text embedding.

        Both inputs are L2-normalised along the last dimension first. The
        result is a list with one tensor per row of ``img_embs``.
        """
        image_features = img_embs / img_embs.norm(dim=-1, keepdim=True)
        text_features = txt_embs / txt_embs.norm(dim=-1, keepdim=True)

        # cosine similarity as logits
        return [image_feature @ text_features.t()
                for image_feature in image_features]

def download_yt_video(url):
    """Download the video at ``url`` with yt-dlp and return the local file path.

    The path is taken from what yt-dlp reports it wrote rather than rebuilt
    from the URL text, so URLs with extra query parameters, short links and
    downloads that do not end up as ``.mp4`` still resolve to the real file.
    """
    ydl_opts = {
        'quiet': True,
        "outtmpl": "%(id)s.%(ext)s",
        'format': 'bv*[height<=360][ext=mp4]+ba/b[height<=360] / wv*+ba/w',
        # A single path is returned, so never expand a watch URL that also
        # carries a playlist id into the whole playlist.
        'noplaylist': True,
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=True)
        # After a download, requested_downloads holds the final (post-merge)
        # file; prepare_filename() is the fallback when it is absent.
        downloads = info.get('requested_downloads') or []
        if downloads and downloads[0].get('filepath'):
            return downloads[0]['filepath']
        return ydl.prepare_filename(info)


# Checkpoint identifiers for the image encoder and the multilingual text encoder.
_CLIP_CHECKPOINT = 'ViT-B/32'
_TEXT_CHECKPOINT = 'M-CLIP/XLM-Roberta-Large-Vit-B-32'

# Loaded once at import time and shared by every search_video() call.
# clip.load() yields the image model together with its preprocessing callable.
clip_model, compose = clip.load(_CLIP_CHECKPOINT)
# The tokenizer and the text encoder are both loaded from the same checkpoint id.
tokenizer = transformers.AutoTokenizer.from_pretrained(_TEXT_CHECKPOINT)
text_model = pt_multilingual_clip.MultilingualCLIP.from_pretrained(_TEXT_CHECKPOINT)

def search_video(video_url, text, video=None):
    """Gradio handler: find ``text`` inside an uploaded or downloaded video.

    video_url: URL to download; ignored when ``video`` is given
    text: the search query
    video: path of an uploaded video, or None

    Returns a message naming the matching timestamp and the path of the
    extracted 5-second segment.

    Raises ValueError when neither an upload nor a URL is supplied.
    """
    # An uploaded file takes precedence over the URL field.
    if video is None:
        if not video_url:
            raise ValueError('Please provide a video URL or upload a video.')
        video = download_yt_video(video_url)

    search = SearchVideo(
        clip_model=clip_model,
        text_model=text_model,
        tokenizer=tokenizer,
        compose=compose
    )
    ind, seg_path, _ = search(video, text)
    start = time.strftime('%H:%M:%S', time.gmtime(ind))
    return f'"{text}" found at {start}',  seg_path

# Static texts shown on the Gradio page.
title = "🔎🎞️🚀 Search inside a video"
# NOTE(review): "you query" looks like a typo for "your query"; left as-is
# because the string is user-facing runtime text.
description = (
    'Just enter a search query, a video URL or upload your video and get a '
    '5-sec fragment from the video which is visually closest to you query.'
)

# One sample row: a video URL followed by a text query.
examples = [
    [
        "https://www.youtube.com/watch?v=M93w3TjzVUE",
        "A dog",
    ],
]

# Input widgets, in the positional order of search_video(video_url, text, video).
_inputs = [
    gr.Textbox(value="https://www.youtube.com/watch?v=M93w3TjzVUE", label='Video URL'),
    gr.Textbox(value="a dog", label='Text query'),
    gr.Video(),
]
# Output widgets, matching the (message, segment path) pair search_video returns.
_outputs = [
    gr.Textbox(label="Output"),
    gr.Video(label="Video segment"),
]

iface = gr.Interface(
    fn=search_video,
    inputs=_inputs,
    outputs=_outputs,
    allow_flagging="never",
    title=title,
    description=description,
    examples=examples,
)

# Start the web UI only when executed as a script; show_error is enabled on launch.
if __name__ == "__main__":
    iface.launch(show_error=True)