import gradio as gr
import yt_dlp
import os
import time
import torch
from multilingual_clip import pt_multilingual_clip
import transformers
import clip
import numpy as np
import cv2
from PIL import Image
# download the multilingual text-encoder weights
os.system('cd /Multilingual-CLIP && bash get-weights.sh')
class SearchVideo:
    def __init__(
        self,
        clip_model,
        text_model,
        tokenizer,
        compose,
    ) -> None:
        """
        clip_model: CLIP model used for image embeddings
        text_model: multilingual text encoder
        tokenizer: tokenizer matching text_model
        compose: CLIP image preprocessing transform
        """
        self.text_model = text_model
        self.tokenizer = tokenizer
        self.clip_model = clip_model
        self.compose = compose
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
    def __call__(self, video: str, text: str) -> tuple:
        torch.cuda.empty_cache()
        frames = self.video2frames_ffmpeg(video)
        img_embs = self.get_img_embs(frames)
        txt_emb = self.get_txt_embs(text)
        # a single text embedding is scored against every frame embedding;
        # both are normalized in compare_embeddings, so no tiling is needed
        logits_per_image = self.compare_embeddings(img_embs, txt_emb)
        logits_per_image = [logit.numpy()[0] for logit in logits_per_image]
        # frames are sampled at 1 fps, so the best frame index doubles as a timestamp in seconds
        ind = np.argmax(logits_per_image)
        seg_path = self.extract_seg(video, ind)
        return ind, seg_path, frames[ind]
    def extract_seg(self, video: str, start: int):
        # start the clip 5 seconds before the matched frame, clamped at 0
        start = start - 5 if start > 5 else 0
        start = time.strftime('%H:%M:%S', time.gmtime(start))
        seg_path = f"segment_{start.replace(':', '-')}.mp4"
        cmd = f'ffmpeg -ss {start} -i "{video}" -t 00:00:05 -vcodec copy -acodec copy -y "{seg_path}"'
        os.system(cmd)
        return seg_path
    def video2frames_ffmpeg(self, video: str) -> list:
        frames_dir = 'frames'
        if not os.path.exists(frames_dir):
            os.makedirs(frames_dir)
        # extract one frame per second of video
        os.system(f'ffmpeg -i "{video}" -r 1 {frames_dir}/output-%04d.jpg')
        # force PIL to read the pixel data now, since the frame files are deleted below
        images = []
        for f in sorted(os.listdir(frames_dir)):
            img = Image.open(f'{frames_dir}/{f}')
            img.load()
            images.append(img)
        os.system(f'rm -rf {frames_dir}')
        return images
    def video2frames(self, video: str) -> list:
        """fallback frame extractor using OpenCV, sampling roughly one frame per second"""
        cap = cv2.VideoCapture(video)
        num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = int(cap.get(cv2.CAP_PROP_FPS)) or 24
        frames_sec = set(range(0, num_frames, fps))
        images = []
        frame_count = 0
        has_frames, image = cap.read()
        while has_frames:
            if frame_count in frames_sec:
                # OpenCV decodes to BGR; convert to RGB before handing to PIL
                images.append(Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)))
            has_frames, image = cap.read()
            frame_count += 1
        cap.release()
        return images
    def get_img_embs(self, img_list: list) -> list:
        """
        takes a list of images and computes CLIP embeddings with the model given by clip_model
        """
        img_input = torch.stack([self.compose(img).to(self.device)
                                 for img in img_list])
        with torch.no_grad():
            image_embs = self.clip_model.encode_image(img_input).float().cpu()
        return image_embs
    def get_txt_embs(self, text: str) -> torch.Tensor:
        "calculates the CLIP embedding for the text"
        with torch.no_grad():
            return self.text_model(text, self.tokenizer)
    def compare_embeddings(self, img_embs, txt_embs):
        # normalize features
        image_features = img_embs / img_embs.norm(dim=-1, keepdim=True)
        text_features = txt_embs / txt_embs.norm(dim=-1, keepdim=True)
        # cosine similarity as logits, one score per frame
        logits_per_image = []
        for image_feature in image_features:
            logits_per_image.append(image_feature @ text_features.t())
        return logits_per_image
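# Minimal usage sketch for SearchVideo (assumes the models loaded below and a
# local file 'clip.mp4'; the filename is hypothetical):
#
#   search = SearchVideo(clip_model, text_model, tokenizer, compose)
#   ind, seg_path, frame = search('clip.mp4', 'a dog')
#   print(f'best match at second {ind}, segment saved to {seg_path}')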
def download_yt_video(url):
    ydl_opts = {
        'quiet': True,
        "outtmpl": "%(id)s.%(ext)s",
        'format': 'bv*[height<=360][ext=mp4]+ba/b[height<=360] / wv*+ba/w'
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        # extract_info(download=True) downloads and returns the metadata, which is
        # more robust than parsing the video id out of the URL string
        info = ydl.extract_info(url, download=True)
    return f"{info['id']}.mp4"
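# Example (using the demo URL from the examples below):
#   path = download_yt_video('https://www.youtube.com/watch?v=M93w3TjzVUE')
#   # -> 'M93w3TjzVUE.mp4'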
clip_model = 'ViT-B/32'
text_model = 'M-CLIP/XLM-Roberta-Large-Vit-B-32'
clip_model, compose = clip.load(clip_model)
tokenizer = transformers.AutoTokenizer.from_pretrained(text_model)
text_model = pt_multilingual_clip.MultilingualCLIP.from_pretrained(text_model)
def search_video(video_url, text, video=None):
    search = SearchVideo(
        clip_model=clip_model,
        text_model=text_model,
        tokenizer=tokenizer,
        compose=compose
    )
    # an uploaded video takes precedence over the URL field
    if video is not None:
        video_url = None
    if video_url:
        video = download_yt_video(video_url)
    ind, seg_path, _ = search(video, text)
    start = time.strftime('%H:%M:%S', time.gmtime(ind))
    return f'"{text}" found at {start}', seg_path
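# Smoke test outside Gradio (a sketch; assumes network access for the download):
#   msg, seg = search_video('https://www.youtube.com/watch?v=M93w3TjzVUE', 'a dog')
#   print(msg, seg)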
title = '🔍🎞️ Search inside a video'
description = '''Just enter a search query and a video URL, or upload your own video, and get a 5-second fragment from the video that is visually closest to your query.'''
examples = [["https://www.youtube.com/watch?v=M93w3TjzVUE", "A dog"]]
iface = gr.Interface(
    search_video,
    inputs=[
        gr.Textbox(value="https://www.youtube.com/watch?v=M93w3TjzVUE", label='Video URL'),
        gr.Textbox(value="a dog", label='Text query'),
        gr.Video()
    ],
    outputs=[gr.Textbox(label="Output"), gr.Video(label="Video segment")],
    allow_flagging="never",
    title=title,
    description=description,
    examples=examples
)
if __name__ == "__main__":
    iface.launch(show_error=True)