# video-search / app.py
import os
import subprocess
import tempfile

import clip  # openai-clip package: pip install git+https://github.com/openai/CLIP.git
import cv2
import face_recognition
import faiss
import numpy as np
import streamlit as st
import torch
import yt_dlp
from PIL import Image
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# Helper functions
def get_video_id(url):
    # Works for standard watch URLs (https://www.youtube.com/watch?v=...);
    # youtu.be short links are not handled here.
    return url.split("v=")[1].split("&")[0]

def download_youtube_video(url, output_path):
    ydl_opts = {
        'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
        'outtmpl': os.path.join(output_path, '%(id)s.%(ext)s'),
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=True)
        filename = ydl.prepare_filename(info)
        return filename

def process_video(video_url, output_dir, video_id):
    # Placeholder for video processing logic.
    # This should include face detection, object detection, transcription, etc.
    # For now, we just download the video and sample frames.
    video_path = download_youtube_video(video_url, output_dir)

    # Extract frames (simplified version)
    video = cv2.VideoCapture(video_path)
    fps = video.get(cv2.CAP_PROP_FPS)
    frame_count = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    duration = frame_count / fps

    frames = []
    frame_times = []
    for i in range(0, frame_count, int(fps)):  # Extract one frame per second
        video.set(cv2.CAP_PROP_POS_FRAMES, i)
        ret, frame = video.read()
        if ret:
            frames.append(frame)
            frame_times.append(i / fps)
    video.release()

    return {
        'video_path': video_path,
        'frames': frames,
        'frame_times': frame_times,
        'duration': duration,
        'fps': fps,
    }
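
# The following helper is a hypothetical sketch, not part of the original app:
# one plausible way to embed the sampled frames with CLIP and index them with
# FAISS, as the placeholder comments above and below suggest. It assumes the
# clip_model and preprocess objects returned by load_models() further down.
def build_frame_index(frames, clip_model, preprocess, device):
    embeddings = []
    for frame in frames:
        # OpenCV frames are BGR; CLIP's preprocess expects an RGB PIL image.
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        with torch.no_grad():
            emb = clip_model.encode_image(preprocess(image).unsqueeze(0).to(device))
        embeddings.append(emb.float().cpu().numpy())
    embeddings = np.concatenate(embeddings, axis=0)
    # L2-normalize so inner-product search is equivalent to cosine similarity.
    embeddings /= np.linalg.norm(embeddings, axis=1, keepdims=True)
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)
    return index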

def search(query, index_path, metadata_path, model):
    # Placeholder for search functionality.
    # This should use FAISS for efficient similarity search.
    return [], []
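
# Also a hypothetical sketch: a CLIP text query against the index built by
# build_frame_index() above, returning (timestamp, score) pairs. This is one
# way the search() placeholder could be filled in, not the app's actual logic.
def search_frames(query, index, frame_times, clip_model, device, k=5):
    with torch.no_grad():
        text_emb = clip_model.encode_text(clip.tokenize([query]).to(device)).float()
    text_emb /= text_emb.norm(dim=-1, keepdim=True)
    scores, ids = index.search(text_emb.cpu().numpy(), k)
    return [(frame_times[i], float(s)) for i, s in zip(ids[0], scores[0])]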

# Load models
@st.cache_resource
def load_models():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Load CLIP through the openai-clip package; torch.hub does not expose
    # a 'clip_vit_b32' entry point for openai/CLIP.
    clip_model, preprocess = clip.load("ViT-B/32", device=device)
    text_model = SentenceTransformer("all-MiniLM-L6-v2").to(device)
    qa_model = pipeline(
        "question-answering",
        model="distilbert-base-cased-distilled-squad",
        device=0 if torch.cuda.is_available() else -1,
    )
    return clip_model, preprocess, text_model, qa_model

clip_model, preprocess, text_model, qa_model = load_models()

# Streamlit UI
st.title("Enhanced YouTube Video Analysis")

video_url = st.text_input("Enter YouTube Video URL")

if st.button("Analyze") and video_url:
    with st.spinner("Processing video..."):
        video_id = get_video_id(video_url)
        # Keep the results in session state so they survive the Streamlit
        # reruns triggered by the search/upload widgets below.
        st.session_state['results'] = process_video(video_url, "output_dir", video_id)

results = st.session_state.get('results')
if results:
    st.success("Video processed successfully!")

    # Text search and question answering
    st.subheader("Text Search and Q&A")
    query = st.text_input("Enter a search query or question")
    if query:
        # Placeholder for text search and QA
        st.write("Text search and QA functionality to be implemented")

    # Image upload and similarity search
    st.subheader("Image Search")
    uploaded_image = st.file_uploader("Upload an image to find similar frames", type=["jpg", "jpeg", "png"])
    if uploaded_image:
        # Placeholder for image search
        st.write("Image search functionality to be implemented")

    # Face upload and recognition
    st.subheader("Face Search")
    uploaded_face = st.file_uploader("Upload a face image to find appearances", type=["jpg", "jpeg", "png"])
    if uploaded_face:
        face_image = face_recognition.load_image_file(uploaded_face)
        encodings = face_recognition.face_encodings(face_image)
        if not encodings:
            st.error("No face was found in the uploaded image.")
            st.stop()
        face_encoding = encodings[0]

        face_appearances = []
        face_frames = []
        for i, frame in enumerate(results['frames']):
            # face_recognition expects RGB input; OpenCV frames are BGR.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            face_locations = face_recognition.face_locations(rgb_frame)
            face_encodings = face_recognition.face_encodings(rgb_frame, face_locations)
            for encoding in face_encodings:
                if face_recognition.compare_faces([face_encoding], encoding)[0]:
                    face_appearances.append(results['frame_times'][i])
                    face_frames.append(frame)
                    break  # Record each frame at most once.
        st.write(f"Face appearances found at {len(face_appearances)} timestamps.")

        if face_frames:
            # Create a temporary directory to store frames
            with tempfile.TemporaryDirectory() as temp_dir:
                # Save frames as images
                for i, frame in enumerate(face_frames):
                    cv2.imwrite(os.path.join(temp_dir, f"frame_{i:04d}.jpg"), frame)

                # Use FFmpeg to create a video from the frames
                output_video = "face_appearances.mp4"
                ffmpeg_command = [
                    "ffmpeg",
                    "-y",  # Overwrite the output file if it already exists
                    "-framerate", str(results['fps']),
                    "-i", os.path.join(temp_dir, "frame_%04d.jpg"),
                    "-c:v", "libx264",
                    "-pix_fmt", "yuv420p",
                    output_video,
                ]
                subprocess.run(ffmpeg_command, check=True)

            # Display the generated video
            st.video(output_video)

            # Provide a download link for the video
            with open(output_video, "rb") as file:
                st.download_button(
                    label="Download Face Appearances Video",
                    data=file,
                    file_name="face_appearances.mp4",
                    mime="video/mp4",
                )
        else:
            st.write("No frames with the uploaded face were found in the video.")

    # Display original video
    st.subheader("Original Video")
    st.video(results['video_path'])
else:
    st.warning("Please enter a valid YouTube URL and click 'Analyze'")