Abhilashvj committed on
Commit f279bf5
1 Parent(s): db13e81

Update app.py

Files changed (1)
  1. app.py +164 -149
app.py CHANGED
@@ -1,162 +1,177 @@
  import streamlit as st
- import torch
- from PIL import Image
- import face_recognition
  import faiss
  from sentence_transformers import SentenceTransformer
- from transformers import pipeline
  import cv2
- import numpy as np
- import subprocess
- import tempfile
- import os
- import yt_dlp
- from moviepy.editor import VideoFileClip
-
- # Helper functions
- def get_video_id(url):
-     return url.split("v=")[1].split("&")[0]
-
- def download_youtube_video(url, output_path):
-     ydl_opts = {
-         'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
-         'outtmpl': os.path.join(output_path, '%(id)s.%(ext)s'),
-     }
-     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-         info = ydl.extract_info(url, download=True)
-         filename = ydl.prepare_filename(info)
-     return filename
-
- def process_video(video_url, output_dir, video_id):
-     # Placeholder for video processing logic
-     # This should include face detection, object detection, transcription, etc.
-     # For now, we'll just download the video
-     video_path = download_youtube_video(video_url, output_dir)
-
-     # Extract frames (simplified version)
-     video = cv2.VideoCapture(video_path)
-     fps = video.get(cv2.CAP_PROP_FPS)
-     frame_count = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
-     duration = frame_count / fps
-
-     frames = []
-     frame_times = []
-     for i in range(0, frame_count, int(fps)): # Extract one frame per second
-         video.set(cv2.CAP_PROP_POS_FRAMES, i)
-         ret, frame = video.read()
-         if ret:
-             frames.append(frame)
-             frame_times.append(i / fps)
-
-     video.release()
-
-     return {
-         'video_path': video_path,
-         'frames': frames,
-         'frame_times': frame_times,
-         'duration': duration,
-         'fps': fps
-     }
-
- def search(query, index_path, metadata_path, model):
-     # Placeholder for search functionality
-     # This should use FAISS for efficient similarity search
-     return [], []

  # Load models
  @st.cache_resource
  def load_models():
-     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-     clip_model, preprocess = torch.hub.load('openai/CLIP', 'clip_vit_b32', device=device)
-     text_model = SentenceTransformer("all-MiniLM-L6-v2").to(device)
-     qa_model = pipeline("question-answering", model="distilbert-base-cased-distilled-squad", device=0 if torch.cuda.is_available() else -1)
-     return clip_model, preprocess, text_model, qa_model

- clip_model, preprocess, text_model, qa_model = load_models()

  # Streamlit UI
- st.title("Enhanced YouTube Video Analysis")

- video_url = st.text_input("Enter YouTube Video URL")
- if st.button("Analyze"):
-     with st.spinner("Processing video..."):
-         video_id = get_video_id(video_url)
-         results = process_video(video_url, "output_dir", video_id)

-     if results:
-         st.success("Video processed successfully!")
-
-         # Text search and question answering
-         st.subheader("Text Search and Q&A")
-         query = st.text_input("Enter a search query or question")
-         if query:
-             # Placeholder for text search and QA
-             st.write("Text search and QA functionality to be implemented")
-
-         # Image upload and similarity search
-         st.subheader("Image Search")
-         uploaded_image = st.file_uploader("Upload an image to find similar frames", type=["jpg", "jpeg", "png"])
-         if uploaded_image:
-             # Placeholder for image search
-             st.write("Image search functionality to be implemented")
-
-         # Face upload and recognition
-         st.subheader("Face Search")
-         uploaded_face = st.file_uploader("Upload a face image to find appearances", type=["jpg", "jpeg", "png"])
-         if uploaded_face:
-             face_image = face_recognition.load_image_file(uploaded_face)
-             face_encoding = face_recognition.face_encodings(face_image)[0]
-
-             face_appearances = []
-             face_frames = []
-
-             for i, frame in enumerate(results['frames']):
-                 face_locations = face_recognition.face_locations(frame)
-                 face_encodings = face_recognition.face_encodings(frame, face_locations)
-
-                 for encoding in face_encodings:
-                     if face_recognition.compare_faces([face_encoding], encoding)[0]:
-                         face_appearances.append(results['frame_times'][i])
-                         face_frames.append(frame)
-
-             st.write(f"Face appearances found at {len(face_appearances)} timestamps.")
-
-             if face_frames:
-                 # Create a temporary directory to store frames
-                 with tempfile.TemporaryDirectory() as temp_dir:
-                     # Save frames as images
-                     for i, frame in enumerate(face_frames):
-                         cv2.imwrite(os.path.join(temp_dir, f"frame_{i:04d}.jpg"), frame)
-
-                     # Use FFmpeg to create a video from the frames
-                     output_video = "face_appearances.mp4"
-                     ffmpeg_command = [
-                         "ffmpeg",
-                         "-framerate", str(results['fps']),
-                         "-i", os.path.join(temp_dir, "frame_%04d.jpg"),
-                         "-c:v", "libx264",
-                         "-pix_fmt", "yuv420p",
-                         output_video
-                     ]
-                     subprocess.run(ffmpeg_command, check=True)
-
-                     # Display the generated video
-                     st.video(output_video)
-
-                     # Provide download link for the video
-                     with open(output_video, "rb") as file:
-                         btn = st.download_button(
-                             label="Download Face Appearances Video",
-                             data=file,
-                             file_name="face_appearances.mp4",
-                             mime="video/mp4"
-                         )
-             else:
-                 st.write("No frames with the uploaded face were found in the video.")
-
-         # Display original video
-         st.subheader("Original Video")
-         st.video(results['video_path'])
-
- else:
-     st.warning("Please enter a valid YouTube URL and click 'Analyze'")
 
  import streamlit as st
+ import json
  import faiss
+ import numpy as np
  from sentence_transformers import SentenceTransformer
+ import base64
+ from PIL import Image
+ import io
  import cv2
+ from insightface.app import FaceAnalysis

  # Load models
  @st.cache_resource
  def load_models():
+     text_model = SentenceTransformer("all-MiniLM-L6-v2")
+     image_model = SentenceTransformer("clip-ViT-B-32")
+     face_app = FaceAnalysis(providers=['CPUExecutionProvider'])
+     face_app.prepare(ctx_id=0, det_size=(640, 640))
+     return text_model, image_model, face_app
+
+ text_model, image_model, face_app = load_models()
+
+ # Load data
+ @st.cache_data
+ def load_data(video_id):
+     with open(f"{video_id}_summary.json", "r") as f:
+         summary = json.load(f)
+     with open(f"{video_id}_transcription.json", "r") as f:
+         transcription = json.load(f)
+     with open(f"{video_id}_text_metadata.json", "r") as f:
+         text_metadata = json.load(f)
+     with open(f"{video_id}_image_metadata.json", "r") as f:
+         image_metadata = json.load(f)
+     with open(f"{video_id}_object_infos.json", "r") as f:
+         object_infos = json.load(f)
+     with open(f"{video_id}_face_metadata.json", "r") as f:
+         face_metadata = json.load(f)
+     return summary, transcription, text_metadata, image_metadata, object_infos, face_metadata
+
+ video_id = "IMFUOexuEXw"
+ summary, transcription, text_metadata, image_metadata, object_infos, face_metadata = load_data(video_id)
+
+ # Load FAISS indexes
+ @st.cache_resource
+ def load_indexes(video_id):
+     text_index = faiss.read_index(f"{video_id}_text_index.faiss")
+     image_index = faiss.read_index(f"{video_id}_image_index.faiss")
+     face_index = faiss.read_index(f"{video_id}_face_index.faiss")
+     return text_index, image_index, face_index

+ text_index, image_index, face_index = load_indexes(video_id)
+
+ # Search functions
+ def text_search(query, index, metadata, model, n_results=5):
+     query_vector = model.encode([query], convert_to_tensor=True).cpu().numpy()
+     D, I = index.search(query_vector, n_results)
+     results = [metadata[i] for i in I[0]]
+     return results, D[0]
+
+ def image_search(image, index, metadata, model, n_results=5):
+     image_vector = model.encode(image, convert_to_tensor=True).cpu().numpy()
+     D, I = index.search(image_vector.reshape(1, -1), n_results)
+     results = [metadata[i] for i in I[0]]
+     return results, D[0]
+
+ def face_search(face_embedding, index, metadata, n_results=5):
+     D, I = index.search(np.array(face_embedding).reshape(1, -1), n_results)
+     results = [metadata[i] for i in I[0]]
+     return results, D[0]
+
+ def detect_and_embed_face(image, face_app):
+     img_array = np.array(image)
+     faces = face_app.get(img_array)
+     if len(faces) == 0:
+         return None
+     largest_face = max(faces, key=lambda x: (x.bbox[2] - x.bbox[0]) * (x.bbox[3] - x.bbox[1]))
+     return largest_face.embedding

  # Streamlit UI
+ st.title("Video Analysis Dashboard")
+
+ # Display video summary
+ st.header("Video Summary")
+ st.subheader("Prominent Faces")
+ for face in summary['prominent_faces']:
+     st.write(f"Face ID: {face['id']}, Appearances: {face['appearances']}, First Appearance: {face['first_appearance']:.2f}s")
+     if 'thumbnail' in face:
+         image = Image.open(io.BytesIO(base64.b64decode(face['thumbnail'])))
+         st.image(image, caption=f"Face ID: {face['id']}", width=100)
+
+ st.subheader("Prominent Objects")
+ for obj in summary['prominent_objects']:
+     st.write(f"Object ID: {obj['id']}, Appearances: {obj['appearances']}, Representative Frame: {obj['representative_frame']:.2f}s")
+
+ st.subheader("Themes")
+ for theme in summary['themes']:
+     st.write(f"Theme ID: {theme['id']}, Keywords: {', '.join(theme['keywords'])}")

+ # Search functionality
+ st.header("Search")
+
+ search_type = st.selectbox("Select search type", ["Text", "Face", "Image"])
+
+ if search_type == "Text":
+     query = st.text_input("Enter your search query")
+     search_target = st.multiselect("Search in", ["Transcript", "Frames"], default=["Transcript"])
+     if st.button("Search"):
+         if "Transcript" in search_target:
+             text_results, text_distances = text_search(query, text_index, text_metadata, text_model)
+             st.subheader("Transcript Search Results")
+             for result, distance in zip(text_results, text_distances):
+                 st.write(f"Time: {result['start']:.2f}s - {result['end']:.2f}s, Distance: {distance:.4f}")
+                 st.write(f"Text: {result['text']}")
+                 st.write("---")
+         if "Frames" in search_target:
+             frame_results, frame_distances = text_search(query, image_index, image_metadata, image_model)
+             st.subheader("Frame Search Results")
+             for result, distance in zip(frame_results, frame_distances):
+                 st.write(f"Time: {result['start']:.2f}s - {result['end']:.2f}s, Distance: {distance:.4f}")
+                 st.write("---")
+
+ elif search_type == "Face":
+     face_search_type = st.radio("Choose face search method", ["Select from video", "Upload image"])

+     if face_search_type == "Select from video":
+         face_id = st.selectbox("Select a face", [face['id'] for face in summary['prominent_faces']])
+         if st.button("Search"):
+             selected_face = next(face for face in summary['prominent_faces'] if face['id'] == face_id)
+             face_results, face_distances = face_search(selected_face['embedding'], face_index, face_metadata)
+             st.subheader("Face Search Results")
+             for result, distance in zip(face_results, face_distances):
+                 st.write(f"Time: {result['start']:.2f}s - {result['end']:.2f}s, Distance: {distance:.4f}")
+                 st.write(f"Face ID: {result['face_id']}")
+                 st.write("---")
+     else:
+         uploaded_file = st.file_uploader("Choose a face image...", type=["jpg", "jpeg", "png"])
+         if uploaded_file is not None:
+             image = Image.open(uploaded_file)
+             st.image(image, caption="Uploaded Image", use_column_width=True)
+             if st.button("Search"):
+                 face_embedding = detect_and_embed_face(image, face_app)
+                 if face_embedding is not None:
+                     face_results, face_distances = face_search(face_embedding, face_index, face_metadata)
+                     st.subheader("Face Search Results")
+                     for result, distance in zip(face_results, face_distances):
+                         st.write(f"Time: {result['start']:.2f}s - {result['end']:.2f}s, Distance: {distance:.4f}")
+                         st.write(f"Face ID: {result['face_id']}")
+                         st.write("---")
+                 else:
+                     st.error("No face detected in the uploaded image. Please try another image.")
+
+ elif search_type == "Image":
+     image_search_type = st.radio("Choose image search method", ["Upload image", "Text description"])
+
+     if image_search_type == "Upload image":
+         uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])
+         if uploaded_file is not None:
+             image = Image.open(uploaded_file)
+             st.image(image, caption="Uploaded Image", use_column_width=True)
+             if st.button("Search"):
+                 image_results, image_distances = image_search(image, image_index, image_metadata, image_model)
+                 st.subheader("Image Search Results")
+                 for result, distance in zip(image_results, image_distances):
+                     st.write(f"Time: {result['start']:.2f}s - {result['end']:.2f}s, Distance: {distance:.4f}")
+                     st.write("---")
+     else:
+         text_query = st.text_input("Enter a description of the image you're looking for")
+         if st.button("Search"):
+             image_results, image_distances = text_search(text_query, image_index, image_metadata, image_model)
+             st.subheader("Image Search Results")
+             for result, distance in zip(image_results, image_distances):
+                 st.write(f"Time: {result['start']:.2f}s - {result['end']:.2f}s, Distance: {distance:.4f}")
+                 st.write("---")
+
+ # Display transcription
+ st.header("Video Transcription")
+ st.write(transcription['transcription'])
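
Note: the updated app.py only reads precomputed, per-video artifacts ({video_id}_*.json metadata plus {video_id}_*_index.faiss indexes); nothing in this commit produces them. A minimal offline sketch of how the text and frame indexes might be built so that load_data()/load_indexes() find matching files. The helper names, the segments/frames inputs, and the per-frame metadata fields below are assumptions for illustration, not part of the commit:

```python
# Offline preparation sketch (not part of this commit): build the metadata JSON
# and FAISS indexes that load_data() / load_indexes() expect on disk.
# Assumptions: `segments` is a Whisper-style list of {"start", "end", "text"} dicts
# and `frames` is a list of (timestamp_seconds, PIL.Image) pairs from an earlier
# processing step that is not shown here.
import json

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer


def build_text_index(video_id, segments):
    model = SentenceTransformer("all-MiniLM-L6-v2")        # same model the app loads
    vectors = np.asarray(model.encode([s["text"] for s in segments]), dtype="float32")
    index = faiss.IndexFlatL2(vectors.shape[1])            # exhaustive L2 index
    index.add(vectors)
    faiss.write_index(index, f"{video_id}_text_index.faiss")
    with open(f"{video_id}_text_metadata.json", "w") as f:
        json.dump(segments, f)                             # metadata[i] aligns with index row i


def build_image_index(video_id, frames):
    model = SentenceTransformer("clip-ViT-B-32")           # same CLIP model the app loads
    vectors = np.asarray(model.encode([img for _, img in frames]), dtype="float32")
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    faiss.write_index(index, f"{video_id}_image_index.faiss")
    # Assumed per-frame metadata shape: the UI only reads result['start'] / result['end'].
    metadata = [{"start": t, "end": t} for t, _ in frames]
    with open(f"{video_id}_image_metadata.json", "w") as f:
        json.dump(metadata, f)
```

Built this way, the "Distance" values the UI prints are L2 distances from IndexFlatL2, so smaller numbers mean closer matches; if the real indexes use inner product or normalized vectors instead, that readout would need to be interpreted accordingly. A {video_id}_face_index.faiss could be produced analogously from the insightface embeddings that detect_and_embed_face computes at query time.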