Abhilashvj committed
Commit 4a3a5b5
Parent: f279bf5

Update app.py

Files changed (1):
  1. app.py +83 -89

app.py CHANGED
@@ -8,17 +8,17 @@ from PIL import Image
 import io
 import cv2
 from insightface.app import FaceAnalysis
+from moviepy.editor import VideoFileClip
 
 # Load models
 @st.cache_resource
 def load_models():
-    text_model = SentenceTransformer("all-MiniLM-L6-v2")
-    image_model = SentenceTransformer("clip-ViT-B-32")
+    unified_model = SentenceTransformer("clip-ViT-B-32")
     face_app = FaceAnalysis(providers=['CPUExecutionProvider'])
     face_app.prepare(ctx_id=0, det_size=(640, 640))
-    return text_model, image_model, face_app
+    return unified_model, face_app
 
-text_model, image_model, face_app = load_models()
+unified_model, face_app = load_models()
 
 # Load data
 @st.cache_data
@@ -27,41 +27,35 @@ def load_data(video_id):
         summary = json.load(f)
     with open(f"{video_id}_transcription.json", "r") as f:
         transcription = json.load(f)
-    with open(f"{video_id}_text_metadata.json", "r") as f:
-        text_metadata = json.load(f)
-    with open(f"{video_id}_image_metadata.json", "r") as f:
-        image_metadata = json.load(f)
-    with open(f"{video_id}_object_infos.json", "r") as f:
-        object_infos = json.load(f)
+    with open(f"{video_id}_unified_metadata.json", "r") as f:
+        unified_metadata = json.load(f)
     with open(f"{video_id}_face_metadata.json", "r") as f:
         face_metadata = json.load(f)
-    return summary, transcription, text_metadata, image_metadata, object_infos, face_metadata
+    return summary, transcription, unified_metadata, face_metadata
 
 video_id = "IMFUOexuEXw"
-summary, transcription, text_metadata, image_metadata, object_infos, face_metadata = load_data(video_id)
+video_path = f"{video_id}.mp4"
+summary, transcription, unified_metadata, face_metadata = load_data(video_id)
 
 # Load FAISS indexes
 @st.cache_resource
 def load_indexes(video_id):
-    text_index = faiss.read_index(f"{video_id}_text_index.faiss")
-    image_index = faiss.read_index(f"{video_id}_image_index.faiss")
+    unified_index = faiss.read_index(f"{video_id}_unified_index.faiss")
     face_index = faiss.read_index(f"{video_id}_face_index.faiss")
-    return text_index, image_index, face_index
+    return unified_index, face_index
 
-text_index, image_index, face_index = load_indexes(video_id)
+unified_index, face_index = load_indexes(video_id)
 
 # Search functions
-def text_search(query, index, metadata, model, n_results=5):
-    query_vector = model.encode([query], convert_to_tensor=True).cpu().numpy()
+def unified_search(query, index, metadata, model, n_results=5):
+    if isinstance(query, str):
+        query_vector = model.encode([query], convert_to_tensor=True).cpu().numpy()
+    else:  # assume a PIL image; reshape because FAISS expects a 2-D array
+        query_vector = model.encode(query, convert_to_tensor=True).cpu().numpy().reshape(1, -1)
+
     D, I = index.search(query_vector, n_results)
-    results = [metadata[i] for i in I[0]]
-    return results, D[0]
-
-def image_search(image, index, metadata, model, n_results=5):
-    image_vector = model.encode(image, convert_to_tensor=True).cpu().numpy()
-    D, I = index.search(image_vector.reshape(1, -1), n_results)
-    results = [metadata[i] for i in I[0]]
-    return results, D[0]
+    results = [{'data': metadata[i], 'distance': d} for i, d in zip(I[0], D[0])]
+    return results
 
 def face_search(face_embedding, index, metadata, n_results=5):
     D, I = index.search(np.array(face_embedding).reshape(1, -1), n_results)
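The switch from separate `all-MiniLM-L6-v2`/`clip-ViT-B-32` models to a single `clip-ViT-B-32` works because CLIP embeds text and images into one shared vector space, so one FAISS index can answer both query types. The indexing side is not part of this commit; below is a minimal sketch of how a matching `*_unified_index.faiss` could be built (the segment/frame inputs and the `IndexFlatL2` choice are assumptions, not repo code):

```python
import json
import faiss
import numpy as np
from PIL import Image
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("clip-ViT-B-32")  # same model the app loads

# Hypothetical inputs: transcript segments and sampled frame paths.
segments = [{"start": 0.0, "end": 4.2, "text": "opening remarks"}]
frames = [{"start": 5.0, "end": 5.0, "frame_path": "frame_0005.jpg"}]

# Text and images land in the same 512-d CLIP space, so rows can be mixed.
text_vecs = model.encode([s["text"] for s in segments])
image_vecs = model.encode([Image.open(f["frame_path"]) for f in frames])
vectors = np.vstack([text_vecs, image_vecs]).astype("float32")

index = faiss.IndexFlatL2(vectors.shape[1])  # exact L2 search; an assumption
index.add(vectors)

# Row i of the index corresponds to metadata[i], which unified_search relies on.
metadata = segments + frames
faiss.write_index(index, "IMFUOexuEXw_unified_index.faiss")
with open("IMFUOexuEXw_unified_metadata.json", "w") as f:
    json.dump(metadata, f)
```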
@@ -76,48 +70,73 @@ def detect_and_embed_face(image, face_app):
     largest_face = max(faces, key=lambda x: (x.bbox[2] - x.bbox[0]) * (x.bbox[3] - x.bbox[1]))
     return largest_face.embedding
 
+def create_video_clip(video_path, start_time, end_time, output_path):
+    with VideoFileClip(video_path) as video:
+        new_clip = video.subclip(start_time, end_time)
+        new_clip.write_videofile(output_path, codec="libx264", audio_codec="aac")
+    return output_path
+
 # Streamlit UI
 st.title("Video Analysis Dashboard")
 
-# Display video summary
-st.header("Video Summary")
-st.subheader("Prominent Faces")
-for face in summary['prominent_faces']:
-    st.write(f"Face ID: {face['id']}, Appearances: {face['appearances']}, First Appearance: {face['first_appearance']:.2f}s")
-    if 'thumbnail' in face:
-        image = Image.open(io.BytesIO(base64.b64decode(face['thumbnail'])))
-        st.image(image, caption=f"Face ID: {face['id']}", width=100)
-
-st.subheader("Prominent Objects")
-for obj in summary['prominent_objects']:
-    st.write(f"Object ID: {obj['id']}, Appearances: {obj['appearances']}, Representative Frame: {obj['representative_frame']:.2f}s")
-
-st.subheader("Themes")
-for theme in summary['themes']:
-    st.write(f"Theme ID: {theme['id']}, Keywords: {', '.join(theme['keywords'])}")
+# Sidebar with scrollable transcript
+st.sidebar.header("Video Transcript")
+transcript_text = transcription['transcription']
+st.sidebar.text_area("Full Transcript", transcript_text, height=300)
+
+# Main content
+col1, col2 = st.columns([2, 1])
+
+with col1:
+    st.header("Video Player")
+    st.video(video_path)
+
+with col2:
+    st.header("Video Summary")
+    st.subheader("Prominent Faces")
+    for face in summary['prominent_faces']:
+        st.write(f"Face ID: {face['id']}, Appearances: {face['appearances']}")
+        if 'thumbnail' in face:
+            image = Image.open(io.BytesIO(base64.b64decode(face['thumbnail'])))
+            st.image(image, caption=f"Face ID: {face['id']}", width=100)
+
+    st.subheader("Themes")
+    for theme in summary['themes']:
+        st.write(f"Theme ID: {theme['id']}, Keywords: {', '.join(theme['keywords'])}")
 
 # Search functionality
 st.header("Search")
 
-search_type = st.selectbox("Select search type", ["Text", "Face", "Image"])
+search_type = st.selectbox("Select search type", ["Unified", "Face"])
 
-if search_type == "Text":
-    query = st.text_input("Enter your search query")
-    search_target = st.multiselect("Search in", ["Transcript", "Frames"], default=["Transcript"])
-    if st.button("Search"):
-        if "Transcript" in search_target:
-            text_results, text_distances = text_search(query, text_index, text_metadata, text_model)
-            st.subheader("Transcript Search Results")
-            for result, distance in zip(text_results, text_distances):
-                st.write(f"Time: {result['start']:.2f}s - {result['end']:.2f}s, Distance: {distance:.4f}")
-                st.write(f"Text: {result['text']}")
-                st.write("---")
-        if "Frames" in search_target:
-            frame_results, frame_distances = text_search(query, image_index, image_metadata, image_model)
-            st.subheader("Frame Search Results")
-            for result, distance in zip(frame_results, frame_distances):
-                st.write(f"Time: {result['start']:.2f}s - {result['end']:.2f}s, Distance: {distance:.4f}")
+if search_type == "Unified":
+    search_method = st.radio("Choose search method", ["Text", "Image"])
+
+    if search_method == "Text":
+        query = st.text_input("Enter your search query")
+        if st.button("Search"):
+            results = unified_search(query, unified_index, unified_metadata, unified_model)
+            st.subheader("Search Results")
+            for result in results:
+                st.write(f"Time: {result['data']['start']:.2f}s - {result['data']['end']:.2f}s, Distance: {result['distance']:.4f}")
+                if 'text' in result['data']:
+                    st.write(f"Text: {result['data']['text']}")
+                clip_path = create_video_clip(video_path, result['data']['start'], result['data']['end'], f"temp_clip_{result['data']['start']}.mp4")
+                st.video(clip_path)
                 st.write("---")
+    else:
+        uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])
+        if uploaded_file is not None:
+            image = Image.open(uploaded_file)
+            st.image(image, caption="Uploaded Image", use_column_width=True)
+            if st.button("Search"):
+                results = unified_search(image, unified_index, unified_metadata, unified_model)
+                st.subheader("Image Search Results")
+                for result in results:
+                    st.write(f"Time: {result['data']['start']:.2f}s - {result['data']['end']:.2f}s, Distance: {result['distance']:.4f}")
+                    clip_path = create_video_clip(video_path, result['data']['start'], result['data']['end'], f"temp_clip_{result['data']['start']}.mp4")
+                    st.video(clip_path)
+                    st.write("---")
 
 elif search_type == "Face":
     face_search_type = st.radio("Choose face search method", ["Select from video", "Upload image"])
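One caveat with `create_video_clip` as added above: Streamlit reruns the whole script on every widget interaction, so each search re-encodes its clips from scratch. A possible refinement, not part of this commit, is to memoize extraction on the clip boundaries (`get_clip` and its naming scheme are illustrative only):

```python
import os
import streamlit as st
from moviepy.editor import VideoFileClip

@st.cache_data
def get_clip(video_path: str, start: float, end: float) -> str:
    """Cut [start, end] from video_path once; reuse the file on later reruns."""
    output_path = f"temp_clip_{start:.2f}_{end:.2f}.mp4"
    if not os.path.exists(output_path):  # only encode on a cold cache
        with VideoFileClip(video_path) as video:
            video.subclip(start, end).write_videofile(
                output_path, codec="libx264", audio_codec="aac", logger=None
            )
    return output_path
```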
@@ -130,7 +149,8 @@ elif search_type == "Face":
130
  st.subheader("Face Search Results")
131
  for result, distance in zip(face_results, face_distances):
132
  st.write(f"Time: {result['start']:.2f}s - {result['end']:.2f}s, Distance: {distance:.4f}")
133
- st.write(f"Face ID: {result['face_id']}")
 
134
  st.write("---")
135
  else:
136
  uploaded_file = st.file_uploader("Choose a face image...", type=["jpg", "jpeg", "png"])
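For the upload branch that follows, note that InsightFace's `FaceAnalysis.get` expects an OpenCV-style BGR array while `Image.open` yields RGB. The body of `detect_and_embed_face` is only partially visible in this diff, so if it does not already convert, a shim along these lines would be needed (a sketch, not repo code):

```python
import cv2
import numpy as np
from PIL import Image

def pil_to_bgr(image: Image.Image) -> np.ndarray:
    """Convert a PIL upload to the BGR ndarray OpenCV/InsightFace expect."""
    rgb = np.array(image.convert("RGB"))         # PIL decodes to RGB
    return cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)  # swap channel order for cv2
```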
@@ -144,34 +164,8 @@ elif search_type == "Face":
144
  st.subheader("Face Search Results")
145
  for result, distance in zip(face_results, face_distances):
146
  st.write(f"Time: {result['start']:.2f}s - {result['end']:.2f}s, Distance: {distance:.4f}")
147
- st.write(f"Face ID: {result['face_id']}")
 
148
  st.write("---")
149
  else:
150
- st.error("No face detected in the uploaded image. Please try another image.")
151
-
152
- elif search_type == "Image":
153
- image_search_type = st.radio("Choose image search method", ["Upload image", "Text description"])
154
-
155
- if image_search_type == "Upload image":
156
- uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])
157
- if uploaded_file is not None:
158
- image = Image.open(uploaded_file)
159
- st.image(image, caption="Uploaded Image", use_column_width=True)
160
- if st.button("Search"):
161
- image_results, image_distances = image_search(image, image_index, image_metadata, image_model)
162
- st.subheader("Image Search Results")
163
- for result, distance in zip(image_results, image_distances):
164
- st.write(f"Time: {result['start']:.2f}s - {result['end']:.2f}s, Distance: {distance:.4f}")
165
- st.write("---")
166
- else:
167
- text_query = st.text_input("Enter a description of the image you're looking for")
168
- if st.button("Search"):
169
- image_results, image_distances = text_search(text_query, image_index, image_metadata, image_model)
170
- st.subheader("Image Search Results")
171
- for result, distance in zip(image_results, image_distances):
172
- st.write(f"Time: {result['start']:.2f}s - {result['end']:.2f}s, Distance: {distance:.4f}")
173
- st.write("---")
174
-
175
- # Display transcription
176
- st.header("Video Transcription")
177
- st.write(transcription['transcription'])
 