Spaces:
Running
Running
File size: 6,078 Bytes
06499c0 3f1f1a2 06499c0 3f1f1a2 06499c0 3f1f1a2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 |
import gradio as gr
import torch
import numpy as np
from transformers import OwlViTProcessor, OwlViTForObjectDetection, ResNetModel
from torchvision import transforms
from PIL import Image
import cv2
import torch.nn.functional as F
import tempfile
import os
# Load models
# Choose the compute device first; both models below are moved onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ResNet-50 backbone, used by extract_embedding() to embed images.
resnet = ResNetModel.from_pretrained("microsoft/resnet-50").to(device)
resnet.eval()

# OWL-ViT detector and its matching processor, used by detect_and_crop().
processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
mixin = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")
model = mixin.to(device)
# Preprocess the image
def preprocess_image(image):
    """Convert an image into a normalized tensor with a leading batch axis.

    The image is resized to 224x224, converted to a tensor, normalized
    per channel with the mean/std constants below (these look like the
    usual ImageNet statistics -- confirm if it matters), and finally given
    a batch dimension of size 1 at position 0.
    """
    steps = [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
    pipeline = transforms.Compose(steps)
    tensor = pipeline(image)
    return tensor.unsqueeze(0)
def extract_embedding(image):
    """Return the ResNet ``pooler_output`` for a single image.

    The image goes through ``preprocess_image``, is moved to the module's
    ``device``, and is fed to the module-level ``resnet`` with gradient
    tracking disabled.
    """
    batch = preprocess_image(image).to(device)
    with torch.no_grad():
        return resnet(batch).pooler_output
def cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity of two tensors along dimension 1."""
    # dim=1 is the library default; it is spelled out here for readability.
    similarity = F.cosine_similarity(embedding1, embedding2, dim=1)
    return similarity
def l2_distance(embedding1, embedding2):
    """Return the 2-norm of ``embedding1 - embedding2`` as a scalar tensor."""
    difference = embedding1 - embedding2
    return difference.norm(p=2)
def save_array_to_temp_image(arr):
    """Write an image array to a new temporary PNG and return its path.

    The array's channel order is swapped with ``cv2.COLOR_BGR2RGB`` before
    saving. The temporary file is created with ``delete=False``, so it stays
    on disk after this function returns; removing it is up to the caller.
    """
    picture = Image.fromarray(cv2.cvtColor(arr, cv2.COLOR_BGR2RGB))
    # Only the unique file name is needed; the handle is closed on leaving
    # the ``with`` block so the image can then be written to that path.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as handle:
        path = handle.name
    picture.save(path)
    return path
def detect_and_crop(target_image, query_image, threshold=0.6, nms_threshold=0.3):
    """Detect regions of ``target_image`` matching ``query_image`` and crop them.

    Runs the module-level OWL-ViT ``model`` in image-guided mode, post-processes
    the predictions with ``processor``, and cuts each resulting box out of the
    target image.

    Args:
        target_image: PIL image to search in (its ``size`` attribute is read).
        query_image: Image of the object to look for; passed to the processor
            as ``query_images``.
        threshold: Score threshold forwarded to the post-processing step.
        nms_threshold: NMS threshold forwarded to the post-processing step.

    Returns:
        A list of numpy arrays, one per non-empty crop. The crops are taken
        from a channel-swapped copy of ``target_image`` (BGR if the target is
        RGB). The list is empty when nothing is detected.
    """
    width, height = target_image.size
    # Post-processing expects (height, width) per image.
    target_sizes = torch.Tensor([[height, width]])

    inputs = processor(images=target_image, query_images=query_image, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.image_guided_detection(**inputs)

    # target_sizes lives on the CPU, so bring the predictions there as well.
    outputs.logits = outputs.logits.cpu()
    outputs.target_pred_boxes = outputs.target_pred_boxes.cpu()
    results = processor.post_process_image_guided_detection(
        outputs=outputs,
        threshold=threshold,
        nms_threshold=nms_threshold,
        target_sizes=target_sizes,
    )
    boxes = results[0]["boxes"]

    # np.array() of an RGB PIL image is RGB; this swap therefore yields the
    # BGR layout that the rest of the file treats crops as.
    img = cv2.cvtColor(np.array(target_image), cv2.COLOR_BGR2RGB)
    img_height, img_width = img.shape[:2]

    def clamp(value, upper):
        # Keep a coordinate inside [0, upper].
        return min(max(value, 0), upper)

    crops = []
    for box in boxes:
        x1, y1, x2, y2 = (int(v) for v in box.tolist())
        # Predicted boxes may extend past the image edge. A negative index
        # would be interpreted by numpy as "count from the end" and yield an
        # empty or wrong crop, so clamp to the image bounds first.
        x1, x2 = clamp(x1, img_width), clamp(x2, img_width)
        y1, y2 = clamp(y1, img_height), clamp(y2, img_height)
        crop = img[y1:y2, x1:x2]
        if crop.size != 0:
            crops.append(crop)
    return crops
def process_video(video_path, query_image, skipframes=0):
    """Compare every detected match in a video against the query image.

    Frames are read one by one; every ``skipframes + 1``-th frame is passed
    to ``detect_and_crop``. Each returned crop is embedded with
    ``extract_embedding`` and compared with the query image's embedding.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        query_image: Image of the object to look for.
        skipframes: Number of frames to skip between processed frames.
            Negative values are treated as 0.

    Returns:
        A list of ``{'l2_dist': float, 'cos': float}`` dicts, one per crop,
        or ``None`` if the video could not be opened.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            return None

        # A stride of 0 would make the modulo below raise ZeroDivisionError.
        stride = max(skipframes, 0) + 1
        # The query embedding does not change between crops; compute it once,
        # and only if at least one crop is actually found.
        query_embedding = None
        all_results = []
        frame_count = 0

        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count % stride == 0:
                # OpenCV frames are BGR; convert in memory instead of going
                # through a temporary PNG that would never be deleted.
                frame_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                for crop in detect_and_crop(frame_image, query_image):
                    if query_embedding is None:
                        query_embedding = extract_embedding(query_image)
                    # Crops come back channel-swapped (see detect_and_crop),
                    # so swap again before embedding.
                    crop_image = Image.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
                    crop_embedding = extract_embedding(crop_image)
                    all_results.append({
                        'l2_dist': l2_distance(query_embedding, crop_embedding).item(),
                        'cos': cosine_similarity(query_embedding, crop_embedding).item(),
                    })
            frame_count += 1
        return all_results
    finally:
        # Release the capture even if detection or embedding raised.
        cap.release()
def process_videos_and_compare(image, video, skipframes=5, threshold=0.47):
    """Summarize how closely a video's detections match a query image.

    Calls ``process_video`` and aggregates the per-crop L2 distances and
    cosine similarities into averages and medians. ``is_present`` is true
    when the average cosine similarity is at least ``threshold``.

    When ``process_video`` yields nothing (empty list or ``None``), a fixed
    "not present" summary is returned instead.
    """
    def middle_value(ordered):
        # Median of an already sorted, non-empty sequence.
        half, is_odd = divmod(len(ordered), 2)
        if is_odd:
            return ordered[half]
        return (ordered[half - 1] + ordered[half]) / 2

    matches = process_video(video, image, skipframes)
    if not matches:
        return {
            "avg_l2_dist": float('inf'),
            "avg_cos": 0,
            "median_l2_dist": float('inf'),
            "median_cos": 0,
            "avg_cos_dist": float('inf'),
            "median_cos_dist": float('inf'),
            "is_present": False
        }

    l2_values = [match['l2_dist'] for match in matches]
    cos_values = [match['cos'] for match in matches]

    # Averages are taken over the values in their original order.
    mean_l2 = sum(l2_values) / len(l2_values)
    mean_cos = sum(cos_values) / len(cos_values)
    mid_l2 = middle_value(sorted(l2_values))
    mid_cos = middle_value(sorted(cos_values))

    return {
        "avg_l2_dist": mean_l2,
        "avg_cos": mean_cos,
        "median_l2_dist": mid_l2,
        "median_cos": mid_cos,
        "avg_cos_dist": 1 - mean_cos,
        "median_cos_dist": 1 - mid_cos,
        "is_present": mean_cos >= threshold
    }
def interface(video, image, skipframes, threshold):
    """Gradio callback: compare the uploaded video against the query image.

    Args:
        video: Value of the video input (``None`` if nothing was uploaded).
        image: Value of the query image input (``None`` if nothing was
            uploaded).
        skipframes: Value of the "Skip Frames" slider.
        threshold: Value of the "Threshold" slider.

    Returns:
        The summary dict produced by ``process_videos_and_compare``.

    Raises:
        gr.Error: If the video or the query image is missing.
    """
    # Without this check a missing upload only fails deep inside OpenCV or
    # the OWL-ViT processor with a message that means nothing to the user.
    if video is None or image is None:
        raise gr.Error("Please upload both a video and a query image.")
    return process_videos_and_compare(image, video, skipframes, threshold)
# Gradio UI definition. The input components are listed in the same order as
# the parameters of interface(): video, image, skipframes, threshold.
iface = gr.Interface(
    fn=interface,
    inputs=[
        gr.Video(label="Upload a Video"),
        # type="pil" asks Gradio to hand the callback a PIL image.
        gr.Image(type="pil", label="Upload a Query Image"),
        gr.Slider(minimum=0, maximum=10, step=1, value=5, label="Skip Frames"),
        gr.Slider(minimum=0.0, maximum=1.0, step=0.01, value=0.47, label="Threshold")
    ],
    outputs=[
        # The callback returns a dict, shown as JSON.
        gr.JSON(label="Result")
    ],
    title="Object Detection in Video",
    # The description text is kept flush-left on purpose: it is part of the
    # string that is displayed to the user.
    description="""
**Instructions:**
1. **Upload a Video**: Select a video file to upload.
2. **Upload a Query Image**: Select an image file that contains the object you want to detect in the video.
3. **Set Skip Frames**: Adjust the slider to set the number of frames to skip between each processing.
4. **Set Threshold**: Adjust the slider to set the threshold for cosine similarity to determine if the object is present in the video.
5. **View Results**: The result will show the average and median distances and similarities, and whether the object is present in the video based on the threshold.
"""
)
if __name__ == "__main__":
    # Start the Gradio app only when this file is run as a script.
    iface.launch()