"""Gradio demo that detects bicycles and e-bikes: YOLO-World proposes boxes and
OpenCLIP re-classifies each detected crop as an e-bike or a bicycle."""
import os

# Install runtime dependencies at startup (e.g. on Hugging Face Spaces without a requirements.txt)
os.system("pip install -U ultralytics open_clip_torch numpy opencv-python gradio")
import spaces
import gradio as gr
from ultralytics import YOLO
import torch
import open_clip
import cv2
import numpy as np
from PIL import Image
# Load YOLO model
yolo_model = YOLO("yolov8s-worldv2.pt")
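# set_classes restricts YOLO-World's open-vocabulary detector to these text prompts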
yolo_model.set_classes(["electric-bike", "electric-bicycle", "e-bike", "bicycle"])
# Set up device and CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, _, preprocess = open_clip.create_model_and_transforms("RN50-quickgelu", pretrained="openai", device=device)
model.eval()
tokenizer = open_clip.get_tokenizer("RN50-quickgelu")
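# CLIP is used below to re-classify each YOLO detection crop before drawing its label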
# CLIP text prompts and the display label shown for each prompt
classes = ["an e-bike", "a bicycle", "an electric-bike", "an electric-bicycle"]
pretty_classes = ["e-bike", "bicycle", "e-bike", "e-bike"]
# Function to calculate IoU (intersection over union) between two boxes
def calculate_iou(box1, box2):
    x1_1, y1_1, x2_1, y2_1 = box1
    x1_2, y1_2, x2_2, y2_2 = box2
    # Coordinates of the intersection rectangle
    xi1 = max(x1_1, x1_2)
    yi1 = max(y1_1, y1_2)
    xi2 = min(x2_1, x2_2)
    yi2 = min(y2_1, y2_2)
    inter_area = max(xi2 - xi1, 0) * max(yi2 - yi1, 0)
    box1_area = (x2_1 - x1_1) * (y2_1 - y1_1)
    box2_area = (x2_2 - x1_2) * (y2_2 - y1_2)
    union_area = box1_area + box2_area - inter_area
    return inter_area / union_area if union_area != 0 else 0
# Function to process a single BGR frame: detect with YOLO, re-label each crop with CLIP
def process_frame(frame):
    img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    height, width, _ = frame.shape
    # Scale font size and line thickness with the frame resolution
    font_scale = max(width, height) / 1000
    thickness = int(max(width, height) / 400)
    results = yolo_model(img_rgb, iou=0.5, conf=0.25)
    boxes, confidences, class_ids = [], [], []
    for result in results:
        for box in result.boxes:
            x1, y1, x2, y2 = map(int, box.xyxy[0])
            confidence = box.conf.item()
            class_id = int(box.cls.item())
            boxes.append((x1, y1, x2, y2))
            confidences.append(confidence)
            class_ids.append(class_id)
    # Cross-class suppression: drop the lower-confidence box of any pair overlapping by more than 0.5 IoU
    to_remove = set()
    for i in range(len(boxes)):
        for j in range(i + 1, len(boxes)):
            iou = calculate_iou(boxes[i], boxes[j])
            if iou > 0.5:
                if confidences[i] > confidences[j]:
                    to_remove.add(j)
                else:
                    to_remove.add(i)
    boxes = [box for idx, box in enumerate(boxes) if idx not in to_remove]
    confidences = [conf for idx, conf in enumerate(confidences) if idx not in to_remove]
    class_ids = [cls for idx, cls in enumerate(class_ids) if idx not in to_remove]
    for box, confidence, class_id in zip(boxes, confidences, class_ids):
        x1, y1, x2, y2 = box
        cropped_img = img_rgb[y1:y2, x1:x2]
        if cropped_img.size == 0:
            continue  # skip degenerate boxes
        image_input = preprocess(Image.fromarray(cropped_img)).unsqueeze(0).to(device)
        text_inputs = tokenizer(classes).to(device)
        with torch.no_grad():
            image_features = model.encode_image(image_input)
            text_features = model.encode_text(text_inputs)
            # Normalize features and scale logits before softmax (standard CLIP zero-shot scoring)
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
            text_features = text_features / text_features.norm(dim=-1, keepdim=True)
            logits_per_image = 100.0 * image_features @ text_features.T
            probs = logits_per_image.softmax(dim=-1).cpu().numpy()
        predicted_class = pretty_classes[probs.argmax()]
        label = f"{predicted_class} {probs.max():.2f}"
        # Colors (BGR)
        box_color = (0, 255, 255)           # Yellow for the box
        border_color = (0, 0, 0)            # Black for the border
        label_bg_color = (0, 160, 160)      # Darker yellow for the label background
        label_text_color = (255, 255, 255)  # White for the text
        # Draw a thicker black rectangle as the border
        border_thickness = thickness + 1
        cv2.rectangle(frame, (x1, y1), (x2, y2), border_color, border_thickness)
        # Draw the inner colored rectangle
        cv2.rectangle(frame, (x1, y1), (x2, y2), box_color, thickness)
        # Label background sized to the rendered text
        text_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, font_scale, thickness)[0]
        label_width, label_height = text_size
        label_x, label_y = x1, y1 - 5
        label_bg_x1, label_bg_y1 = label_x - 3, label_y - label_height - 3
        label_bg_x2, label_bg_y2 = label_x + label_width + 3, label_y + 3
        # Draw filled label background
        cv2.rectangle(frame, (label_bg_x1, label_bg_y1), (label_bg_x2, label_bg_y2), label_bg_color, -1)
        # Draw label text
        cv2.putText(
            frame, label, (label_x, label_y),
            cv2.FONT_HERSHEY_SIMPLEX, font_scale, label_text_color, thickness
        )
    return frame
# Function to detect objects in an uploaded image
@spaces.GPU
def detect_objects_image(image):
    # PIL gives RGB; process_frame expects an OpenCV-style BGR frame
    image_np = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    processed_image = process_frame(image_np)
    return Image.fromarray(cv2.cvtColor(processed_image, cv2.COLOR_BGR2RGB))
# Function to detect objects in an uploaded video and write an annotated copy
@spaces.GPU
def detect_objects_video(video_file):
    cap = cv2.VideoCapture(video_file)
    fps = cap.get(cv2.CAP_PROP_FPS) or 30  # fall back to 30 fps if the input rate is unknown
    frames = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        processed_frame = process_frame(frame)
        frames.append(processed_frame)
    cap.release()
    if not frames:
        return None  # nothing could be decoded from the input video
    height, width, _ = frames[0].shape
    out = cv2.VideoWriter("/tmp/annotated_video.mp4", cv2.VideoWriter_fourcc(*"mp4v"), fps, (width, height))
    for frame in frames:
        out.write(frame)
    out.release()
    return "/tmp/annotated_video.mp4"
# Global title
global_title = "DeepSense.ai"
# Dynamically load example files from the examples directory
example_dir = "examples"
example_images = []
example_videos = []
if os.path.exists(example_dir):
    for file in os.listdir(example_dir):
        file_path = os.path.join(example_dir, file)
        if file.lower().endswith((".jpg", ".jpeg", ".png")):
            example_images.append([file_path])
        elif file.lower().endswith((".mp4", ".avi", ".mov")):
            example_videos.append([file_path])
# Image upload interface
file_upload_demo = gr.Interface(
    fn=detect_objects_image,
    inputs=gr.Image(type="pil", label="Input"),
    outputs=gr.Image(type="pil", label="Output"),
    title="Image Upload - Bicycle and E-Bike Detection Model",
    description=(
        "- Step 1: Upload an image or select an example.\n"
        "- Step 2: Press Submit and the detection model will label the image."
    ),
    examples=example_images,
    allow_flagging="never"
)
# Video upload interface
video_file_demo = gr.Interface(
    fn=detect_objects_video,
    inputs=gr.Video(label="Input"),
    outputs=gr.Video(label="Output"),
    title="Video Upload - Bicycle and E-Bike Detection Model",
    description=(
        "- Step 1: Upload a video or select an example.\n"
        "- Step 2: Press Submit and the detection model will label the video."
    ),
    examples=example_videos,
    allow_flagging="never"
)
# Live video interface
live_video_demo = gr.Interface(
    fn=detect_objects_image,
    inputs=gr.Image(sources=["webcam"], streaming=True, label="Input"),
    outputs=gr.Image(type="pil", label="Output"),
    live=True,
    title="Live Video - Bicycle and E-Bike Detection Model",
    description=(
        "- Step 1: Use your webcam to capture live video.\n"
        "- Step 2: The detection model will label the video in real time."
    ),
    allow_flagging="never"
)
# Main Gradio app
with gr.Blocks(title=global_title) as demo:
    gr.Markdown("