import os os.system("pip install -U ultralytics open_clip_torch numpy opencv-python gradio") import spaces import gradio as gr from ultralytics import YOLO import torch import open_clip import cv2 import numpy as np from PIL import Image import gradio as gr from ultralytics import YOLO import torch import open_clip import cv2 import numpy as np from PIL import Image import os # Load YOLO model yolo_model = YOLO("yolov8s-worldv2.pt") yolo_model.set_classes(["electric-bike", "electric-bicycle", "e-bike", "bicycle"]) # Set up device and CLIP model device = "cuda" if torch.cuda.is_available() else "cpu" model, _, preprocess = open_clip.create_model_and_transforms("RN50-quickgelu", pretrained="openai", device=device) model.eval() tokenizer = open_clip.get_tokenizer("RN50-quickgelu") # Define classes and pretty classes classes = ["an e-bike", "a bicycle", "an electric-bike", "an electric-bicycle"] pretty_classes = ["e-bike", "bicycle", "e-bike", "e-bike"] # Function to calculate IoU def calculate_iou(box1, box2): x1_1, y1_1, x2_1, y2_1 = box1 x1_2, y1_2, x2_2, y2_2 = box2 xi1 = max(x1_1, x1_2) yi1 = max(y1_1, y1_2) xi2 = min(x2_1, x2_2) yi2 = min(y2_1, y2_2) inter_area = max(xi2 - xi1, 0) * max(yi2 - yi1, 0) box1_area = (x2_1 - x1_1) * (y2_1 - y1_1) box2_area = (x2_2 - x1_2) * (y2_2 - y1_2) union_area = box1_area + box2_area - inter_area return inter_area / union_area if union_area != 0 else 0 # Function to process a frame def process_frame(frame): img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) height, width, _ = frame.shape font_scale = max(width, height) / 1000 thickness = int(max(width, height) / 400) overlay = frame.copy() results = yolo_model(img_rgb, iou=0.5, conf=0.25) boxes, confidences, class_ids = [], [], [] for result in results: for box in result.boxes: x1, y1, x2, y2 = map(int, box.xyxy[0]) confidence = box.conf.item() class_id = int(box.cls.item()) boxes.append((x1, y1, x2, y2)) confidences.append(confidence) class_ids.append(class_id) to_remove = set() for i in range(len(boxes)): for j in range(i + 1, len(boxes)): iou = calculate_iou(boxes[i], boxes[j]) if iou > 0.5: if confidences[i] > confidences[j]: to_remove.add(j) else: to_remove.add(i) boxes = [box for idx, box in enumerate(boxes) if idx not in to_remove] confidences = [conf for idx, conf in enumerate(confidences) if idx not in to_remove] class_ids = [cls for idx, cls in enumerate(class_ids) if idx not in to_remove] for box, confidence, class_id in zip(boxes, confidences, class_ids): x1, y1, x2, y2 = box cropped_img = img_rgb[y1:y2, x1:x2] image_input = preprocess(Image.fromarray(cropped_img)).unsqueeze(0).to(device) text_inputs = tokenizer(classes).to(device) with torch.no_grad(): image_features = model.encode_image(image_input) text_features = model.encode_text(text_inputs) logits_per_image = image_features @ text_features.T probs = logits_per_image.softmax(dim=-1).cpu().numpy() predicted_class = pretty_classes[probs.argmax()] label = f"{predicted_class} {probs.max():.2f}" # Define new colors box_color = (0, 255, 255) # Cyan for the box border_color = (0, 0, 0) # Black for the border label_bg_color = (0, 160, 160) # Dark blue for the label background label_text_color = (255, 255, 255) # White for the text # Draw a thicker black rectangle as the border border_thickness = thickness + 1 cv2.rectangle(frame, (x1, y1), (x2, y2), border_color, border_thickness) # Draw the inner cyan rectangle cv2.rectangle(frame, (x1, y1), (x2, y2), box_color, thickness) # Label background with border text_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, font_scale, thickness)[0] label_width, label_height = text_size label_x, label_y = x1, y1 - 5 label_bg_x1, label_bg_y1 = label_x - 3, label_y - label_height - 3 label_bg_x2, label_bg_y2 = label_x + label_width + 3, label_y + 3 # Draw dark blue label background cv2.rectangle(frame, (label_bg_x1, label_bg_y1), (label_bg_x2, label_bg_y2), label_bg_color, -1) # Draw label text cv2.putText( frame, label, (label_x, label_y), cv2.FONT_HERSHEY_SIMPLEX, font_scale, label_text_color, thickness ) return frame # Function to detect objects in an image @spaces.GPU def detect_objects_image(image): image_np = np.array(image) processed_image = process_frame(image_np) return Image.fromarray(processed_image) # Function to detect objects in a video @spaces.GPU def detect_objects_video(video_file): cap = cv2.VideoCapture(video_file) frames = [] while cap.isOpened(): ret, frame = cap.read() if not ret: break processed_frame = process_frame(frame) frames.append(processed_frame) cap.release() height, width, _ = frames[0].shape out = cv2.VideoWriter("/tmp/annotated_video.mp4", cv2.VideoWriter_fourcc(*"mp4v"), 30, (width, height)) for frame in frames: out.write(frame) out.release() return "/tmp/annotated_video.mp4" # Global title global_title = "DeepSense.ai" # Dynamically load example files from the examples directory example_dir = "examples" example_images = [] example_videos = [] if os.path.exists(example_dir): for file in os.listdir(example_dir): file_path = os.path.join(example_dir, file) if file.lower().endswith((".jpg", ".jpeg", ".png")): example_images.append([file_path]) elif file.lower().endswith((".mp4", ".avi", ".mov")): example_videos.append([file_path]) # Image upload interface file_upload_demo = gr.Interface( fn=detect_objects_image, inputs=gr.Image(type="pil", label="Input"), outputs=gr.Image(type="pil", label="Output"), title="Image Upload - Bicycle and E-Bike Detection Model", description=( "" "- Step 1: Upload an image or select an example.
" "- Step 2: Press Submit and the detection model will label the image." "
" ), examples=example_images, allow_flagging="never" ) # Video upload interface video_file_demo = gr.Interface( fn=detect_objects_video, inputs=gr.Video(label="Input"), outputs=gr.Video(label="Output"), title="Video Upload - Bicycle and E-Bike Detection Model", description=( "" "- Step 1: Upload a video or select an example.
" "- Step 2: Press Submit and the detection model will label the video." "
" ), examples=example_videos, allow_flagging="never" ) # Live video interface live_video_demo = gr.Interface( fn=detect_objects_image, inputs=gr.Image(sources=["webcam"], streaming=True, label="Input"), outputs=gr.Image(type="pil", label="Output"), live=True, title="Live Video - Bicycle and E-Bike Detection Model", description=( "" "- Step 1: Use your webcam to capture live video.
" "- Step 2: The detection model will label the video in real-time." "
" ), allow_flagging="never" ) # Main Gradio app with gr.Blocks(title=global_title) as demo: gr.Markdown("

DeepSense.ai

") gr.TabbedInterface( [file_upload_demo, video_file_demo, live_video_demo], ["Image Upload", "Video Upload", "Live Video"], ) gr.Markdown( "
" "Click any example image above to upload it to the e-bike detection model." "
" ) # Launch the app if __name__ == "__main__": demo.launch(share=True)