import os
os.system("pip install -U ultralytics open_clip_torch numpy opencv-python gradio")

import spaces
import gradio as gr
from ultralytics import YOLO
import torch
import open_clip
import cv2
import numpy as np
from PIL import Image
import gradio as gr
from ultralytics import YOLO
import torch
import open_clip
import cv2
import numpy as np
from PIL import Image
import os

# Load YOLO model
yolo_model = YOLO("yolov8s-worldv2.pt")
yolo_model.set_classes(["electric-bike", "electric-bicycle", "e-bike", "bicycle"])

# Set up device and CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, _, preprocess = open_clip.create_model_and_transforms("RN50-quickgelu", pretrained="openai", device=device)
model.eval()
tokenizer = open_clip.get_tokenizer("RN50-quickgelu")

# Define classes and pretty classes
classes = ["an e-bike", "a bicycle", "an electric-bike", "an electric-bicycle"]
pretty_classes = ["e-bike", "bicycle", "e-bike", "e-bike"]

# Function to calculate IoU
def calculate_iou(box1, box2):
    x1_1, y1_1, x2_1, y2_1 = box1
    x1_2, y1_2, x2_2, y2_2 = box2
    xi1 = max(x1_1, x1_2)
    yi1 = max(y1_1, y1_2)
    xi2 = min(x2_1, x2_2)
    yi2 = min(y2_1, y2_2)
    inter_area = max(xi2 - xi1, 0) * max(yi2 - yi1, 0)
    box1_area = (x2_1 - x1_1) * (y2_1 - y1_1)
    box2_area = (x2_2 - x1_2) * (y2_2 - y1_2)
    union_area = box1_area + box2_area - inter_area
    return inter_area / union_area if union_area != 0 else 0

# Function to process a frame
def process_frame(frame):
    img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    height, width, _ = frame.shape
    font_scale = max(width, height) / 1000
    thickness = int(max(width, height) / 400)
    overlay = frame.copy()

    results = yolo_model(img_rgb, iou=0.5, conf=0.25)
    boxes, confidences, class_ids = [], [], []

    for result in results:
        for box in result.boxes:
            x1, y1, x2, y2 = map(int, box.xyxy[0])
            confidence = box.conf.item()
            class_id = int(box.cls.item())
            boxes.append((x1, y1, x2, y2))
            confidences.append(confidence)
            class_ids.append(class_id)

    to_remove = set()
    for i in range(len(boxes)):
        for j in range(i + 1, len(boxes)):
            iou = calculate_iou(boxes[i], boxes[j])
            if iou > 0.5:
                if confidences[i] > confidences[j]:
                    to_remove.add(j)
                else:
                    to_remove.add(i)

    boxes = [box for idx, box in enumerate(boxes) if idx not in to_remove]
    confidences = [conf for idx, conf in enumerate(confidences) if idx not in to_remove]
    class_ids = [cls for idx, cls in enumerate(class_ids) if idx not in to_remove]

    for box, confidence, class_id in zip(boxes, confidences, class_ids):
        x1, y1, x2, y2 = box
        cropped_img = img_rgb[y1:y2, x1:x2]
        image_input = preprocess(Image.fromarray(cropped_img)).unsqueeze(0).to(device)
        text_inputs = tokenizer(classes).to(device)

        with torch.no_grad():
            image_features = model.encode_image(image_input)
            text_features = model.encode_text(text_inputs)
            logits_per_image = image_features @ text_features.T
            probs = logits_per_image.softmax(dim=-1).cpu().numpy()

        predicted_class = pretty_classes[probs.argmax()]
        label = f"{predicted_class} {probs.max():.2f}"

        # Define new colors
        box_color = (0, 255, 255)  # Cyan for the box
        border_color = (0, 0, 0)  # Black for the border
        label_bg_color = (0, 160, 160)  # Dark blue for the label background
        label_text_color = (255, 255, 255)  # White for the text

        # Draw a thicker black rectangle as the border
        border_thickness = thickness + 1
        cv2.rectangle(frame, (x1, y1), (x2, y2), border_color, border_thickness)

        # Draw the inner cyan rectangle
        cv2.rectangle(frame, (x1, y1), (x2, y2), box_color, thickness)

        # Label background with border
        text_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, font_scale, thickness)[0]
        label_width, label_height = text_size
        label_x, label_y = x1, y1 - 5
        label_bg_x1, label_bg_y1 = label_x - 3, label_y - label_height - 3
        label_bg_x2, label_bg_y2 = label_x + label_width + 3, label_y + 3

        # Draw dark blue label background
        cv2.rectangle(frame, (label_bg_x1, label_bg_y1), (label_bg_x2, label_bg_y2), label_bg_color, -1)

        # Draw label text
        cv2.putText(
            frame, label, (label_x, label_y),
            cv2.FONT_HERSHEY_SIMPLEX, font_scale, label_text_color, thickness
        )

    return frame

# Function to detect objects in an image
@spaces.GPU
def detect_objects_image(image):
    image_np = np.array(image)
    processed_image = process_frame(image_np)
    return Image.fromarray(processed_image)

# Function to detect objects in a video
@spaces.GPU
def detect_objects_video(video_file):
    cap = cv2.VideoCapture(video_file)
    frames = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        processed_frame = process_frame(frame)
        frames.append(processed_frame)
    cap.release()
    height, width, _ = frames[0].shape
    out = cv2.VideoWriter("/tmp/annotated_video.mp4", cv2.VideoWriter_fourcc(*"mp4v"), 30, (width, height))
    for frame in frames:
        out.write(frame)
    out.release()
    return "/tmp/annotated_video.mp4"

# Global title
global_title = "DeepSense.ai"

# Dynamically load example files from the examples directory
example_dir = "examples"
example_images = []
example_videos = []

if os.path.exists(example_dir):
    for file in os.listdir(example_dir):
        file_path = os.path.join(example_dir, file)
        if file.lower().endswith((".jpg", ".jpeg", ".png")):
            example_images.append([file_path])
        elif file.lower().endswith((".mp4", ".avi", ".mov")):
            example_videos.append([file_path])

# Image upload interface
file_upload_demo = gr.Interface(
    fn=detect_objects_image,
    inputs=gr.Image(type="pil", label="Input"),
    outputs=gr.Image(type="pil", label="Output"),
    title="Image Upload - Bicycle and E-Bike Detection Model",
    description=(
        "<span style='font-size: 17px;'>"
        "- <b>Step 1</b>: Upload an image or select an example.<br>"
        "- <b>Step 2</b>: Press <code>Submit</code> and the detection model will label the image."
        "</span>"
    ),
    examples=example_images,
    allow_flagging="never"
)

# Video upload interface
video_file_demo = gr.Interface(
    fn=detect_objects_video,
    inputs=gr.Video(label="Input"),
    outputs=gr.Video(label="Output"),
    title="Video Upload - Bicycle and E-Bike Detection Model",
    description=(
        "<span style='font-size: 17px;'>"
        "- <b>Step 1</b>: Upload a video or select an example.<br>"
        "- <b>Step 2</b>: Press <code>Submit</code> and the detection model will label the video."
        "</span>"
    ),
    examples=example_videos,
    allow_flagging="never"
)

# Live video interface
live_video_demo = gr.Interface(
    fn=detect_objects_image,
    inputs=gr.Image(sources=["webcam"], streaming=True, label="Input"),
    outputs=gr.Image(type="pil", label="Output"),
    live=True,
    title="Live Video - Bicycle and E-Bike Detection Model",
    description=(
        "<span style='font-size: 17px;'>"
        "- <b>Step 1</b>: Use your webcam to capture live video.<br>"
        "- <b>Step 2</b>: The detection model will label the video in real-time."
        "</span>"
    ),
    allow_flagging="never"
)

# Main Gradio app
with gr.Blocks(title=global_title) as demo:
    gr.Markdown("<h1 style='text-align: center;'>DeepSense.ai</h1>")
    gr.TabbedInterface(
        [file_upload_demo, video_file_demo, live_video_demo],
        ["Image Upload", "Video Upload", "Live Video"],
    )
    gr.Markdown(
        "<div style='text-align: center; font-size: 15px; margin-top: 20px;'>"
        "Click any example image above to upload it to the e-bike detection model."
        "</div>"
    )

# Launch the app
if __name__ == "__main__":
    demo.launch(share=True)