import gradio as gr
from PIL import Image, ImageDraw, ImageFont
from transformers import pipeline
import cv2
import numpy as np
import tempfile
import os

# Initialize the object detection pipeline.
# This runs once, at import time; the resulting module-level callable is
# shared by every call to process_video() below. The checkpoint used is
# "facebook/detr-resnet-50".
object_detector = pipeline("object-detection",
                         model="facebook/detr-resnet-50")

def draw_bounding_boxes(frame, detections):
    """
    Overlay detection boxes and "label score" captions on a video frame.

    Args:
        frame: Frame as a numpy array in BGR channel order (it is converted
            to RGB for drawing and back to BGR before returning).
        detections: Iterable of dicts, each holding a 'box' mapping with
            'xmin', 'ymin', 'xmax', 'ymax', plus 'label' and 'score'.

    Returns:
        A new BGR numpy array with the annotations drawn on it.
    """
    # PIL draws in RGB, while the incoming frame is BGR.
    canvas = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    painter = ImageDraw.Draw(canvas)
    label_font = ImageFont.load_default()

    for item in detections:
        coords = item['box']
        left, top, right, bottom = (
            int(coords[key]) for key in ('xmin', 'ymin', 'xmax', 'ymax')
        )

        # Outline of the detected object
        painter.rectangle([(left, top), (right, bottom)], outline="red", width=3)

        caption = f"{item['label']} {item['score']:.2f}"

        # Paint a filled backdrop exactly behind the caption so the white
        # text stays readable on any frame content.
        bg_x0, bg_y0, bg_x1, bg_y1 = painter.textbbox(
            (left, top), caption, font=label_font
        )
        painter.rectangle([(bg_x0, bg_y0), (bg_x1, bg_y1)], fill="red")
        painter.text((left, top), caption, fill="white", font=label_font)

    # Hand the result back in the BGR layout the caller passed in.
    return cv2.cvtColor(np.array(canvas), cv2.COLOR_RGB2BGR)

def process_video(video_path, process_every_n_frames=2):
    """
    Run object detection over a video and write an annotated copy.

    Detection is run on every ``process_every_n_frames``-th frame; the most
    recent detections are drawn on the frames in between so the boxes do not
    flicker on and off.

    Args:
        video_path: Path of the input video file.
        process_every_n_frames: Run the detector on every n-th frame
            (values below 1 are treated as 1). Larger values are faster
            but the boxes lag further behind the picture.

    Returns:
        Path to a temporary ``.mp4`` file containing the annotated video, or
        ``None`` if the input could not be opened, no frame could be read,
        the writer could not be created, or any error occurred.
    """
    cap = None
    out = None
    output_path = None
    succeeded = False

    try:
        # Open the video file
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            return None

        # Keep the frame rate as a float: truncating e.g. 29.97 to 29 makes
        # the output drift in duration. Some containers report 0/NaN, which
        # the writer cannot use, so fall back to a conventional 30 fps.
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:
            fps = 30.0
        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        # May be 0 or negative when the container does not store a count.
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # Reserve a temporary file name for the output video; the file is
        # kept (delete=False) because the caller needs the path afterwards.
        with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as temp_output:
            output_path = temp_output.name

        # Initialize video writer
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))
        if not out.isOpened():
            print("Error processing video: could not open the output video writer")
            return None

        step = max(1, int(process_every_n_frames))
        frame_count = 0
        detections = []  # latest detector output, reused on skipped frames

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            frame_count += 1

            # Only run the (expensive) detector on every nth frame
            if frame_count % step == 0:
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                # The pipeline accepts PIL images (or paths/URLs), not raw
                # numpy arrays, so wrap the frame before calling it.
                detections = object_detector(Image.fromarray(frame_rgb))

            if detections:
                frame = draw_bounding_boxes(frame, detections)

            # Write the frame
            out.write(frame)

            # Print progress; a percentage is only meaningful when the
            # container reported a usable frame count.
            if total_frames > 0:
                progress = (frame_count / total_frames) * 100
                print(f"Processing: {progress:.1f}% complete", end='\r')
            else:
                print(f"Processing: frame {frame_count}", end='\r')

        if frame_count == 0:
            print("Error processing video: no frames could be read")
            return None

        succeeded = True
        return output_path

    except Exception as e:
        print(f"Error processing video: {str(e)}")
        return None

    finally:
        # Always release the OpenCV handles, including on error paths.
        if cap is not None:
            cap.release()
        if out is not None:
            out.release()
        # Do not leave a partial/empty temp file behind on failure.
        if not succeeded and output_path is not None:
            try:
                os.remove(output_path)
            except OSError:
                pass

def detect_objects_in_video(video):
    """
    Gradio callback: annotate the uploaded video with detected objects.

    Args:
        video: The value supplied by the video input component, or None
            when nothing was provided.

    Returns:
        Whatever process_video() returns for the input (a path, or None on
        failure); None when no video was given or an exception was raised.
    """
    result = None

    if video is not None:
        try:
            # Delegate the actual work; a None result is passed through.
            result = process_video(video)
        except Exception as e:
            print(f"Error during video processing: {str(e)}")
            result = None

    return result

# Create the Gradio interface: a single video input and a single video
# output, wired to detect_objects_in_video(). That callback returns the path
# of the processed video, or None when there is no input or processing fails.
demo = gr.Interface(
    fn=detect_objects_in_video,
    inputs=[
        gr.Video(label="Upload Video")
    ],
    outputs=[
        gr.Video(label="Processed Video")
    ],
    title="Video Object Detection",
    description="""
    Upload a video to detect and track objects within it. 
    The application will process the video and draw bounding boxes around detected objects 
    with their labels and confidence scores.
    Note: Processing may take some time depending on the video length.
    """
)

# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()