Spaces:

yasserrmd
/

DailySnap

Running on Zero

File size: 4,858 Bytes

import gradio as gr
from ultralytics import YOLOv10
import cv2
import torch
import os
import spaces


# Pick the GPU when torch reports one is available, otherwise fall back to CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Load the pretrained YOLOv10 checkpoint, then move it onto the chosen device
model = YOLOv10.from_pretrained('jameslahm/yolov10x')
model = model.to(device)

# Object labels whose presence in a frame signals each activity category
activity_categories = {
    "Working": ["laptop", "computer", "keyboard", "office chair"],
    "Meal Time": ["fork", "spoon", "plate", "food"],
    "Exercise": ["dumbbell", "bicycle", "yoga mat", "treadmill"],
    "Outdoors": ["car", "tree", "bicycle", "road"],
    # Add more categories and objects as needed
}


# Map a collection of detected object labels onto the activities they trigger
def categorize_activity(detected_objects):
    """Return the activities suggested by ``detected_objects``.

    An activity is selected when at least one of its trigger labels in
    ``activity_categories`` is contained in ``detected_objects``.

    Args:
        detected_objects: Container of detected object labels; only
            membership tests (``in``) are performed on it.

    Returns:
        A dict keyed by activity name, in ``activity_categories`` order.
        Each value is a one-element list holding ``detected_objects``
        itself (the complete detection list, not only the labels that
        matched that activity).
    """
    return {
        label: [detected_objects]
        for label, triggers in activity_categories.items()
        if any(trigger in detected_objects for trigger in triggers)
    }


# Helper computing the mean structural similarity (SSIM) of two grayscale images
def _mean_ssim(gray_a, gray_b, win_size=7, data_range=255.0):
    """Return the mean SSIM index of two equally sized grayscale images.

    Local statistics are taken over a ``win_size`` x ``win_size`` box window
    (``cv2.blur``) with the conventional stabilising constants
    ``K1 = 0.01`` and ``K2 = 0.03``.

    Args:
        gray_a: First single-channel image (ndarray).
        gray_b: Second single-channel image with the same shape as ``gray_a``.
        win_size: Side length of the square averaging window.
        data_range: Difference between the largest and smallest possible
            pixel value (255 for 8-bit images).

    Returns:
        The mean SSIM as a float; 1.0 means the images are identical.
    """
    a = gray_a.astype("float64")
    b = gray_b.astype("float64")
    window = (win_size, win_size)

    # Local means and local second moments
    mu_a = cv2.blur(a, window)
    mu_b = cv2.blur(b, window)
    mu_aa = cv2.blur(a * a, window)
    mu_bb = cv2.blur(b * b, window)
    mu_ab = cv2.blur(a * b, window)

    # Unbiased (sample) variance / covariance inside each window
    n_pixels = win_size * win_size
    cov_norm = n_pixels / (n_pixels - 1)
    var_a = cov_norm * (mu_aa - mu_a * mu_a)
    var_b = cov_norm * (mu_bb - mu_b * mu_b)
    cov_ab = cov_norm * (mu_ab - mu_a * mu_b)

    c1 = (0.01 * data_range) ** 2
    c2 = (0.03 * data_range) ** 2
    numerator = (2 * mu_a * mu_b + c1) * (2 * cov_ab + c2)
    denominator = (mu_a * mu_a + mu_b * mu_b + c1) * (var_a + var_b + c2)
    ssim_map = numerator / denominator

    # Ignore the border, where the window reaches outside the image
    pad = (win_size - 1) // 2
    interior = ssim_map[pad:-pad, pad:-pad] if pad > 0 else ssim_map
    if interior.size == 0:
        # Image smaller than the window: fall back to the whole map
        interior = ssim_map
    return float(interior.mean())


# Function to compare frames using SSIM to avoid repeated frames
def is_frame_different(frame1, frame2, threshold=0.9):
    """Tell whether two BGR frames differ enough to be processed separately.

    Both frames are converted to grayscale and compared with SSIM. The SSIM
    is computed locally with ``cv2`` because the file never imports an
    ``ssim`` function.

    Args:
        frame1: First frame as a BGR image (as returned by ``cv2``).
        frame2: Second frame as a BGR image with the same size as ``frame1``.
        threshold: SSIM score below which the frames count as different.

    Returns:
        ``True`` when the SSIM score is strictly below ``threshold``.
    """
    gray_frame1 = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY)
    gray_frame2 = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY)
    score = _mean_ssim(gray_frame1, gray_frame2)
    return score < threshold


# Function to process the video, detect objects, and generate a categorized journal with images
@spaces.GPU
def generate_journal_with_images(video_path, frame_interval=30):
    """Run object detection over a video and build an activity journal.

    A frame is analysed when its index is a multiple of ``frame_interval`` or
    when ``is_frame_different`` reports that it differs from the last analysed
    frame. Each analysed frame is annotated with the detections and written to
    the ``detected_frames`` folder as ``frame_<index>.jpg``.

    Args:
        video_path: Path of the video file handed to ``cv2.VideoCapture``.
        frame_interval: Analyse every N-th frame regardless of similarity.

    Returns:
        A flat list. For every activity found it contains a heading string
        ``"**<activity>:**"`` followed by one ``(text, image_path)`` tuple per
        analysed frame that matched the activity, where ``text`` lists the
        timestamp and all objects detected in that frame.
    """
    output_folder = "detected_frames"
    os.makedirs(output_folder, exist_ok=True)  # Create folder to store images

    journal_entries = {}
    frame_count = 0
    last_processed_frame = None

    cap = cv2.VideoCapture(video_path)
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # Process every Nth frame or if the current frame is different from the last processed frame
            on_interval = frame_count % frame_interval == 0
            if on_interval or (
                last_processed_frame is not None
                and is_frame_different(last_processed_frame, frame)
            ):
                # Ultralytics treats ndarray sources as BGR (the OpenCV
                # layout), so the frame is passed as read, without an RGB
                # conversion that would swap the channels seen by the model.
                results = model.predict(source=frame, device=device)
                detections = results[0]

                # Draw boxes and labels; the plot keeps the input's BGR order,
                # which is what cv2.imwrite expects.
                annotated_frame = detections.plot()
                frame_filename = os.path.join(output_folder, f"frame_{frame_count}.jpg")
                cv2.imwrite(frame_filename, annotated_frame)

                # Extract labels (class indices) and map them to class names
                detected_objects = [model.names[int(box.cls)] for box in detections.boxes]

                # Get current timestamp in the video
                timestamp = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000  # Convert ms to seconds

                # Categorize the detected objects into activities
                activity_summary = categorize_activity(detected_objects)

                # Store the activities with their timestamp; objects[0] is the
                # list of labels wrapped by categorize_activity.
                for activity, objects in activity_summary.items():
                    journal_entries.setdefault(activity, []).append(
                        (f"At {timestamp:.2f} seconds: {', '.join(objects[0])}", frame_filename)
                    )

                last_processed_frame = frame  # Update the last processed frame

            frame_count += 1
    finally:
        # Always free the capture handle, even if detection or saving fails
        cap.release()

    # Create a formatted journal output: heading, then that activity's entries
    formatted_journal = []
    for activity, entries in journal_entries.items():
        formatted_journal.append(f"**{activity}:**")
        formatted_journal.extend(entries)

    return formatted_journal

# Gradio interface for uploading video and generating journal with images
def display_journal_with_images(video):
    """Turn the generated journal into items for a Gradio gallery.

    ``generate_journal_with_images`` returns a flat list mixing heading
    strings (``"**<activity>:**"``) with ``(text, image_path)`` tuples. The
    headings cannot be unpacked as pairs, so they are folded into the captions
    of the entries that follow them.

    Args:
        video: Path of the uploaded video, as provided by the video input.

    Returns:
        A list of ``(image_path, caption)`` tuples, the order a gallery
        expects for captioned images.
    """
    journal_with_images = generate_journal_with_images(video)

    display_items = []
    current_activity = ""
    for item in journal_with_images:
        if isinstance(item, str):
            # Heading row: remember the activity name without the markdown
            # asterisks and attach it to the following captions.
            current_activity = item.strip("*")
            continue

        entry, image_path = item
        caption = f"{current_activity} {entry}" if current_activity else entry
        # Gallery items are (image, caption), not (caption, image)
        display_items.append((image_path, caption))

    return display_items

# Build the Gradio UI: a video input, a gallery for the results, and a button
# that triggers journal generation.
with gr.Blocks() as iface:
    # Components are created in this order inside the Blocks context
    video_input = gr.Video(label="Upload Video", height=300)
    output_gallery = gr.Gallery(label="Generated Daily Journal with Images")
    run_button = gr.Button("Generate Journal")
    
    # On click, pass the uploaded video to display_journal_with_images and
    # send its return value to the gallery
    run_button.click(fn=display_journal_with_images, inputs=video_input, outputs=output_gallery)

# Start the app; this runs at module level, i.e. whenever the file is executed
iface.launch()