import gradio as gr
from ultralytics import YOLOv10
from skimage.metrics import structural_similarity as ssim
import cv2
import torch
import os
import spaces


device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = YOLOv10.from_pretrained('jameslahm/yolov10x').to(device)

# Define activity categories based on detected objects
activity_categories = {
    "Working": ["laptop", "computer", "keyboard", "office chair"],
    "Meal Time": ["fork", "spoon", "plate", "food"],
    "Exercise": ["dumbbell", "bicycle", "yoga mat", "treadmill"],
    "Outdoors": ["car", "tree", "bicycle", "road","subway","metro"],
    # Add more categories and objects as needed
}
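# Note: category entries only match labels that appear in model.names. The
# pretrained jameslahm/yolov10x checkpoint uses the 80 COCO classes, so labels
# like "laptop", "keyboard", "fork", "spoon", "bicycle", and "car" can match,
# while terms such as "office chair", "dumbbell", "yoga mat", or "treadmill"
# are not COCO classes and will never be detected without fine-tuning.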

# Function to map detected objects to categorized activities
def categorize_activity(detected_objects):
    categorized_activities = {}

    for activity, objects in activity_categories.items():
        # Keep only the detected objects that belong to this activity
        matched = [obj for obj in detected_objects if obj in objects]
        if matched:
            categorized_activities[activity] = matched

    return categorized_activities
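# Illustrative example: with the categories defined above,
#   categorize_activity(["laptop", "fork", "car"])
# returns {"Working": ["laptop"], "Meal Time": ["fork"], "Outdoors": ["car"]}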


# Function to compare frames using SSIM to avoid repeated frames
def is_frame_different(frame1, frame2, threshold=0.9):
    gray_frame1 = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY)
    gray_frame2 = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY)
    score, _ = ssim(gray_frame1, gray_frame2, full=True)
    return score < threshold 
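# SSIM returns 1.0 for identical grayscale frames, so a score below the 0.9
# threshold marks the frame as "different". Lowering the threshold treats more
# near-duplicates as identical and therefore skips more frames.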


# Function to process the video, detect objects, and generate a categorized journal with images
@spaces.GPU
def generate_journal_with_images(video_path, frame_interval=30, confidence_threshold=0.8):
    cap = cv2.VideoCapture(video_path)
    journal_entries = []
    image_paths = []
    frame_count = 0
    last_processed_frame = None
    output_folder = "detected_frames"
    os.makedirs(output_folder, exist_ok=True)  # Create folder to store images
    
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        
        # Sample every Nth frame, and skip it if it is nearly identical to the last processed frame
        if frame_count % frame_interval == 0 and (last_processed_frame is None or is_frame_different(last_processed_frame, frame)):
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            
            # Make predictions using YOLOv10 on the current frame
            results = model.predict(source=frame_rgb, device=device)
            
            # Filter detected objects based on confidence threshold
            detected_objects = []
            for box in results[0].boxes:
                if float(box.conf) >= confidence_threshold:  # Only keep detections at or above the confidence threshold
                    detected_objects.append(model.names[int(box.cls)])
            
            # Only process frames where objects with confidence >= threshold are detected
            if detected_objects:  # If there are high-confidence detected objects
                
                # Plot bounding boxes and labels on the image
                annotated_frame = results[0].plot()  # Plot detection results on the frame
                
                # Save the annotated image
                frame_filename = os.path.join(output_folder, f"frame_{frame_count}.jpg")
                cv2.imwrite(frame_filename, annotated_frame[:, :, ::-1])  # Convert back to BGR for saving
                image_paths.append(frame_filename)
                
                # Get current timestamp in the video
                timestamp = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000  # Convert ms to seconds
                
                # Categorize the detected objects into activities
                activity_summary = categorize_activity(detected_objects)
                
                # Store the activities with their timestamp
                for activity, objects in activity_summary.items():
                    journal_entries.append(f"At {timestamp:.2f} seconds ({activity}): {', '.join(objects)}")
            
            last_processed_frame = frame  # Update the last processed frame
        
        frame_count += 1
    
    cap.release()
    
    return journal_entries, image_paths 
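# Illustrative output: journal_entries might look like
#   ["At 1.00 seconds (Working): laptop, keyboard",
#    "At 4.50 seconds (Outdoors): car, bicycle"]
# with image_paths pointing at the saved annotated frames, e.g.
#   ["detected_frames/frame_30.jpg", "detected_frames/frame_60.jpg"]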


def display_journal_with_images(video):
    journal_entries, image_paths = generate_journal_with_images(video, frame_interval=30)
    

    journal_text = "\n".join(journal_entries)
    return journal_text, image_paths


with gr.Blocks() as iface:
    video_input = gr.Video(label="Upload Video", height=300)  
    journal_output = gr.Textbox(label="Generated Daily Journal", lines=10)
    image_gallery = gr.Gallery(label="Annotated Frames")
    run_button = gr.Button("Generate Journal")
    
    run_button.click(fn=display_journal_with_images, inputs=video_input, outputs=[journal_output, image_gallery])

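# launch() starts a local Gradio server; pass share=True for a temporary
# public URL when running outside Hugging Face Spaces.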
iface.launch()
