import os

import cv2
import gradio as gr
import spaces
import torch
from skimage.metrics import structural_similarity as ssim  # required by is_frame_different
from ultralytics import YOLOv10

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = YOLOv10.from_pretrained('jameslahm/yolov10x').to(device)

# Define activity categories based on detected objects
activity_categories = {
    "Working": ["laptop", "computer", "keyboard", "office chair"],
    "Meal Time": ["fork", "spoon", "plate", "food"],
    "Exercise": ["dumbbell", "bicycle", "yoga mat", "treadmill"],
    "Outdoors": ["car", "tree", "bicycle", "road"],
    # Add more categories and objects as needed
}


# Map detected objects to categorized activities
def categorize_activity(detected_objects):
    categorized_activities = {}
    for activity, objects in activity_categories.items():
        if any(obj in detected_objects for obj in objects):
            if activity not in categorized_activities:
                categorized_activities[activity] = []
            categorized_activities[activity].append(detected_objects)
    return categorized_activities


# Compare frames using SSIM to avoid processing near-identical frames
def is_frame_different(frame1, frame2, threshold=0.9):
    gray_frame1 = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY)
    gray_frame2 = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY)
    score, _ = ssim(gray_frame1, gray_frame2, full=True)
    return score < threshold


# Process the video, detect objects, and generate a categorized journal with images
@spaces.GPU
def generate_journal_with_images(video_path, frame_interval=30):
    cap = cv2.VideoCapture(video_path)
    journal_entries = {}
    saved_images = []
    frame_count = 0
    last_processed_frame = None
    output_folder = "detected_frames"
    os.makedirs(output_folder, exist_ok=True)  # Create folder to store annotated frames

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Process every Nth frame, or any frame that differs from the last processed frame
        if frame_count % frame_interval == 0 or (
            last_processed_frame is not None and is_frame_different(last_processed_frame, frame)
        ):
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Run YOLOv10 on the current frame
            results = model.predict(source=frame_rgb, device=device)

            # Plot bounding boxes and labels on the frame
            annotated_frame = results[0].plot()

            # Save the annotated image
            frame_filename = os.path.join(output_folder, f"frame_{frame_count}.jpg")
            cv2.imwrite(frame_filename, annotated_frame[:, :, ::-1])  # Convert back to BGR for saving
            saved_images.append(frame_filename)

            # Extract class indices and map them to class names
            detected_objects = [model.names[int(box.cls)] for box in results[0].boxes]

            # Get the current timestamp in the video (ms -> seconds)
            timestamp = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000

            # Categorize the detected objects into activities
            activity_summary = categorize_activity(detected_objects)

            # Store the activities with their timestamp and the saved frame
            for activity, objects in activity_summary.items():
                if activity not in journal_entries:
                    journal_entries[activity] = []
                journal_entries[activity].append(
                    (f"At {timestamp:.2f} seconds: {', '.join(objects[0])}", frame_filename)
                )

            last_processed_frame = frame  # Update the last processed frame

        frame_count += 1

    cap.release()

    # Flatten the journal into (caption, image_path) pairs, prefixing each caption with its activity
    formatted_journal = []
    for activity, entries in journal_entries.items():
        for entry, image_path in entries:
            formatted_journal.append((f"{activity} - {entry}", image_path))
    return formatted_journal


# Gradio interface for uploading a video and generating the journal with images
def display_journal_with_images(video):
    journal_with_images = generate_journal_with_images(video)

    # gr.Gallery expects (image, caption) tuples, so the image path goes first
    display_items = []
    for entry, image_path in journal_with_images:
        display_items.append((image_path, entry))
    return display_items


with gr.Blocks() as iface:
    video_input = gr.Video(label="Upload Video", height=300)
    output_gallery = gr.Gallery(label="Generated Daily Journal with Images")
    run_button = gr.Button("Generate Journal")
    run_button.click(fn=display_journal_with_images, inputs=video_input, outputs=output_gallery)

iface.launch()