"""Gradio app that turns an uploaded video into a timestamped activity journal.

Frames are sampled from the video, run through a YOLOv10 detector, and the
detected object names are mapped onto coarse activity categories. The
annotated frames are saved to disk and shown in a gallery next to the journal.
"""

import os
from typing import Dict, List, Tuple

import cv2
import gradio as gr
import spaces
import torch
from skimage.metrics import structural_similarity as ssim
from ultralytics import YOLOv10

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = YOLOv10.from_pretrained('jameslahm/yolov10x').to(device)

# Define activity categories based on detected objects.
# NOTE(review): a category can only match names the detector actually emits
# (model.names); several entries below may not be among them -- verify.
activity_categories = {
    "Working": ["laptop", "computer", "keyboard", "office chair"],
    "Meal Time": ["fork", "spoon", "plate", "food"],
    "Exercise": ["dumbbell", "bicycle", "yoga mat", "treadmill"],
    "Outdoors": ["car", "tree", "bicycle", "road"],
    # Add more categories and objects as needed
}


def categorize_activity(detected_objects: List[str]) -> Dict[str, List[List[str]]]:
    """Map detected object names to the activity categories they indicate.

    Args:
        detected_objects: Object names detected in a single frame.

    Returns:
        A dict keyed by every activity that has at least one of its trigger
        objects in ``detected_objects``. Each value is a one-element list
        holding the full ``detected_objects`` list (the shape callers index
        with ``[0]``). Empty when no category matches.
    """
    detected = set(detected_objects)
    return {
        activity: [detected_objects]
        for activity, trigger_objects in activity_categories.items()
        if not detected.isdisjoint(trigger_objects)
    }


def is_frame_different(frame1, frame2, threshold: float = 0.9) -> bool:
    """Return True when two BGR frames are structurally dissimilar.

    Both frames are converted to grayscale and compared with SSIM; the frames
    count as different when the score falls below ``threshold``.

    Args:
        frame1: First frame (BGR image array, as read by OpenCV).
        frame2: Second frame, same size as ``frame1``.
        threshold: SSIM score below which the frames are considered different.
    """
    gray_frame1 = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY)
    gray_frame2 = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY)
    # Only the scalar score is needed, so skip building the full SSIM map.
    score = ssim(gray_frame1, gray_frame2)
    return score < threshold


@spaces.GPU
def generate_journal_with_images(
    video_path: str, frame_interval: int = 30
) -> Tuple[List[str], List[str]]:
    """Detect objects in a video and build a categorized journal with images.

    A frame is processed when it is the Nth frame (``frame_interval``) or when
    it differs (by SSIM) from the last processed frame. Each processed frame
    is annotated and saved under ``detected_frames/``.

    Args:
        video_path: Path of the video file to analyse. A falsy value (e.g.
            ``None`` when nothing was uploaded) yields empty results.
        frame_interval: Process at least every Nth frame; must be >= 1.

    Returns:
        ``(journal_entries, image_paths)``: the journal lines and the paths of
        the saved annotated frames, both in processing order.

    Raises:
        ValueError: If ``frame_interval`` is smaller than 1.
    """
    if not video_path:
        return [], []
    if frame_interval < 1:
        raise ValueError("frame_interval must be a positive integer")

    output_folder = "detected_frames"
    os.makedirs(output_folder, exist_ok=True)  # Create folder to store images

    journal_entries = []
    image_paths = []
    frame_count = 0
    last_processed_frame = None

    cap = cv2.VideoCapture(video_path)
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # Process every Nth frame, or any frame that differs from the
            # last processed one.
            should_process = frame_count % frame_interval == 0 or (
                last_processed_frame is not None
                and is_frame_different(last_processed_frame, frame)
            )
            if should_process:
                # Ultralytics expects numpy inputs in BGR order (OpenCV's
                # native layout), so the frame is passed through unconverted.
                result = model.predict(source=frame, device=device)[0]

                # plot() draws on a copy of the input image, so the annotated
                # frame is BGR as well and can be written directly.
                annotated_frame = result.plot()
                frame_filename = os.path.join(output_folder, f"frame_{frame_count}.jpg")
                cv2.imwrite(frame_filename, annotated_frame)
                image_paths.append(frame_filename)

                # Map class indices of the detections to class names.
                detected_objects = [model.names[int(box.cls)] for box in result.boxes]

                # Capture position reported after reading this frame, in seconds.
                timestamp = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000

                # One journal line per matched activity; the activity name is
                # included so lines for different categories are distinguishable.
                activity_summary = categorize_activity(detected_objects)
                for activity, objects in activity_summary.items():
                    journal_entries.append(
                        f"At {timestamp:.2f} seconds: {activity} - {', '.join(objects[0])}"
                    )

                last_processed_frame = frame  # Update the last processed frame

            frame_count += 1
    finally:
        # Release the capture even if inference or file writing fails.
        cap.release()

    # Debug print to verify the return values
    print(f"journal_entries: {journal_entries}")
    print(f"image_paths: {image_paths}")

    return journal_entries, image_paths


def display_journal_with_images(video) -> Tuple[str, List[str]]:
    """Gradio callback: run the journal pipeline on the uploaded video.

    Args:
        video: Value of the ``gr.Video`` input, passed on as the video path.

    Returns:
        The journal as newline-joined text and the annotated frame paths.
    """
    journal_entries, image_paths = generate_journal_with_images(video, frame_interval=30)
    journal_text = "\n".join(journal_entries)
    return journal_text, image_paths


with gr.Blocks() as iface:
    video_input = gr.Video(label="Upload Video", height=300)
    journal_output = gr.Textbox(label="Generated Daily Journal", lines=10)
    image_gallery = gr.Gallery(label="Annotated Frames")
    run_button = gr.Button("Generate Journal")

    run_button.click(
        fn=display_journal_with_images,
        inputs=video_input,
        outputs=[journal_output, image_gallery],
    )

# Launch once; the original called launch() twice.
iface.launch()