"""DailySnap: a Gradio app that detects objects in an uploaded video with YOLOv10,
builds a timestamped activity journal, describes a representative frame with
Llama 3.2 Vision, and renders the journal as an HTML infographic with Qwen2.5-Coder."""

import json
import os

import cv2
import gradio as gr
import markdown
import spaces
import torch
from PIL import Image
from skimage.metrics import structural_similarity as ssim
from transformers import (
    AutoModelForCausalLM,
    AutoProcessor,
    AutoTokenizer,
    MllamaForConditionalGeneration,
)
from ultralytics import YOLOv10

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Object detector used on video frames
model = YOLOv10.from_pretrained('jameslahm/yolov10x').to(device)

# Vision-language model used to describe a representative frame
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
model_vision = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)

# Code model used to turn the journal text into HTML infographics
model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"
model_code = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

SYSTEM_INSTRUCTION = "You are DailySnap, an intelligent assistant tasked with analyzing images and generating a visually appealing daily journal entry based on the content of the image. Your job is to examine the given image, identify key elements such as objects, people, and emotions, and then create a narrative that reflects the activities or events captured. The journal should include random timestamps (e.g., 7:15 AM, 1:30 PM) to create a chronological flow, giving the narrative a natural, realistic feel. The final output should be engaging and coherent, combining image analysis with descriptive storytelling to provide users with a meaningful daily journal based on the visual content."


def extract_assistant_reply(input_string):
    # Tag that marks the start of the assistant's reply in the decoded output
    start_tag = "<|start_header_id|>assistant<|end_header_id|>"
    # Find the position where the assistant's reply starts
    start_index = input_string.find(start_tag)
    if start_index == -1:
        return "Assistant's reply not found."
    start_index += len(start_tag)
    # Everything after the tag is the reply
    return input_string[start_index:].strip()


def extract_json_from_markdown(markdown_text):
    try:
        start_idx = markdown_text.find('```')
        end_idx = markdown_text.find('```', start_idx + 3)
        if markdown_text[start_idx:start_idx + 7] == '```html':
            start_idx += len('```html')
        else:
            start_idx += len('```')
        # Extract and clean up the fenced code block, then try to parse it as JSON
        json_str = markdown_text[start_idx:end_idx].strip()
        return json.loads(json_str)
    except Exception as e:
        print(f"Error extracting JSON: {e}")
        return None


@spaces.GPU
def generate_image_desc(image):
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": SYSTEM_INSTRUCTION}
        ]}
    ]
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(image, input_text, return_tensors="pt").to(model_vision.device)

    # Generate the journal text from the vision model
    output = model_vision.generate(**inputs, max_new_tokens=300)
    markdown_text = processor.decode(output[0])
    print(markdown_text)
    markdown_text = extract_assistant_reply(markdown_text)
    html_output = markdown.markdown(markdown_text)
    return html_output


@spaces.GPU
def generate_journal_infographics(journal):
    prompt = f"Generate daily journal infographics using html for the following:\n\n{journal}"
    messages = [
        {"role": "system", "content": "You are DailySnap, a highly efficient and intelligent assistant designed to generate visually appealing daily journals and infographics. Your primary function is to transform user-provided details into structured and aesthetically engaging content. When generating infographics, you must use HTML and Bootstrap icons to create visually compelling and clear representations of the user's data or ideas. For daily journals, you should organize the information into an appealing, easy-to-read format that incorporates icons, headings, and layouts based on the user's preferences. Your designs should always focus on clarity, creativity, and user-centric formatting, ensuring the final product is both functional and visually engaging. Your ultimate goal is to help users effortlessly convert their daily activities and narratives into attractive visual content with minimal guidance."},
        {"role": "user", "content": prompt}
    ]

    # Prepare inputs for the code model
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([text], return_tensors="pt").to(model_code.device)

    # Generate the infographic HTML
    generated_ids = model_code.generate(**model_inputs, max_new_tokens=4000)
    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
    infographic = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(infographic)
    return infographic


# Activity categories keyed by the detected objects that indicate them
activity_categories = {
    "Working": ["laptop", "computer", "keyboard", "office chair"],
    "Meal Time": ["fork", "spoon", "plate", "food"],
    "Exercise": ["dumbbell", "bicycle", "yoga mat", "treadmill"],
    "Outdoors": ["car", "tree", "bicycle", "road", "subway", "metro"],
    # Add more categories and objects as needed
}


# Map detected objects to categorized activities
def categorize_activity(detected_objects):
    categorized_activities = {}
    for activity, objects in activity_categories.items():
        if any(obj in detected_objects for obj in objects):
            if activity not in categorized_activities:
                categorized_activities[activity] = []
            categorized_activities[activity].append(detected_objects)
    return categorized_activities


# Compare frames using SSIM to avoid processing near-duplicate frames
def is_frame_different(frame1, frame2, threshold=0.9):
    gray_frame1 = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY)
    gray_frame2 = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY)
    score, _ = ssim(gray_frame1, gray_frame2, full=True)
    return score < threshold


# Process the video, detect objects, and generate a categorized journal with annotated images
@spaces.GPU
def generate_journal_with_images(video_path, frame_interval=30, confidence_threshold=0.8):
    cap = cv2.VideoCapture(video_path)
    journal_entries = []
    image_paths = []
    frame_count = 0
    output_folder = "detected_frames"
    os.makedirs(output_folder, exist_ok=True)  # Folder for the annotated frames

    last_processed_second = -1  # Last second of video that has been processed

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Current timestamp in the video
        current_time = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000  # Convert ms to seconds
        current_second = int(current_time)  # Round down to the nearest second

        # Process only one frame per second
        if current_second > last_processed_second:
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Run YOLOv10 on the current frame
            results = model.predict(source=frame_rgb, device=device)

            # Keep only objects detected above the confidence threshold
            detected_objects = []
            for box in results[0].boxes:
                if box.conf >= confidence_threshold:
                    detected_objects.append(model.names[int(box.cls)])

            # Only keep frames where at least one high-confidence object was detected
            if detected_objects:
                # Draw bounding boxes and labels on the frame
                annotated_frame = results[0].plot()

                # Save the annotated image
                frame_filename = os.path.join(output_folder, f"frame_{frame_count}.jpg")
                cv2.imwrite(frame_filename, annotated_frame[:, :, ::-1])  # Convert back to BGR for saving
                image_paths.append(frame_filename)

                # Categorize the detected objects into activities
                activity_summary = categorize_activity(detected_objects)

                # Store the activities with their timestamp
                for activity, objects in activity_summary.items():
                    journal_entries.append(f"At {current_time:.2f} seconds ({activity}): {', '.join(objects[0])}")

            last_processed_second = current_second  # Update the last processed second

        frame_count += 1

    cap.release()
    return journal_entries, image_paths


def display_journal_with_images(video):
    journal_entries, image_paths = generate_journal_with_images(video, frame_interval=30)

    pil_images = []
    for image_path in image_paths:
        # Read the image with OpenCV and convert BGR -> RGB for PIL
        image = cv2.imread(image_path)
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        pil_images.append(Image.fromarray(image_rgb))

    infographic_html = ''
    if len(pil_images) >= 2:  # just for mockup
        first_frame_detail = generate_image_desc(pil_images[0])
        infographic_html = generate_journal_infographics(first_frame_detail)

    journal_text = "\n".join(journal_entries)
    return journal_text, image_paths, infographic_html


with gr.Blocks() as iface:
    video_input = gr.Video(label="Upload Video", height=300)
    journal_output = gr.Textbox(label="Generated Daily Journal", lines=10)
    image_gallery = gr.Gallery(label="Annotated Frames")
    run_button = gr.Button("Generate Journal")
    infographic_output = gr.HTML()

    run_button.click(
        fn=display_journal_with_images,
        inputs=video_input,
        outputs=[journal_output, image_gallery, infographic_output],
    )

iface.launch()