import gradio as gr
from ultralytics import YOLOv10
from skimage.metrics import structural_similarity as ssim
import cv2
import torch
import os
import spaces
import markdown
import requests
import json
import io
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor, AutoModelForCausalLM, AutoTokenizer

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# YOLOv10 object-detection model
model = YOLOv10.from_pretrained('jameslahm/yolov10x').to(device)

# Vision-language model used to describe frames
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
model_vision = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)

# Code model used to generate the infographic HTML
model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"
model_code = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

SYSTEM_INSTRUCTION = "You are DailySnap, your job is to analyse the given image and provide a daily journal about the image and use some random time"


def extract_assistant_reply(input_string):
    # Define the tag that indicates the start of the assistant's reply
    start_tag = "<|start_header_id|>assistant<|end_header_id|>"
    # Find the position where the assistant's reply starts
    start_index = input_string.find(start_tag)
    if start_index == -1:
        return "Assistant's reply not found."
    start_index += len(start_tag)
    # Extract everything after the start tag
    assistant_reply = input_string[start_index:].strip()
    return assistant_reply


def extract_json_from_markdown(markdown_text):
    try:
        start_idx = markdown_text.find('```')
        end_idx = markdown_text.find('```', start_idx + 3)
        if markdown_text[start_idx:start_idx + 7] == '```html':
            start_idx += len('```html')
        else:
            start_idx += len('```')
        # Extract and clean up the code block (json or not)
        json_str = markdown_text[start_idx:end_idx].strip()
        # Try to load it as JSON
        return json.loads(json_str)
    except Exception as e:
        print(f"Error extracting JSON: {e}")
        return None


@spaces.GPU
def generate_image_desc(image):
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": SYSTEM_INSTRUCTION}
        ]}
    ]
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(image, input_text, return_tensors="pt").to(model_vision.device)

    # Generate the output from the vision model
    output = model_vision.generate(**inputs, max_new_tokens=300)
    print(output)
    markdown_text = processor.decode(output[0])
    print(markdown_text)
    markdown_text = extract_assistant_reply(markdown_text)
    html_output = markdown.markdown(markdown_text)
    return html_output


@spaces.GPU
def generate_journal_infographics(journal):
    prompt = f"Generate daily journal infographics using html for the following:\n\n{journal}"
    messages = [
        {"role": "system", "content": "You are DailySnap, a highly efficient and intelligent assistant designed to generate infographics using html and bootstrap icons and to generate a highly appealing daily journal as per the user detail"},
        {"role": "user", "content": prompt}
    ]

    # Prepare inputs for the model
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([text], return_tensors="pt").to(model_code.device)

    # Generate the infographic HTML
    generated_ids = model_code.generate(**model_inputs, max_new_tokens=4000)
    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
    documentation = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(documentation)
    return documentation
# Define activity categories based on detected objects
activity_categories = {
    "Working": ["laptop", "computer", "keyboard", "office chair"],
    "Meal Time": ["fork", "spoon", "plate", "food"],
    "Exercise": ["dumbbell", "bicycle", "yoga mat", "treadmill"],
    "Outdoors": ["car", "tree", "bicycle", "road", "subway", "metro"],
    # Add more categories and objects as needed
}


# Function to map detected objects to categorized activities
def categorize_activity(detected_objects):
    categorized_activities = {}
    for activity, objects in activity_categories.items():
        if any(obj in detected_objects for obj in objects):
            if activity not in categorized_activities:
                categorized_activities[activity] = []
            categorized_activities[activity].append(detected_objects)
    return categorized_activities


# Function to compare frames using SSIM to avoid repeated frames
def is_frame_different(frame1, frame2, threshold=0.9):
    gray_frame1 = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY)
    gray_frame2 = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY)
    score, _ = ssim(gray_frame1, gray_frame2, full=True)
    return score < threshold


# Function to process the video, detect objects, and generate a categorized journal with images
@spaces.GPU
def generate_journal_with_images(video_path, frame_interval=30, confidence_threshold=0.8):
    cap = cv2.VideoCapture(video_path)
    journal_entries = []
    image_paths = []
    frame_count = 0
    output_folder = "detected_frames"
    os.makedirs(output_folder, exist_ok=True)  # Create folder to store images

    last_processed_second = -1  # Keep track of the last processed second

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Get the current timestamp in the video
        current_time = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000  # Convert ms to seconds
        current_second = int(current_time)  # Round down to the nearest second

        # Process only one frame per second
        if current_second > last_processed_second:
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Make predictions using YOLOv10 on the current frame
            results = model.predict(source=frame_rgb, device=device)

            # Filter detected objects based on confidence threshold
            detected_objects = []
            for box in results[0].boxes:
                if box.conf >= confidence_threshold:  # Only include objects with confidence >= threshold
                    detected_objects.append(model.names[int(box.cls)])

            # Only process frames where objects with confidence >= threshold are detected
            if detected_objects:  # If there are high-confidence detected objects
                # Plot bounding boxes and labels on the image
                annotated_frame = results[0].plot()  # Plot detection results on the frame

                # Save the annotated image
                frame_filename = os.path.join(output_folder, f"frame_{frame_count}.jpg")
                cv2.imwrite(frame_filename, annotated_frame[:, :, ::-1])  # Convert back to BGR for saving
                image_paths.append(frame_filename)

                # Categorize the detected objects into activities
                activity_summary = categorize_activity(detected_objects)

                # Store the activities with their timestamp
                for activity, objects in activity_summary.items():
                    journal_entries.append(f"At {current_time:.2f} seconds: {', '.join(objects[0])}")

            last_processed_second = current_second  # Update the last processed second

        frame_count += 1

    cap.release()
    return journal_entries, image_paths


def display_journal_with_images(video):
    journal_entries, image_paths = generate_journal_with_images(video, frame_interval=30)
    pil_images = []
    for image_path in image_paths:
        # Read the image using OpenCV
        image = cv2.imread(image_path)
        # Convert the image from BGR (OpenCV) to RGB (PIL)
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        # Convert the NumPy array to a PIL image
        pil_image = Image.fromarray(image_rgb)
        pil_images.append(pil_image)

    infographic_html = ''
    if len(pil_images) >= 2:  # just for mockup: describe the first two annotated frames
        first_frame_detail = generate_image_desc(pil_images[0])
        second_frame_detail = generate_image_desc(pil_images[1])
        infographic_html = generate_journal_infographics(first_frame_detail + "\n\n" + second_frame_detail)

    journal_text = "\n".join(journal_entries)
    return journal_text, image_paths, infographic_html


with gr.Blocks() as iface:
    video_input = gr.Video(label="Upload Video", height=300)
    journal_output = gr.Textbox(label="Generated Daily Journal", lines=10)
    image_gallery = gr.Gallery(label="Annotated Frames")
    run_button = gr.Button("Generate Journal")
    infographic_html = gr.HTML()

    run_button.click(fn=display_journal_with_images, inputs=video_input, outputs=[journal_output, image_gallery, infographic_html])

iface.launch()