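"""DailySnap: a Gradio demo that turns an uploaded video into an illustrated daily journal.

Pipeline (as implemented below):
  1. YOLOv10 detects objects in roughly one frame per second of the video.
  2. Detections are filtered by confidence, grouped into coarse activities,
     and turned into timestamped journal lines plus annotated frames.
  3. Llama-3.2-11B-Vision writes a narrative description of the first annotated frame.
  4. Qwen2.5-Coder renders that description as an HTML/Bootstrap infographic.

Note: running this end to end assumes a CUDA GPU (the @spaces.GPU decorators
target Hugging Face Spaces) and access to the gated Llama 3.2 vision model.
"""
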
import gradio as gr
from ultralytics import YOLOv10
from skimage.metrics import structural_similarity as ssim
import cv2
import torch
import os
import json
import spaces
import markdown
import requests
import io
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor, AutoModelForCausalLM, AutoTokenizer


device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = YOLOv10.from_pretrained('jameslahm/yolov10x').to(device)



model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

model_vision = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)


model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"
model_code = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)


SYSTEM_INSTRUCTION="You are DailySnap, an intelligent assistant tasked with analyzing images and generating a visually appealing daily journal entry based on the content of the image. Your job is to examine the given image, identify key elements, such as objects, people, and emotions, and then create a narrative that reflects the activities or events captured. The journal should include random timestamps (e.g., 7:15 AM, 1:30 PM) to create a chronological flow, giving the narrative a natural, realistic feel. The final output should be engaging and coherent, combining image analysis with descriptive storytelling to provide users with a meaningful daily journal based on the visual content"

def extract_assistant_reply(input_string):
    # Define the tag that indicates the start of the assistant's reply
    start_tag = "<|start_header_id|>assistant<|end_header_id|>"
    # Find the position where the assistant's reply starts
    start_index = input_string.find(start_tag)
    if start_index == -1:
        return "Assistant's reply not found."
    start_index += len(start_tag)
    # Extract everything after the start tag
    assistant_reply = input_string[start_index:].strip()
    return assistant_reply
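
# Illustrative example: the decoded Llama output wraps the reply in header tags,
# e.g. "...<|start_header_id|>assistant<|end_header_id|>\nHello there<|eot_id|>".
# extract_assistant_reply() returns everything after the assistant header
# (special tokens such as <|eot_id|> are left in place).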

def extract_json_from_markdown(markdown_text):
    try:
        start_idx = markdown_text.find('```')
        end_idx = markdown_text.find('```', start_idx + 3)

        if markdown_text[start_idx:start_idx + 7] == '```html':
            start_idx += len('```html')
        else:
            start_idx += len('```')

        # Extract and clean up the fenced code block (JSON or otherwise)
        json_str = markdown_text[start_idx:end_idx].strip()

        # Try to parse it as JSON
        return json.loads(json_str)
    except Exception as e:
        print(f"Error extracting JSON: {e}")
        return None
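
# Illustrative example (hypothetical model output):
#   extract_json_from_markdown('Sure:\n```\n{"mood": "calm"}\n```')  ->  {"mood": "calm"}
# Both plain ``` and ```html fences are recognised, but a value is only returned
# when the fenced content parses as JSON; this helper is not wired into the
# Gradio flow below.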



@spaces.GPU
def generate_image_desc(image):
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": SYSTEM_INSTRUCTION}
        ]}
    ]
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(image, input_text, return_tensors="pt").to(model_vision.device)

    # Generate the output from the model
    output = model_vision.generate(**inputs, max_new_tokens=300)
    print(output)
    markdown_text = processor.decode(output[0])
    print(markdown_text)
        
    markdown_text=extract_assistant_reply(markdown_text)
    html_output = markdown.markdown(markdown_text)
    return html_output

@spaces.GPU
def generate_journal_infographics(journal):
    prompt = f"Generate daily journal inforgraphics using html for the following:\n\n{journal}"
    
    messages = [
        {"role": "system", "content": "You are DailySnap, a highly efficient and intelligent assistant designed to generate visually appealing daily journals and infographics. Your primary function is to transform user-provided details into structured and aesthetically engaging content. When generating infographics, you must use HTML and Bootstrap icons to create visually compelling and clear representations of the user's data or ideas. For daily journals, you should organize the information into an appealing, easy-to-read format that incorporates icons, headings, and layouts based on the user’s preferences. Your designs should always focus on clarity, creativity, and user-centric formatting, ensuring the final product is both functional and visually engaging. Your ultimate goal is to help users effortlessly convert their daily activities and narratives into attractive visual content with minimal guidance."},
        {"role": "user", "content": prompt}
    ]
    
    # Prepare inputs for the model
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([text], return_tensors="pt").to(model_code.device)

    # Generate the infographic HTML
    generated_ids = model_code.generate(**model_inputs, max_new_tokens=4000)
    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
    infographics_html = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(infographics_html)
    return infographics_html



# Define activity categories based on detected objects
activity_categories = {
    "Working": ["laptop", "computer", "keyboard", "office chair"],
    "Meal Time": ["fork", "spoon", "plate", "food"],
    "Exercise": ["dumbbell", "bicycle", "yoga mat", "treadmill"],
    "Outdoors": ["car", "tree", "bicycle", "road","subway","metro"],
    # Add more categories and objects as needed
}

# Function to map detected objects to categorized activities
def categorize_activity(detected_objects):
    categorized_activities = {}

    for activity, objects in activity_categories.items():
        # Keep only the detected objects that belong to this activity
        matched = [obj for obj in detected_objects if obj in objects]
        if matched:
            categorized_activities.setdefault(activity, []).extend(matched)

    return categorized_activities
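
# Illustrative example (using the categories defined above):
#   categorize_activity(["laptop", "keyboard", "fork"])
#   -> {"Working": ["laptop", "keyboard"], "Meal Time": ["fork"]}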


# Function to compare frames using SSIM to avoid repeated frames
def is_frame_different(frame1, frame2, threshold=0.9):
    gray_frame1 = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY)
    gray_frame2 = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY)
    score, _ = ssim(gray_frame1, gray_frame2, full=True)
    return score < threshold 
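
# SSIM scores identical grayscale frames as 1.0; anything below `threshold`
# counts as "different enough" to keep. Note that the journal generator below
# samples one frame per second instead of calling this helper.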


# Function to process the video, detect objects, and generate a categorized journal with images
@spaces.GPU
def generate_journal_with_images(video_path, frame_interval=30, confidence_threshold=0.8):  # frame_interval is currently unused; frames are sampled once per second
    cap = cv2.VideoCapture(video_path)
    journal_entries = []
    image_paths = []
    frame_count = 0
    output_folder = "detected_frames"
    os.makedirs(output_folder, exist_ok=True)  # Create folder to store images

    last_processed_second = -1  # Keep track of the last processed second
    
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        
        # Get the current timestamp in the video
        current_time = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000  # Convert ms to seconds
        current_second = int(current_time)  # Round down to the nearest second

        # Process only one frame per second
        if current_second > last_processed_second:
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            
            # Make predictions using YOLOv10 on the current frame
            results = model.predict(source=frame_rgb, device=device)
            
            # Filter detected objects based on confidence threshold
            detected_objects = []
            for box in results[0].boxes:
                if box.conf >= confidence_threshold:  # Only keep detections at or above the confidence threshold
                    detected_objects.append(model.names[int(box.cls)])
            
            # Only process frames where objects with confidence >= threshold are detected
            if detected_objects:  # If there are high-confidence detected objects
                
                # Plot bounding boxes and labels on the image
                annotated_frame = results[0].plot()  # Plot detection results on the frame
                
                # Save the annotated image
                frame_filename = os.path.join(output_folder, f"frame_{frame_count}.jpg")
                cv2.imwrite(frame_filename, annotated_frame[:, :, ::-1])  # Convert back to BGR for saving
                image_paths.append(frame_filename)
                
                # Categorize the detected objects into activities
                activity_summary = categorize_activity(detected_objects)
                
                # Store each activity with its timestamp and the objects that triggered it
                for activity, objects in activity_summary.items():
                    journal_entries.append(f"At {current_time:.2f} seconds [{activity}]: {', '.join(objects)}")
            
            last_processed_second = current_second  # Update the last processed second
        
        frame_count += 1
    
    cap.release()
    
    return journal_entries, image_paths  
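
# Each journal entry produced above looks roughly like:
#   "At 12.00 seconds [Working]: laptop, keyboard"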


def display_journal_with_images(video):
    journal_entries, image_paths = generate_journal_with_images(video, frame_interval=30)
    pil_images = []
    for image_path in image_paths:
        # Read the image using OpenCV
        image = cv2.imread(image_path)
        # Convert the image from BGR (OpenCV) to RGB (PIL)
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        # Convert the NumPy array to a PIL image
        pil_image = Image.fromarray(image_rgb)
        pil_images.append(pil_image)

    infographic_html = ''

    if len(pil_images) >= 2:  # mock-up condition: only build the infographic when at least two frames were captured
        first_frame_detail = generate_image_desc(pil_images[0])
        infographic_html = generate_journal_infographics(first_frame_detail)

    journal_text = "\n".join(journal_entries)
    return journal_text, image_paths, infographic_html


with gr.Blocks() as iface:
    video_input = gr.Video(label="Upload Video", height=300)  
    journal_output = gr.Textbox(label="Generated Daily Journal", lines=10)
    image_gallery = gr.Gallery(label="Annotated Frames")
    run_button = gr.Button("Generate Journal")
    infographic_html = gr.HTML()

    run_button.click(fn=display_journal_with_images, inputs=video_input, outputs=[journal_output, image_gallery, infographic_html])

iface.launch()