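"""DailySnap: a Gradio demo that turns an uploaded video into an illustrated daily journal.

Pipeline (as implemented below):
  1. YOLOv10 detects objects in roughly one frame per second of the video.
  2. Detections are filtered by confidence, grouped into coarse activities,
     and turned into timestamped journal lines plus annotated frames.
  3. Llama-3.2-11B-Vision writes a narrative description of the first annotated frame.
  4. Qwen2.5-Coder renders that description as an HTML/Bootstrap infographic.

Note: running this end to end assumes a CUDA GPU (the @spaces.GPU decorators
target Hugging Face Spaces) and access to the gated Llama 3.2 vision model.
"""
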
import gradio as gr
from ultralytics import YOLOv10
from skimage.metrics import structural_similarity as ssim
import cv2
import torch
import os
import json
import spaces
import markdown
import requests
import io
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor, AutoModelForCausalLM, AutoTokenizer


device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = YOLOv10.from_pretrained('jameslahm/yolov10x').to(device)



model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

model_vision = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)


model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"
model_code = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)


SYSTEM_INSTRUCTION="You are DailySnap, an intelligent assistant tasked with analyzing images and generating a visually appealing daily journal entry based on the content of the image. Your job is to examine the given image, identify key elements, such as objects, people, and emotions, and then create a narrative that reflects the activities or events captured. The journal should include random timestamps (e.g., 7:15 AM, 1:30 PM) to create a chronological flow, giving the narrative a natural, realistic feel. The final output should be engaging and coherent, combining image analysis with descriptive storytelling to provide users with a meaningful daily journal based on the visual content"

def extract_assistant_reply(input_string):
    # Define the tag that indicates the start of the assistant's reply
    start_tag = "<|start_header_id|>assistant<|end_header_id|>"
    # Find the position where the assistant's reply starts
    start_index = input_string.find(start_tag)
    if start_index == -1:
        return "Assistant's reply not found."
    start_index += len(start_tag)
    # Extract everything after the start tag
    assistant_reply = input_string[start_index:].strip()
    return assistant_reply
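
# Illustrative example: the decoded Llama output wraps the reply in header tags,
# e.g. "...<|start_header_id|>assistant<|end_header_id|>\nHello there<|eot_id|>".
# extract_assistant_reply() returns everything after the assistant header
# (special tokens such as <|eot_id|> are left in place).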

def extract_json_from_markdown(markdown_text):
    try:
        start_idx = markdown_text.find('```')
        end_idx = markdown_text.find('```', start_idx + 3)

        if markdown_text[start_idx:start_idx + 7] == '```html':
            start_idx += len('```html')
        else:
            start_idx += len('```')

        # Extract and clean up the fenced code block (JSON or otherwise)
        json_str = markdown_text[start_idx:end_idx].strip()

        # Try to parse it as JSON
        return json.loads(json_str)
    except Exception as e:
        print(f"Error extracting JSON: {e}")
        return None
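
# Illustrative example (hypothetical model output):
#   extract_json_from_markdown('Sure:\n```\n{"mood": "calm"}\n```')  ->  {"mood": "calm"}
# Both plain ``` and ```html fences are recognised, but a value is only returned
# when the fenced content parses as JSON; this helper is not wired into the
# Gradio flow below.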



@spaces.GPU
def generate_image_desc(image):
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": SYSTEM_INSTRUCTION}
        ]}
    ]
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(image, input_text, return_tensors="pt").to(model_vision.device)

    # Generate the output from the model
    output = model_vision.generate(**inputs, max_new_tokens=300)
    print(output)
    markdown_text = processor.decode(output[0])
    print(markdown_text)
        
    markdown_text=extract_assistant_reply(markdown_text)
    html_output = markdown.markdown(markdown_text)
    return html_output

@spaces.GPU
def generate_journal_infographics(journal):
    prompt = f"Generate daily journal inforgraphics using html for the following:\n\n{journal}"
    
    messages = [
        {"role": "system", "content": "You are DailySnap, a highly efficient and intelligent assistant designed to generate visually appealing daily journals and infographics. Your primary function is to transform user-provided details into structured and aesthetically engaging content. When generating infographics, you must use HTML and Bootstrap icons to create visually compelling and clear representations of the user's data or ideas. For daily journals, you should organize the information into an appealing, easy-to-read format that incorporates icons, headings, and layouts based on the user’s preferences. Your designs should always focus on clarity, creativity, and user-centric formatting, ensuring the final product is both functional and visually engaging. Your ultimate goal is to help users effortlessly convert their daily activities and narratives into attractive visual content with minimal guidance."},
        {"role": "user", "content": prompt}
    ]
    
    # Prepare inputs for the model
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([text], return_tensors="pt").to(model_code.device)

    # Generate the infographic HTML
    generated_ids = model_code.generate(**model_inputs, max_new_tokens=4000)
    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
    infographics_html = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(infographics_html)
    return infographics_html



# Define activity categories based on detected objects
activity_categories = {
    "Working": ["laptop", "computer", "keyboard", "office chair"],
    "Meal Time": ["fork", "spoon", "plate", "food"],
    "Exercise": ["dumbbell", "bicycle", "yoga mat", "treadmill"],
    "Outdoors": ["car", "tree", "bicycle", "road","subway","metro"],
    # Add more categories and objects as needed
}

# Function to map detected objects to categorized activities
def categorize_activity(detected_objects):
    categorized_activities = {}

    for activity, objects in activity_categories.items():
        # Keep only the detected objects that belong to this activity
        matched = [obj for obj in detected_objects if obj in objects]
        if matched:
            categorized_activities.setdefault(activity, []).extend(matched)

    return categorized_activities
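
# Illustrative example (using the categories defined above):
#   categorize_activity(["laptop", "keyboard", "fork"])
#   -> {"Working": ["laptop", "keyboard"], "Meal Time": ["fork"]}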


# Function to compare frames using SSIM to avoid repeated frames
def is_frame_different(frame1, frame2, threshold=0.9):
    gray_frame1 = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY)
    gray_frame2 = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY)
    score, _ = ssim(gray_frame1, gray_frame2, full=True)
    return score < threshold 
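
# SSIM scores identical grayscale frames as 1.0; anything below `threshold`
# counts as "different enough" to keep. Note that the journal generator below
# samples one frame per second instead of calling this helper.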


# Function to process the video, detect objects, and generate a categorized journal with images
@spaces.GPU
def generate_journal_with_images(video_path, frame_interval=30, confidence_threshold=0.8):  # frame_interval is currently unused; frames are sampled once per second
    cap = cv2.VideoCapture(video_path)
    journal_entries = []
    image_paths = []
    frame_count = 0
    output_folder = "detected_frames"
    os.makedirs(output_folder, exist_ok=True)  # Create folder to store images

    last_processed_second = -1  # Keep track of the last processed second
    
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        
        # Get the current timestamp in the video
        current_time = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000  # Convert ms to seconds
        current_second = int(current_time)  # Round down to the nearest second

        # Process only one frame per second
        if current_second > last_processed_second:
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            
            # Make predictions using YOLOv10 on the current frame
            results = model.predict(source=frame_rgb, device=device)
            
            # Filter detected objects based on confidence threshold
            detected_objects = []
            for box in results[0].boxes:
                if box.conf >= confidence_threshold:  # Only keep detections at or above the confidence threshold
                    detected_objects.append(model.names[int(box.cls)])
            
            # Only process frames where objects with confidence >= threshold are detected
            if detected_objects:  # If there are high-confidence detected objects
                
                # Plot bounding boxes and labels on the image
                annotated_frame = results[0].plot()  # Plot detection results on the frame
                
                # Save the annotated image
                frame_filename = os.path.join(output_folder, f"frame_{frame_count}.jpg")
                cv2.imwrite(frame_filename, annotated_frame[:, :, ::-1])  # Convert back to BGR for saving
                image_paths.append(frame_filename)
                
                # Categorize the detected objects into activities
                activity_summary = categorize_activity(detected_objects)
                
                # Store each activity with its timestamp and the objects that triggered it
                for activity, objects in activity_summary.items():
                    journal_entries.append(f"At {current_time:.2f} seconds [{activity}]: {', '.join(objects)}")
            
            last_processed_second = current_second  # Update the last processed second
        
        frame_count += 1
    
    cap.release()
    
    return journal_entries, image_paths  
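
# Each journal entry produced above looks roughly like:
#   "At 12.00 seconds [Working]: laptop, keyboard"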


def display_journal_with_images(video):
    journal_entries, image_paths = generate_journal_with_images(video, frame_interval=30)
    pil_images = []
    for image_path in image_paths:
        # Read the image using OpenCV
        image = cv2.imread(image_path)
        # Convert the image from BGR (OpenCV) to RGB (PIL)
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        # Convert the NumPy array to a PIL image
        pil_image = Image.fromarray(image_rgb)
        pil_images.append(pil_image)

    infographic_html = ''

    if len(pil_images) >= 2:  # mock-up condition: only build the infographic when at least two frames were captured
        first_frame_detail = generate_image_desc(pil_images[0])
        infographic_html = generate_journal_infographics(first_frame_detail)

    journal_text = "\n".join(journal_entries)
    return journal_text, image_paths, infographic_html


with gr.Blocks() as iface:
    video_input = gr.Video(label="Upload Video", height=300)  
    journal_output = gr.Textbox(label="Generated Daily Journal", lines=10)
    image_gallery = gr.Gallery(label="Annotated Frames")
    run_button = gr.Button("Generate Journal")
    infographic_html = gr.HTML()

    run_button.click(fn=display_journal_with_images, inputs=video_input, outputs=[journal_output, image_gallery, infographic_html])

iface.launch()