# DailySnap / app.py
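# DailySnap samples one frame per second from an uploaded video, runs YOLOv10 object
# detection on each sampled frame, groups high-confidence detections into activity
# categories, asks Llama-3.2-11B-Vision to narrate a journal entry for the first
# annotated frame, and has Qwen2.5-Coder render that journal as an HTML infographic
# inside a Gradio interface.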
import gradio as gr
from ultralytics import YOLOv10
from skimage.metrics import structural_similarity as ssim
import cv2
import torch
import os
import json
import spaces
import markdown
import requests
import io
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor, AutoModelForCausalLM, AutoTokenizer
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Object detection model (YOLOv10) used on sampled video frames
model = YOLOv10.from_pretrained('jameslahm/yolov10x').to(device)

# Vision-language model used to describe a selected frame
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
model_vision = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)

# Code-generation model used to render the journal as an HTML infographic
model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"
model_code = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
SYSTEM_INSTRUCTION = (
    "You are DailySnap, an intelligent assistant tasked with analyzing images and generating a "
    "visually appealing daily journal entry based on the content of the image. Your job is to "
    "examine the given image, identify key elements such as objects, people, and emotions, and "
    "then create a narrative that reflects the activities or events captured. The journal should "
    "include random timestamps (e.g., 7:15 AM, 1:30 PM) to create a chronological flow, giving "
    "the narrative a natural, realistic feel. The final output should be engaging and coherent, "
    "combining image analysis with descriptive storytelling to provide users with a meaningful "
    "daily journal based on the visual content."
)
def extract_assistant_reply(input_string):
    # Tag that marks the start of the assistant's reply in the decoded output
    start_tag = "<|start_header_id|>assistant<|end_header_id|>"
    # Find the position where the assistant's reply starts
    start_index = input_string.find(start_tag)
    if start_index == -1:
        return "Assistant's reply not found."
    start_index += len(start_tag)
    # Return everything after the start tag
    return input_string[start_index:].strip()
def extract_json_from_markdown(markdown_text):
    # Pull the first fenced code block out of a markdown string and parse it as JSON
    try:
        start_idx = markdown_text.find('```')
        if start_idx == -1:
            return None
        end_idx = markdown_text.find('```', start_idx + 3)
        # Skip the opening fence and any language tag (e.g. ```json or ```html)
        newline_idx = markdown_text.find('\n', start_idx)
        start_idx = newline_idx + 1 if newline_idx != -1 else start_idx + 3
        json_str = markdown_text[start_idx:end_idx].strip()
        return json.loads(json_str)
    except Exception as e:
        print(f"Error extracting JSON: {e}")
        return None
@spaces.GPU
def generate_image_desc(image):
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": SYSTEM_INSTRUCTION}
        ]}
    ]
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(image, input_text, return_tensors="pt").to(model_vision.device)
    # Generate the journal narrative from the vision model
    output = model_vision.generate(**inputs, max_new_tokens=300)
    markdown_text = processor.decode(output[0])
    markdown_text = extract_assistant_reply(markdown_text)
    # Convert the markdown reply to HTML for display
    html_output = markdown.markdown(markdown_text)
    return html_output
@spaces.GPU
def generate_journal_infographics(journal):
    prompt = f"Generate a daily journal infographic using HTML for the following:\n\n{journal}"
    messages = [
        {"role": "system", "content": "You are DailySnap, a highly efficient and intelligent assistant designed to generate visually appealing daily journals and infographics. Your primary function is to transform user-provided details into structured and aesthetically engaging content. When generating infographics, you must use HTML and Bootstrap icons to create visually compelling and clear representations of the user's data or ideas. For daily journals, you should organize the information into an appealing, easy-to-read format that incorporates icons, headings, and layouts based on the user's preferences. Your designs should always focus on clarity, creativity, and user-centric formatting, ensuring the final product is both functional and visually engaging. Your ultimate goal is to help users effortlessly convert their daily activities and narratives into attractive visual content with minimal guidance."},
        {"role": "user", "content": prompt}
    ]
    # Prepare inputs for the code-generation model
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([text], return_tensors="pt").to(model_code.device)
    # Generate the HTML infographic and strip the prompt tokens from the output
    generated_ids = model_code.generate(**model_inputs, max_new_tokens=4000)
    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
    documentation = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return documentation
# Define activity categories based on detected objects
activity_categories = {
    "Working": ["laptop", "computer", "keyboard", "office chair"],
    "Meal Time": ["fork", "spoon", "plate", "food"],
    "Exercise": ["dumbbell", "bicycle", "yoga mat", "treadmill"],
    "Outdoors": ["car", "tree", "bicycle", "road", "subway", "metro"],
    # Add more categories and objects as needed
}
# Map detected objects to categorized activities
def categorize_activity(detected_objects):
    categorized_activities = {}
    for activity, objects in activity_categories.items():
        # Keep only the detected objects that belong to this activity
        matched = [obj for obj in detected_objects if obj in objects]
        if matched:
            categorized_activities.setdefault(activity, []).extend(matched)
    return categorized_activities
# Compare two frames using SSIM to skip near-duplicate frames
def is_frame_different(frame1, frame2, threshold=0.9):
    gray_frame1 = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY)
    gray_frame2 = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY)
    score, _ = ssim(gray_frame1, gray_frame2, full=True)
    return score < threshold
# Process the video, detect objects, and generate a categorized journal with annotated images
@spaces.GPU
def generate_journal_with_images(video_path, frame_interval=30, confidence_threshold=0.8):
    # Note: frame_interval is currently unused; sampling is done at one frame per second
    cap = cv2.VideoCapture(video_path)
    journal_entries = []
    image_paths = []
    frame_count = 0
    output_folder = "detected_frames"
    os.makedirs(output_folder, exist_ok=True)  # Folder to store annotated frames
    last_processed_second = -1  # Track the last processed second

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Current timestamp in the video (convert ms to seconds)
        current_time = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000
        current_second = int(current_time)

        # Process only one frame per second
        if current_second > last_processed_second:
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Run YOLOv10 on the current frame
            results = model.predict(source=frame_rgb, device=device)

            # Keep only detections at or above the confidence threshold
            detected_objects = []
            for box in results[0].boxes:
                if box.conf >= confidence_threshold:
                    detected_objects.append(model.names[int(box.cls)])

            # Only keep frames with high-confidence detections
            if detected_objects:
                # Draw bounding boxes and labels on the frame
                annotated_frame = results[0].plot()

                # Save the annotated image (convert RGB back to BGR for OpenCV)
                frame_filename = os.path.join(output_folder, f"frame_{frame_count}.jpg")
                cv2.imwrite(frame_filename, annotated_frame[:, :, ::-1])
                image_paths.append(frame_filename)

                # Categorize the detected objects into activities
                activity_summary = categorize_activity(detected_objects)

                # Record each activity with its timestamp
                for activity, objects in activity_summary.items():
                    journal_entries.append(f"At {current_time:.2f} seconds ({activity}): {', '.join(objects)}")

            last_processed_second = current_second

        frame_count += 1

    cap.release()
    return journal_entries, image_paths
def display_journal_with_images(video):
    journal_entries, image_paths = generate_journal_with_images(video, frame_interval=30)

    pil_images = []
    for image_path in image_paths:
        # Read the saved annotated frame and convert BGR (OpenCV) to RGB (PIL)
        image = cv2.imread(image_path)
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        pil_images.append(Image.fromarray(image_rgb))

    infographic_html = ''
    if len(pil_images) >= 2:  # Mockup: only describe the first annotated frame
        first_frame_detail = generate_image_desc(pil_images[0])
        infographic_html = generate_journal_infographics(first_frame_detail)

    journal_text = "\n".join(journal_entries)
    return journal_text, image_paths, infographic_html
with gr.Blocks() as iface:
    video_input = gr.Video(label="Upload Video", height=300)
    journal_output = gr.Textbox(label="Generated Daily Journal", lines=10)
    image_gallery = gr.Gallery(label="Annotated Frames")
    run_button = gr.Button("Generate Journal")
    infographic_html = gr.HTML()

    run_button.click(
        fn=display_journal_with_images,
        inputs=video_input,
        outputs=[journal_output, image_gallery, infographic_html],
    )

iface.launch()