# DailySnap / app.py
import gradio as gr
from ultralytics import YOLOv10
from skimage.metrics import structural_similarity as ssim
import cv2
import torch
import os
import json
import spaces
import markdown
import requests
import io
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor, AutoModelForCausalLM, AutoTokenizer
device = 'cuda' if torch.cuda.is_available() else 'cpu'
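# Object detector: YOLOv10-X weights pulled from the Hugging Face Hub.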
model = YOLOv10.from_pretrained('jameslahm/yolov10x').to(device)
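# Vision-language model that turns a single frame into a journal entry.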
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
model_vision = MllamaForConditionalGeneration.from_pretrained(
model_id,
torch_dtype=torch.bfloat16,
device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)
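# Code model that renders the journal text as an HTML infographic.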
model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"
model_code = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
SYSTEM_INSTRUCTION = "You are DailySnap, your job is to analyse the given image and provide a daily journal entry about the image, using a random time"
def extract_assistant_reply(input_string):
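    """Return the text that follows the assistant header tag in a decoded Llama chat transcript."""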
# Define the tag that indicates the start of the assistant's reply
start_tag = "<|start_header_id|>assistant<|end_header_id|>"
# Find the position where the assistant's reply starts
start_index = input_string.find(start_tag)
if start_index == -1:
return "Assistant's reply not found."
start_index += len(start_tag)
# Extract everything after the start tag
assistant_reply = input_string[start_index:].strip()
return assistant_reply
def extract_json_from_markdown(markdown_text):
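    """Extract the first fenced code block (```html or plain ```) from markdown_text and try to parse it as JSON;
    returns None on failure. Currently not called elsewhere in this app."""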
try:
start_idx = markdown_text.find('```')
end_idx = markdown_text.find('```', start_idx + 3)
if markdown_text[start_idx:start_idx + 7] == '```html':
start_idx += len('```html')
else:
start_idx += len('```')
# Extract and clean up the code block (json or not)
json_str = markdown_text[start_idx:end_idx].strip()
# Try to load it as JSON
return json.loads(json_str)
except Exception as e:
print(f"Error extracting JSON: {e}")
return None
@spaces.GPU
def generate_image_desc(image):
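    """Describe a single PIL image with Llama 3.2 Vision using SYSTEM_INSTRUCTION and return the reply rendered as HTML."""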
messages = [
{"role": "user", "content": [
{"type": "image"},
{"type": "text", "text": SYSTEM_INSTRUCTION}
]}
]
input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(image, input_text, return_tensors="pt").to(model_vision.device)
# Generate the output from the model
output = model_vision.generate(**inputs, max_new_tokens=300)
print(output)
markdown_text = processor.decode(output[0])
print(markdown_text)
markdown_text=extract_assistant_reply(markdown_text)
html_output = markdown.markdown(markdown_text)
return html_output
@spaces.GPU
def generate_journal_infographics(journal):
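    """Ask Qwen2.5-Coder to turn the journal text into HTML/Bootstrap infographic markup and return the generated string."""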
prompt = f"Generate daily journal inforgraphics using html for the following:\n\n{journal}"
messages = [
{"role": "system", "content": "You are DailySnap, a highly efficient and intelligent assistant designed to generate infographics using htmnl bootstrap icon and generate highly appealing daily journal as per the user detail"},
{"role": "user", "content": prompt}
]
# Prepare inputs for the model
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([text], return_tensors="pt").to(model_code.device)
# Generate the documentation
generated_ids = model_code.generate(**model_inputs, max_new_tokens=4000)
generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
documentation = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(documentation)
return documentation
# Define activity categories based on detected objects
activity_categories = {
"Working": ["laptop", "computer", "keyboard", "office chair"],
"Meal Time": ["fork", "spoon", "plate", "food"],
"Exercise": ["dumbbell", "bicycle", "yoga mat", "treadmill"],
"Outdoors": ["car", "tree", "bicycle", "road","subway","metro"],
# Add more categories and objects as needed
}
# Function to map detected objects to categorized activities
def categorize_activity(detected_objects):
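    """Group the detected object names into the activity categories defined above."""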
categorized_activities = {}
for activity, objects in activity_categories.items():
if any(obj in detected_objects for obj in objects):
if activity not in categorized_activities:
categorized_activities[activity] = []
categorized_activities[activity].append(detected_objects)
return categorized_activities
# Function to compare frames using SSIM to avoid repeated frames
def is_frame_different(frame1, frame2, threshold=0.9):
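    """Return True when two frames differ enough (SSIM below threshold).
    Defined as a de-duplication helper but not called in the current pipeline."""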
gray_frame1 = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY)
gray_frame2 = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY)
score, _ = ssim(gray_frame1, gray_frame2, full=True)
return score < threshold
# Function to process the video, detect objects, and generate a categorized journal with images
@spaces.GPU
def generate_journal_with_images(video_path, frame_interval=30, confidence_threshold=0.8):
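    """Sample roughly one frame per second from the video, run YOLOv10 on each sample,
    save annotated frames to disk, and build timestamped journal entries.
    Note: frame_interval is kept in the signature, but sampling is driven by the
    one-frame-per-second check below."""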
cap = cv2.VideoCapture(video_path)
journal_entries = []
image_paths = []
frame_count = 0
output_folder = "detected_frames"
os.makedirs(output_folder, exist_ok=True) # Create folder to store images
last_processed_second = -1 # Keep track of the last processed second
while cap.isOpened():
ret, frame = cap.read()
if not ret:
break
# Get the current timestamp in the video
current_time = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000 # Convert ms to seconds
current_second = int(current_time) # Round down to the nearest second
# Process only one frame per second
if current_second > last_processed_second:
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
# Make predictions using YOLOv10 on the current frame
results = model.predict(source=frame_rgb, device=device)
# Filter detected objects based on confidence threshold
detected_objects = []
for box in results[0].boxes:
if box.conf >= confidence_threshold: # Only include objects with confidence >= 0.8
detected_objects.append(model.names[int(box.cls)])
# Only process frames where objects with confidence >= threshold are detected
if detected_objects: # If there are high-confidence detected objects
# Plot bounding boxes and labels on the image
annotated_frame = results[0].plot() # Plot detection results on the frame
# Save the annotated image
frame_filename = os.path.join(output_folder, f"frame_{frame_count}.jpg")
cv2.imwrite(frame_filename, annotated_frame[:, :, ::-1]) # Convert back to BGR for saving
image_paths.append(frame_filename)
# Categorize the detected objects into activities
activity_summary = categorize_activity(detected_objects)
# Store the activities with their timestamp
for activity, objects in activity_summary.items():
                    journal_entries.append(f"At {current_time:.2f} seconds ({activity}): {', '.join(objects[0])}")
last_processed_second = current_second # Update the last processed second
frame_count += 1
cap.release()
return journal_entries, image_paths
def display_journal_with_images(video):
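    """Gradio callback: run detection on the uploaded video, load the annotated frames,
    and (for the first two frames) generate journal text plus an HTML infographic."""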
journal_entries, image_paths = generate_journal_with_images(video, frame_interval=30)
pil_images = []
for image_path in image_paths:
# Read the image using OpenCV
image = cv2.imread(image_path)
# Convert the image from BGR (OpenCV) to RGB (PIL)
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
# Convert the NumPy array to a PIL image
pil_image = Image.fromarray(image_rgb)
pil_images.append(pil_image)
    infographic_html = ''
    if len(pil_images) >= 2:  # just for mockup: only describe the first two annotated frames
        first_frame_detail = generate_image_desc(pil_images[0])
        second_frame_detail = generate_image_desc(pil_images[1])
        infographic_html = generate_journal_infographics(first_frame_detail + "\n\n" + second_frame_detail)
    journal_text = "\n".join(journal_entries)
    return journal_text, image_paths, infographic_html
with gr.Blocks() as iface:
video_input = gr.Video(label="Upload Video", height=300)
journal_output = gr.Textbox(label="Generated Daily Journal", lines=10)
image_gallery = gr.Gallery(label="Annotated Frames")
run_button = gr.Button("Generate Journal")
    infographic_html = gr.HTML()
    run_button.click(fn=display_journal_with_images, inputs=video_input, outputs=[journal_output, image_gallery, infographic_html])
iface.launch()