yasserrmd committed
Commit b7b43b2
1 Parent(s): e89f20e

Update app.py

Files changed (1)
  1. app.py +74 -0
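For orientation, a minimal sketch (not part of this commit) of how the `generate__image_desc` function added in the diff below might be exercised; the image path is hypothetical, and `from app import ...` assumes it is acceptable to run the module-level model loading in app.py:

from PIL import Image

# Hypothetical local check of the new vision-description path added in this commit.
# Importing app triggers the module-level YOLOv10 and Llama-3.2 vision model loading.
from app import generate__image_desc

image = Image.open("example.jpg").convert("RGB")  # hypothetical image path
html_journal = generate__image_desc(image)        # HTML rendered from the model's markdown reply
print(html_journal)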
app.py CHANGED
@@ -5,11 +5,85 @@ import cv2
import torch
import os
import spaces
+import markdown
+import requests
+import torch
+from PIL import Image
+from transformers import MllamaForConditionalGeneration, AutoProcessor


device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = YOLOv10.from_pretrained('jameslahm/yolov10x').to(device)

+
+
+model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+
+mllama_model = MllamaForConditionalGeneration.from_pretrained(
+    model_id,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+)
+processor = AutoProcessor.from_pretrained(model_id)
+
+
+SYSTEM_INSTRUCTION = "You are DailySnap, your job is to analyse the given image and provide a daily journal about the image and use some random time"
+
+def extract_assistant_reply(input_string):
+    # Define the tag that indicates the start of the assistant's reply
+    start_tag = "<|start_header_id|>assistant<|end_header_id|>"
+    # Find the position where the assistant's reply starts
+    start_index = input_string.find(start_tag)
+    if start_index == -1:
+        return "Assistant's reply not found."
+    start_index += len(start_tag)
+    # Extract everything after the start tag
+    assistant_reply = input_string[start_index:].strip()
+    return assistant_reply
+
+def extract_json_from_markdown(markdown_text):
+    try:
+        start_idx = markdown_text.find('```')
+        end_idx = markdown_text.find('```', start_idx + 3)
+
+        if markdown_text[start_idx:start_idx + 7] == '```html':
+            start_idx += len('```html')
+        else:
+            start_idx += len('```')
+
+        # Extract and clean up the code block (json or not)
+        json_str = markdown_text[start_idx:end_idx].strip()
+
+        # Try to load it as JSON
+        return json.loads(json_str)
+    except Exception as e:
+        print(f"Error extracting JSON: {e}")
+        return None
+
+
+@spaces.GPU
+def generate__image_desc(image):
+    messages = [
+        {"role": "user", "content": [
+            {"type": "image"},
+            {"type": "text", "text": SYSTEM_INSTRUCTION}
+        ]}
+    ]
+    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
+    inputs = processor(image, input_text, return_tensors="pt").to(mllama_model.device)
+
+    # Generate the output from the model and decode it to text
+    output = mllama_model.generate(**inputs, max_new_tokens=300)
+    markdown_text = processor.decode(output[0])
+
+    # Keep only the assistant's reply and render it as HTML
+    markdown_text = extract_assistant_reply(markdown_text)
+    html_output = markdown.markdown(markdown_text)
+    return html_output
+
# Define activity categories based on detected objects
activity_categories = {
    "Working": ["laptop", "computer", "keyboard", "office chair"],