PHI4-Multimodal

Running on Zero

App Files Files Community

prithivMLmods committed 11 days ago

Commit

e84f6e6

verified ·

1 Parent(s): d5c677b

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -5

app.py CHANGED Viewed

@@ -18,6 +18,10 @@ from PIL import Image
 import edge_tts
 import trimesh
 from transformers import (
     AutoModelForCausalLM,
     AutoTokenizer,
@@ -400,7 +404,27 @@ def generate_3d_fn(
     glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
     return glb_path, seed
-# Chat Generation Function with support for @tts, @image, @3d, @web, and @rAgent commands
 @spaces.GPU
 def generate(
@@ -419,6 +443,7 @@ def generate(
       - "@3d": triggers 3D model generation using the ShapE pipeline.
       - "@web": triggers a web search or webpage visit.
       - "@rAgent": initiates a reasoning chain using Llama mode OpenAI.
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
@@ -493,6 +518,27 @@ def generate(
             yield partial
         return
     # --- Text and TTS branch ---
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
@@ -583,18 +629,17 @@ demo = gr.ChatInterface(
     examples=[
         ["@tts2 What causes rainbows to form?"],
         ["@3d A birthday cupcake with cherry"],
-        [{"text": "summarize the letter", "files": ["examples/1.png"]}],
-        ["@image Chocolate dripping from a donut against a yellow background, in the style of brocore, hyper-realistic"],
         ["@rAgent Explain how a binary search algorithm works."],
         ["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
     ],
     cache_examples=False,
     type="messages",
     description=DESCRIPTION,
     css=css,
     fill_height=True,
-    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple", placeholder="@tts1-♀, @tts2-♂, @image-image gen, @3d-3d mesh gen, @rAgent-coding, @web-websearch, default-{text gen}{image-text-text}"),
     stop_btn="Stop Generation",
     multimodal=True,
 )

 import edge_tts
 import trimesh
+import supervision as sv
+from ultralytics import YOLO as YOLODetector
+from huggingface_hub import hf_hub_download
 from transformers import (
     AutoModelForCausalLM,
     AutoTokenizer,
     glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
     return glb_path, seed
+# YOLO Object Detection Setup
+# These statements sit at module level, so the checkpoint download and the
+# detector construction both run when this module is loaded.
+# NOTE(review): the repo id reads like a Flux LoRA collection rather than a
+# detection-weights repo — confirm "images/demo.pt" is the intended YOLO checkpoint.
+YOLO_MODEL_REPO = "strangerzonehf/Flux-Ultimate-LoRA-Collection"
+YOLO_CHECKPOINT_NAME = "images/demo.pt"
+# The value returned by hf_hub_download is passed straight to the detector constructor.
+yolo_model_path = hf_hub_download(repo_id=YOLO_MODEL_REPO, filename=YOLO_CHECKPOINT_NAME)
+yolo_detector = YOLODetector(yolo_model_path)
+def detect_objects(image: np.ndarray):
+    """Run the module-level YOLO detector on an image and draw the detections.
+
+    Args:
+        image: Input image as a NumPy array.
+            NOTE(review): presumably H x W x 3 uint8. Ultralytics is understood
+            to treat NumPy input as BGR, while the "@yolo" branch builds this
+            array from a PIL image — confirm the channel order is as intended.
+
+    Returns:
+        A PIL image created from a copy of ``image`` after the box and label
+        annotators have been applied to it.
+    """
+    # One image is passed in; only the first entry of the returned results is used.
+    results = yolo_detector(image, verbose=False)[0]
+    # Convert to supervision detections and apply NMS with its default arguments.
+    detections = sv.Detections.from_ultralytics(results).with_nms()
+    box_annotator = sv.BoxAnnotator()
+    label_annotator = sv.LabelAnnotator()
+    # Annotate a copy rather than the array that was passed in.
+    annotated_image = image.copy()
+    annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections)
+    annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections)
+    return Image.fromarray(annotated_image)
+# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, and @yolo commands
 @spaces.GPU
 def generate(
       - "@3d": triggers 3D model generation using the ShapE pipeline.
       - "@web": triggers a web search or webpage visit.
       - "@rAgent": initiates a reasoning chain using Llama mode OpenAI.
+      - "@yolo": triggers object detection using YOLO.
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
             yield partial
         return
+    # --- YOLO Object Detection branch ---
+    if text.strip().lower().startswith("@yolo"):
+        yield "🔍 Running object detection with YOLO..."
+        if not files or len(files) == 0:
+            yield "Error: Please attach an image for YOLO object detection."
+            return
+        # Use the first attached image
+        input_file = files[0]
+        try:
+            if isinstance(input_file, str):
+                pil_image = Image.open(input_file)
+            else:
+                pil_image = input_file
+        except Exception as e:
+            yield f"Error loading image: {str(e)}"
+            return
+        np_image = np.array(pil_image)
+        result_img = detect_objects(np_image)
+        yield gr.Image(result_img)
+        return
     # --- Text and TTS branch ---
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
     examples=[
         ["@tts2 What causes rainbows to form?"],
         ["@3d A birthday cupcake with cherry"],
+        [{"text": "Summarize the letter", "files": ["examples/1.png"]}],
+        ["@image Chocolate dripping from a donut"],
         ["@rAgent Explain how a binary search algorithm works."],
         ["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
     ],
     cache_examples=False,
     type="messages",
     description=DESCRIPTION,
     css=css,
     fill_height=True,
+    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple", placeholder="@tts1-♀, @tts2-♂, @image-image gen, @3d-3d mesh gen, @rAgent-coding, @web-websearch, @yolo-object detection, default-{text gen}{image-text-text}"),
     stop_btn="Stop Generation",
     multimodal=True,
 )