prithivMLmods committed
Commit e84f6e6 · verified · 1 Parent(s): d5c677b

Update app.py

Files changed (1)
  1. app.py +50 -5
app.py CHANGED
@@ -18,6 +18,10 @@ from PIL import Image
 import edge_tts
 import trimesh
 
+import supervision as sv
+from ultralytics import YOLO as YOLODetector
+from huggingface_hub import hf_hub_download
+
 from transformers import (
     AutoModelForCausalLM,
     AutoTokenizer,
@@ -400,7 +404,27 @@ def generate_3d_fn(
     glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
     return glb_path, seed
 
-# Chat Generation Function with support for @tts, @image, @3d, @web, and @rAgent commands
+# YOLO Object Detection Setup
+YOLO_MODEL_REPO = "strangerzonehf/Flux-Ultimate-LoRA-Collection"
+YOLO_CHECKPOINT_NAME = "images/demo.pt"
+yolo_model_path = hf_hub_download(repo_id=YOLO_MODEL_REPO, filename=YOLO_CHECKPOINT_NAME)
+yolo_detector = YOLODetector(yolo_model_path)
+
+def detect_objects(image: np.ndarray):
+    """Runs object detection on the input image."""
+    results = yolo_detector(image, verbose=False)[0]
+    detections = sv.Detections.from_ultralytics(results).with_nms()
+
+    box_annotator = sv.BoxAnnotator()
+    label_annotator = sv.LabelAnnotator()
+
+    annotated_image = image.copy()
+    annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections)
+    annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections)
+
+    return Image.fromarray(annotated_image)
+
+# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, and @yolo commands
 
 @spaces.GPU
 def generate(
@@ -419,6 +443,7 @@ def generate(
     - "@3d": triggers 3D model generation using the ShapE pipeline.
     - "@web": triggers a web search or webpage visit.
     - "@rAgent": initiates a reasoning chain using Llama mode OpenAI.
+    - "@yolo": triggers object detection using YOLO.
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
@@ -493,6 +518,27 @@ def generate(
         yield partial
         return
 
+    # --- YOLO Object Detection branch ---
+    if text.strip().lower().startswith("@yolo"):
+        yield "🔍 Running object detection with YOLO..."
+        if not files or len(files) == 0:
+            yield "Error: Please attach an image for YOLO object detection."
+            return
+        # Use the first attached image
+        input_file = files[0]
+        try:
+            if isinstance(input_file, str):
+                pil_image = Image.open(input_file)
+            else:
+                pil_image = input_file
+        except Exception as e:
+            yield f"Error loading image: {str(e)}"
+            return
+        np_image = np.array(pil_image)
+        result_img = detect_objects(np_image)
+        yield gr.Image(result_img)
+        return
+
     # --- Text and TTS branch ---
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
@@ -583,18 +629,17 @@ demo = gr.ChatInterface(
     examples=[
         ["@tts2 What causes rainbows to form?"],
        ["@3d A birthday cupcake with cherry"],
-        [{"text": "summarize the letter", "files": ["examples/1.png"]}],
-        ["@image Chocolate dripping from a donut against a yellow background, in the style of brocore, hyper-realistic"],
+        [{"text": "Summarize the letter", "files": ["examples/1.png"]}],
+        ["@image Chocolate dripping from a donut"],
         ["@rAgent Explain how a binary search algorithm works."],
         ["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
-
     ],
     cache_examples=False,
     type="messages",
     description=DESCRIPTION,
     css=css,
     fill_height=True,
-    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple", placeholder="@tts1-♀, @tts2-♂, @image-image gen, @3d-3d mesh gen, @rAgent-coding, @web-websearch, default-{text gen}{image-text-text}"),
+    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple", placeholder="@tts1-♀, @tts2-♂, @image-image gen, @3d-3d mesh gen, @rAgent-coding, @web-websearch, @yolo-object detection, default-{text gen}{image-text-text}"),
     stop_btn="Stop Generation",
     multimodal=True,
 )
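
For reference, the detection path this commit adds can be exercised outside the Gradio app. The sketch below is a minimal standalone version of detect_objects(), assuming the same dependencies the diff introduces (ultralytics, supervision, huggingface_hub, plus Pillow and numpy already used by app.py); the input path "sample.jpg" is a hypothetical stand-in for any local image, not a file from the repo.

import numpy as np
from PIL import Image
import supervision as sv
from ultralytics import YOLO as YOLODetector
from huggingface_hub import hf_hub_download

# Download the same checkpoint the commit pins and load it into Ultralytics.
weights = hf_hub_download(
    repo_id="strangerzonehf/Flux-Ultimate-LoRA-Collection",
    filename="images/demo.pt",
)
detector = YOLODetector(weights)

# "sample.jpg" is a hypothetical local image.
image = np.array(Image.open("sample.jpg").convert("RGB"))

# Run inference, convert to supervision Detections, and suppress overlapping boxes.
results = detector(image, verbose=False)[0]
detections = sv.Detections.from_ultralytics(results).with_nms()

# Draw boxes and class labels onto a copy of the input, as detect_objects() does.
annotated = sv.BoxAnnotator().annotate(scene=image.copy(), detections=detections)
annotated = sv.LabelAnnotator().annotate(scene=annotated, detections=detections)
Image.fromarray(annotated).save("annotated.jpg")
print(f"{len(detections)} objects detected")

Inside the Space itself, the same path is reached by sending a chat message that starts with @yolo and has an image attached; the annotated result is yielded back as a gr.Image.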