Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -18,6 +18,10 @@ from PIL import Image
|
|
18 |
import edge_tts
|
19 |
import trimesh
|
20 |
|
|
|
|
|
|
|
|
|
21 |
from transformers import (
|
22 |
AutoModelForCausalLM,
|
23 |
AutoTokenizer,
|
@@ -400,7 +404,27 @@ def generate_3d_fn(
|
|
400 |
glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
|
401 |
return glb_path, seed
|
402 |
|
403 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
404 |
|
405 |
@spaces.GPU
|
406 |
def generate(
|
@@ -419,6 +443,7 @@ def generate(
|
|
419 |
- "@3d": triggers 3D model generation using the ShapE pipeline.
|
420 |
- "@web": triggers a web search or webpage visit.
|
421 |
- "@rAgent": initiates a reasoning chain using Llama mode OpenAI.
|
|
|
422 |
"""
|
423 |
text = input_dict["text"]
|
424 |
files = input_dict.get("files", [])
|
@@ -493,6 +518,27 @@ def generate(
|
|
493 |
yield partial
|
494 |
return
|
495 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
496 |
# --- Text and TTS branch ---
|
497 |
tts_prefix = "@tts"
|
498 |
is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
|
@@ -583,18 +629,17 @@ demo = gr.ChatInterface(
|
|
583 |
examples=[
|
584 |
["@tts2 What causes rainbows to form?"],
|
585 |
["@3d A birthday cupcake with cherry"],
|
586 |
-
[{"text": "
|
587 |
-
["@image Chocolate dripping from a donut
|
588 |
["@rAgent Explain how a binary search algorithm works."],
|
589 |
["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
|
590 |
-
|
591 |
],
|
592 |
cache_examples=False,
|
593 |
type="messages",
|
594 |
description=DESCRIPTION,
|
595 |
css=css,
|
596 |
fill_height=True,
|
597 |
-
textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple", placeholder="@tts1-♀, @tts2-♂, @image-image gen, @3d-3d mesh gen, @rAgent-coding, @web-websearch, default-{text gen}{image-text-text}"),
|
598 |
stop_btn="Stop Generation",
|
599 |
multimodal=True,
|
600 |
)
|
|
|
18 |
import edge_tts
|
19 |
import trimesh
|
20 |
|
21 |
+
import supervision as sv
|
22 |
+
from ultralytics import YOLO as YOLODetector
|
23 |
+
from huggingface_hub import hf_hub_download
|
24 |
+
|
25 |
from transformers import (
|
26 |
AutoModelForCausalLM,
|
27 |
AutoTokenizer,
|
|
|
404 |
glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
|
405 |
return glb_path, seed
|
406 |
|
407 |
+
# YOLO Object Detection Setup
|
408 |
+
YOLO_MODEL_REPO = "strangerzonehf/Flux-Ultimate-LoRA-Collection"
|
409 |
+
YOLO_CHECKPOINT_NAME = "images/demo.pt"
|
410 |
+
yolo_model_path = hf_hub_download(repo_id=YOLO_MODEL_REPO, filename=YOLO_CHECKPOINT_NAME)
|
411 |
+
yolo_detector = YOLODetector(yolo_model_path)
|
412 |
+
|
413 |
+
def detect_objects(image: np.ndarray):
    """Run YOLO object detection on an image and draw the detections on it.

    Args:
        image: Image pixels as a NumPy array in PIL channel order (the chat
            handler builds it with ``np.array(pil_image)``), so it may be
            RGB, RGBA, or single-channel depending on the uploaded file.

    Returns:
        A ``PIL.Image.Image`` in RGB mode: a copy of the input with a box and
        a label drawn for every detection kept after non-max suppression.
    """
    # Normalise to 3-channel RGB first so RGBA / grayscale / palette uploads
    # do not reach the detector with an unexpected channel count.
    rgb_image = np.asarray(Image.fromarray(image).convert("RGB"))

    # Ultralytics interprets NumPy inputs as BGR, so reverse the channel axis
    # for inference only; the contiguous copy avoids negative-stride views.
    bgr_image = np.ascontiguousarray(rgb_image[..., ::-1])
    results = yolo_detector(bgr_image, verbose=False)[0]
    detections = sv.Detections.from_ultralytics(results).with_nms()

    box_annotator = sv.BoxAnnotator()
    label_annotator = sv.LabelAnnotator()

    # Annotate a writable RGB copy so the returned PIL image keeps the
    # original colours and the caller's array is never mutated.
    annotated_image = rgb_image.copy()
    annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections)
    annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections)

    return Image.fromarray(annotated_image)
|
426 |
+
|
427 |
+
# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, and @yolo commands
|
428 |
|
429 |
@spaces.GPU
|
430 |
def generate(
|
|
|
443 |
- "@3d": triggers 3D model generation using the ShapE pipeline.
|
444 |
- "@web": triggers a web search or webpage visit.
|
445 |
- "@rAgent": initiates a reasoning chain using Llama mode OpenAI.
|
446 |
+
- "@yolo": triggers object detection using YOLO.
|
447 |
"""
|
448 |
text = input_dict["text"]
|
449 |
files = input_dict.get("files", [])
|
|
|
518 |
yield partial
|
519 |
return
|
520 |
|
521 |
+
# --- YOLO Object Detection branch ---
|
522 |
+
if text.strip().lower().startswith("@yolo"):
|
523 |
+
yield "🔍 Running object detection with YOLO..."
|
524 |
+
if not files or len(files) == 0:
|
525 |
+
yield "Error: Please attach an image for YOLO object detection."
|
526 |
+
return
|
527 |
+
# Use the first attached image
|
528 |
+
input_file = files[0]
|
529 |
+
try:
|
530 |
+
if isinstance(input_file, str):
|
531 |
+
pil_image = Image.open(input_file)
|
532 |
+
else:
|
533 |
+
pil_image = input_file
|
534 |
+
except Exception as e:
|
535 |
+
yield f"Error loading image: {str(e)}"
|
536 |
+
return
|
537 |
+
np_image = np.array(pil_image)
|
538 |
+
result_img = detect_objects(np_image)
|
539 |
+
yield gr.Image(result_img)
|
540 |
+
return
|
541 |
+
|
542 |
# --- Text and TTS branch ---
|
543 |
tts_prefix = "@tts"
|
544 |
is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
|
|
|
629 |
examples=[
|
630 |
["@tts2 What causes rainbows to form?"],
|
631 |
["@3d A birthday cupcake with cherry"],
|
632 |
+
[{"text": "Summarize the letter", "files": ["examples/1.png"]}],
|
633 |
+
["@image Chocolate dripping from a donut"],
|
634 |
["@rAgent Explain how a binary search algorithm works."],
|
635 |
["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
|
|
|
636 |
],
|
637 |
cache_examples=False,
|
638 |
type="messages",
|
639 |
description=DESCRIPTION,
|
640 |
css=css,
|
641 |
fill_height=True,
|
642 |
+
textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple", placeholder="@tts1-♀, @tts2-♂, @image-image gen, @3d-3d mesh gen, @rAgent-coding, @web-websearch, @yolo-object detection, default-{text gen}{image-text-text}"),
|
643 |
stop_btn="Stop Generation",
|
644 |
multimodal=True,
|
645 |
)
|