Spaces:
Running on Zero
Update app.py
Browse files
app.py
CHANGED
@@ -36,8 +36,6 @@ from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
|
|
36 |
from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
|
37 |
from diffusers.utils import export_to_ply
|
38 |
|
39 |
-
os.system('pip install backoff')
|
40 |
-
|
41 |
# Global constants and helper functions
|
42 |
|
43 |
MAX_SEED = np.iinfo(np.int32).max
|
@@ -259,7 +257,15 @@ phi4_model = AutoModelForCausalLM.from_pretrained(
|
|
259 |
# ------------------------------------------------------------------------------
|
260 |
|
261 |
DESCRIPTION = """
|
262 |
-
# Agent Dino 🌠
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
263 |
"""
|
264 |
|
265 |
css = '''
|
@@ -469,7 +475,7 @@ def generate(
|
|
469 |
- "@web": triggers a web search or webpage visit.
|
470 |
- "@rAgent": initiates a reasoning chain using Llama mode.
|
471 |
- "@yolo": triggers object detection using YOLO.
|
472 |
-
- **"@phi4": triggers multimodal (image/audio) processing using the Phi-4 model
|
473 |
"""
|
474 |
text = input_dict["text"]
|
475 |
files = input_dict.get("files", [])
|
@@ -565,7 +571,7 @@ def generate(
|
|
565 |
yield gr.Image(result_img)
|
566 |
return
|
567 |
|
568 |
-
# --- Phi-4 Multimodal branch (Image/Audio)
|
569 |
if text.strip().lower().startswith("@phi4"):
|
570 |
question = text[len("@phi4"):].strip()
|
571 |
if not files:
|
@@ -574,15 +580,14 @@ def generate(
|
|
574 |
if not question:
|
575 |
yield "Error: Please provide a question after @phi4."
|
576 |
return
|
|
|
577 |
# Determine input type (Image or Audio) from the first file
|
578 |
input_file = files[0]
|
579 |
try:
|
580 |
-
# If file is already a PIL Image, treat as image
|
581 |
if isinstance(input_file, Image.Image):
|
582 |
input_type = "Image"
|
583 |
file_for_phi4 = input_file
|
584 |
else:
|
585 |
-
# Try opening as image; if it fails, assume audio
|
586 |
try:
|
587 |
file_for_phi4 = Image.open(input_file)
|
588 |
input_type = "Image"
|
@@ -592,7 +597,7 @@ def generate(
|
|
592 |
except Exception:
|
593 |
input_type = "Audio"
|
594 |
file_for_phi4 = input_file
|
595 |
-
|
596 |
if input_type == "Image":
|
597 |
phi4_prompt = f'{phi4_user_prompt}<|image_1|>{question}{phi4_prompt_suffix}{phi4_assistant_prompt}'
|
598 |
inputs = phi4_processor(text=phi4_prompt, images=file_for_phi4, return_tensors='pt').to(phi4_model.device)
|
@@ -603,19 +608,22 @@ def generate(
|
|
603 |
else:
|
604 |
yield "Invalid file type for @phi4 multimodal processing."
|
605 |
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
606 |
|
607 |
-
# Set up a streamer for the phi4 model
|
608 |
-
streamer_phi4 = TextIteratorStreamer(phi4_processor, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
|
609 |
-
generation_kwargs_phi4 = {**inputs, "streamer": streamer_phi4, "max_new_tokens": 200}
|
610 |
-
thread_phi4 = Thread(target=phi4_model.generate, kwargs=generation_kwargs_phi4)
|
611 |
-
thread_phi4.start()
|
612 |
-
|
613 |
-
outputs_phi4 = []
|
614 |
-
yield "🤔 Thinking..."
|
615 |
-
for new_text in streamer_phi4:
|
616 |
-
outputs_phi4.append(new_text)
|
617 |
-
yield "".join(outputs_phi4)
|
618 |
-
return
|
619 |
|
620 |
# --- Text and TTS branch ---
|
621 |
tts_prefix = "@tts"
|
@@ -705,16 +713,15 @@ demo = gr.ChatInterface(
|
|
705 |
gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
|
706 |
],
|
707 |
examples=[
|
708 |
-
[{"text": "@phi4 Solve the problem", "files": ["examples/math.webp"]}],
|
709 |
-
[{"text": "@phi4 Transcribe the audio to text.", "files": ["examples/harvard.wav"]}],
|
710 |
["@tts2 What causes rainbows to form?"],
|
711 |
["@image Chocolate dripping from a donut"],
|
712 |
["@3d A birthday cupcake with cherry"],
|
713 |
[{"text": "Summarize the letter", "files": ["examples/1.png"]}],
|
714 |
[{"text": "@yolo", "files": ["examples/yolo.jpeg"]}],
|
715 |
-
["@
|
716 |
["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
|
717 |
["@tts1 Explain Tower of Hanoi"],
|
|
|
718 |
],
|
719 |
cache_examples=False,
|
720 |
type="messages",
|
|
|
36 |
from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
|
37 |
from diffusers.utils import export_to_ply
|
38 |
|
|
|
|
|
39 |
# Global constants and helper functions
|
40 |
|
41 |
MAX_SEED = np.iinfo(np.int32).max
|
|
|
257 |
# ------------------------------------------------------------------------------
|
258 |
|
259 |
DESCRIPTION = """
|
260 |
+
# Agent Dino 🌠
|
261 |
+
This chatbot supports various commands:
|
262 |
+
- **@tts1 / @tts2:** text-to-speech
|
263 |
+
- **@image:** image generation
|
264 |
+
- **@3d:** 3D mesh generation
|
265 |
+
- **@web:** web search/visit
|
266 |
+
- **@rAgent:** reasoning chain
|
267 |
+
- **@yolo:** object detection
|
268 |
+
- **@phi4:** multimodal (image/audio) question answering
|
269 |
"""
|
270 |
|
271 |
css = '''
|
|
|
475 |
- "@web": triggers a web search or webpage visit.
|
476 |
- "@rAgent": initiates a reasoning chain using Llama mode.
|
477 |
- "@yolo": triggers object detection using YOLO.
|
478 |
+
- **"@phi4": triggers multimodal (image/audio) processing using the Phi-4 model.**
|
479 |
"""
|
480 |
text = input_dict["text"]
|
481 |
files = input_dict.get("files", [])
|
|
|
571 |
yield gr.Image(result_img)
|
572 |
return
|
573 |
|
574 |
+
# --- Phi-4 Multimodal branch (Image/Audio) ---
|
575 |
if text.strip().lower().startswith("@phi4"):
|
576 |
question = text[len("@phi4"):].strip()
|
577 |
if not files:
|
|
|
580 |
if not question:
|
581 |
yield "Error: Please provide a question after @phi4."
|
582 |
return
|
583 |
+
|
584 |
# Determine input type (Image or Audio) from the first file
|
585 |
input_file = files[0]
|
586 |
try:
|
|
|
587 |
if isinstance(input_file, Image.Image):
|
588 |
input_type = "Image"
|
589 |
file_for_phi4 = input_file
|
590 |
else:
|
|
|
591 |
try:
|
592 |
file_for_phi4 = Image.open(input_file)
|
593 |
input_type = "Image"
|
|
|
597 |
except Exception:
|
598 |
input_type = "Audio"
|
599 |
file_for_phi4 = input_file
|
600 |
+
|
601 |
if input_type == "Image":
|
602 |
phi4_prompt = f'{phi4_user_prompt}<|image_1|>{question}{phi4_prompt_suffix}{phi4_assistant_prompt}'
|
603 |
inputs = phi4_processor(text=phi4_prompt, images=file_for_phi4, return_tensors='pt').to(phi4_model.device)
|
|
|
608 |
else:
|
609 |
yield "Invalid file type for @phi4 multimodal processing."
|
610 |
return
|
611 |
+
|
612 |
+
with torch.no_grad():
|
613 |
+
generate_ids = phi4_model.generate(
|
614 |
+
**inputs,
|
615 |
+
max_new_tokens=200,
|
616 |
+
num_logits_to_keep=0,
|
617 |
+
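streamer=streamer  # Pass a text streamer to generate(). NOTE(review): no `streamer` definition is visible in this diff, and generate() is called directly (not in a background thread) before the loop below reads from it — confirm it is defined and that output still streams incrementally.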
|
618 |
+
)
|
619 |
+
|
620 |
+
buffer = "⚛️ phi4 multimodal is initiated, hold tight"
|
621 |
+
for new_text in streamer:
|
622 |
+
buffer += new_text
|
623 |
+
buffer = buffer.replace("<|im_end|>", "")
|
624 |
+
time.sleep(0.01)
|
625 |
+
yield buffer
|
626 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
627 |
|
628 |
# --- Text and TTS branch ---
|
629 |
tts_prefix = "@tts"
|
|
|
713 |
gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
|
714 |
],
|
715 |
examples=[
|
|
|
|
|
716 |
["@tts2 What causes rainbows to form?"],
|
717 |
["@image Chocolate dripping from a donut"],
|
718 |
["@3d A birthday cupcake with cherry"],
|
719 |
[{"text": "Summarize the letter", "files": ["examples/1.png"]}],
|
720 |
[{"text": "@yolo", "files": ["examples/yolo.jpeg"]}],
|
721 |
+
["@rAgent Explain how a binary search algorithm works."],
|
722 |
["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
|
723 |
["@tts1 Explain Tower of Hanoi"],
|
724 |
+
["@phi4 What is depicted in this image?"], # Example for @phi4
|
725 |
],
|
726 |
cache_examples=False,
|
727 |
type="messages",
|