PHI4-Multimodal

Running on Zero

App Files Files Community

prithivMLmods committed 3 days ago

Commit

563a556

verified ·

1 Parent(s): e5aa5e4

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -23

app.py CHANGED Viewed

@@ -36,8 +36,6 @@ from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
 from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
 from diffusers.utils import export_to_ply
-os.system('pip install backoff')
 # Global constants and helper functions
 MAX_SEED = np.iinfo(np.int32).max
@@ -259,7 +257,15 @@ phi4_model = AutoModelForCausalLM.from_pretrained(
 # ------------------------------------------------------------------------------
 DESCRIPTION = """
-# Agent Dino 🌠
 """
 css = '''
@@ -469,7 +475,7 @@ def generate(
       - "@web": triggers a web search or webpage visit.
       - "@rAgent": initiates a reasoning chain using Llama mode.
       - "@yolo": triggers object detection using YOLO.
-      - **"@phi4": triggers multimodal (image/audio) processing using the Phi-4 model with streaming output.**
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
@@ -565,7 +571,7 @@ def generate(
         yield gr.Image(result_img)
         return
-    # --- Phi-4 Multimodal branch (Image/Audio) with streaming ---
     if text.strip().lower().startswith("@phi4"):
         question = text[len("@phi4"):].strip()
         if not files:
@@ -574,15 +580,14 @@ def generate(
         if not question:
             yield "Error: Please provide a question after @phi4."
             return
         # Determine input type (Image or Audio) from the first file
         input_file = files[0]
         try:
-            # If file is already a PIL Image, treat as image
             if isinstance(input_file, Image.Image):
                 input_type = "Image"
                 file_for_phi4 = input_file
             else:
-                # Try opening as image; if it fails, assume audio
                 try:
                     file_for_phi4 = Image.open(input_file)
                     input_type = "Image"
@@ -592,7 +597,7 @@ def generate(
         except Exception:
             input_type = "Audio"
             file_for_phi4 = input_file
         if input_type == "Image":
             phi4_prompt = f'{phi4_user_prompt}<|image_1|>{question}{phi4_prompt_suffix}{phi4_assistant_prompt}'
             inputs = phi4_processor(text=phi4_prompt, images=file_for_phi4, return_tensors='pt').to(phi4_model.device)
@@ -603,19 +608,22 @@ def generate(
         else:
             yield "Invalid file type for @phi4 multimodal processing."
             return
-        # Set up a streamer for the phi4 model
-        streamer_phi4 = TextIteratorStreamer(phi4_processor, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs_phi4 = {**inputs, "streamer": streamer_phi4, "max_new_tokens": 200}
-        thread_phi4 = Thread(target=phi4_model.generate, kwargs=generation_kwargs_phi4)
-        thread_phi4.start()
-        outputs_phi4 = []
-        yield "🤔 Thinking..."
-        for new_text in streamer_phi4:
-            outputs_phi4.append(new_text)
-            yield "".join(outputs_phi4)
-        return
     # --- Text and TTS branch ---
     tts_prefix = "@tts"
@@ -705,16 +713,15 @@ demo = gr.ChatInterface(
         gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
     ],
     examples=[
-        [{"text": "@phi4 Solve the problem", "files": ["examples/math.webp"]}],
-        [{"text": "@phi4 Transcribe the audio to text.", "files": ["examples/harvard.wav"]}],
         ["@tts2 What causes rainbows to form?"],
         ["@image Chocolate dripping from a donut"],
         ["@3d A birthday cupcake with cherry"],
         [{"text": "Summarize the letter", "files": ["examples/1.png"]}],
         [{"text": "@yolo", "files": ["examples/yolo.jpeg"]}],
-        ["@ragent Explain how a binary search algorithm works."],
         ["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
         ["@tts1 Explain Tower of Hanoi"],
     ],
     cache_examples=False,
     type="messages",

 from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
 from diffusers.utils import export_to_ply
 # Global constants and helper functions
 MAX_SEED = np.iinfo(np.int32).max
 # ------------------------------------------------------------------------------
 DESCRIPTION = """
+# Agent Dino 🌠
+This chatbot supports various commands:
+- **@tts1 / @tts2:** text-to-speech
+- **@image:** image generation
+- **@3d:** 3D mesh generation
+- **@web:** web search/visit
+- **@rAgent:** reasoning chain
+- **@yolo:** object detection
+- **@phi4:** multimodal (image/audio) question answering
 """
 css = '''
       - "@web": triggers a web search or webpage visit.
       - "@rAgent": initiates a reasoning chain using Llama mode.
       - "@yolo": triggers object detection using YOLO.
+      - **"@phi4": triggers multimodal (image/audio) processing using the Phi-4 model.**
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
         yield gr.Image(result_img)
         return
+    # --- Phi-4 Multimodal branch (Image/Audio) ---
     if text.strip().lower().startswith("@phi4"):
         question = text[len("@phi4"):].strip()
         if not files:
         if not question:
             yield "Error: Please provide a question after @phi4."
             return
         # Determine input type (Image or Audio) from the first file
         input_file = files[0]
         try:
             if isinstance(input_file, Image.Image):
                 input_type = "Image"
                 file_for_phi4 = input_file
             else:
                 try:
                     file_for_phi4 = Image.open(input_file)
                     input_type = "Image"
         except Exception:
             input_type = "Audio"
             file_for_phi4 = input_file
         if input_type == "Image":
             phi4_prompt = f'{phi4_user_prompt}<|image_1|>{question}{phi4_prompt_suffix}{phi4_assistant_prompt}'
             inputs = phi4_processor(text=phi4_prompt, images=file_for_phi4, return_tensors='pt').to(phi4_model.device)
         else:
             yield "Invalid file type for @phi4 multimodal processing."
             return
+        with torch.no_grad():
+            generate_ids = phi4_model.generate(
+                **inputs,
+                max_new_tokens=200,
+                num_logits_to_keep=0,
+                streamer=streamer  # Adding text streamer
+            )
+        buffer = "⚛️ phi4 multimodal is initiated, hold tight"
+        for new_text in streamer:
+            buffer += new_text
+            buffer = buffer.replace("<|im_end|>", "")
+            time.sleep(0.01)
+            yield buffer
     # --- Text and TTS branch ---
     tts_prefix = "@tts"
         gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
     ],
     examples=[
         ["@tts2 What causes rainbows to form?"],
         ["@image Chocolate dripping from a donut"],
         ["@3d A birthday cupcake with cherry"],
         [{"text": "Summarize the letter", "files": ["examples/1.png"]}],
         [{"text": "@yolo", "files": ["examples/yolo.jpeg"]}],
+        ["@rAgent Explain how a binary search algorithm works."],
         ["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
         ["@tts1 Explain Tower of Hanoi"],
+        ["@phi4 What is depicted in this image?"],  # Example for @phi4
     ],
     cache_examples=False,
     type="messages",