prithivMLmods committed
Commit 40825af · verified · 1 Parent(s): 035efc4

Update app.py

Files changed (1)
  1. app.py +176 -250
app.py CHANGED
@@ -9,7 +9,6 @@ from threading import Thread
9
  import base64
10
  import shutil
11
  import re
12
- from io import BytesIO
13
 
14
  import gradio as gr
15
  import spaces
@@ -18,6 +17,7 @@ import numpy as np
18
  from PIL import Image
19
  import edge_tts
20
  import trimesh
 
21
 
22
  import supervision as sv
23
  from ultralytics import YOLO as YOLODetector
@@ -36,17 +36,7 @@ from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
36
  from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
37
  from diffusers.utils import export_to_ply
38
 
39
- # Additional import for Phi-4 multimodality (audio support)
40
- import soundfile as sf
41
-
42
- # Install additional dependencies if needed
43
- os.system('pip install backoff')
44
-
45
- # --- File validation constants ---
46
- IMAGE_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.bmp', '.gif']
47
- AUDIO_EXTENSIONS = ['.wav', '.mp3', '.flac', '.ogg']
48
-
49
- # --- Global constants and helper functions ---
50
 
51
  MAX_SEED = np.iinfo(np.int32).max
52
 
@@ -56,26 +46,12 @@ def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
56
  return seed
57
 
58
  def glb_to_data_url(glb_path: str) -> str:
59
- """
60
- Reads a GLB file from disk and returns a data URL with a base64 encoded representation.
61
- """
62
  with open(glb_path, "rb") as f:
63
  data = f.read()
64
  b64_data = base64.b64encode(data).decode("utf-8")
65
  return f"data:model/gltf-binary;base64,{b64_data}"
66
 
67
- def load_audio_file(file):
68
- """
69
- Loads an audio file. If file is a string path, it reads directly.
70
- Otherwise, assumes file is a file-like object.
71
- """
72
- if isinstance(file, str):
73
- audio, samplerate = sf.read(file)
74
- else:
75
- audio, samplerate = sf.read(BytesIO(file.read()))
76
- return audio, samplerate
77
-
78
- # --- Model class for Text-to-3D Generation (ShapE) ---
79
 
80
  class Model:
81
  def __init__(self):
@@ -131,7 +107,7 @@ class Model:
131
  export_to_ply(images[0], ply_path.name)
132
  return self.to_glb(ply_path.name)
133
 
134
- # --- New Tools for Web Functionality using DuckDuckGo and smolagents ---
135
 
136
  from typing import Any, Optional
137
  from smolagents.tools import Tool
@@ -139,43 +115,38 @@ import duckduckgo_search
139
 
140
  class DuckDuckGoSearchTool(Tool):
141
  name = "web_search"
142
- description = "Performs a duckduckgo web search based on your query then returns the top search results."
143
- inputs = {'query': {'type': 'string', 'description': 'The search query to perform.'}}
144
  output_type = "string"
145
 
146
  def __init__(self, max_results=10, **kwargs):
147
  super().__init__()
148
  self.max_results = max_results
149
- try:
150
- from duckduckgo_search import DDGS
151
- except ImportError as e:
152
- raise ImportError("Install duckduckgo-search via pip.") from e
153
  self.ddgs = DDGS(**kwargs)
154
 
155
  def forward(self, query: str) -> str:
156
  results = self.ddgs.text(query, max_results=self.max_results)
157
  if len(results) == 0:
158
  raise Exception("No results found! Try a less restrictive query.")
159
- postprocessed_results = [f"[{result['title']}]({result['href']})\n{result['body']}" for result in results]
 
 
160
  return "## Search Results\n\n" + "\n\n".join(postprocessed_results)
161
 
162
  class VisitWebpageTool(Tool):
163
  name = "visit_webpage"
164
- description = "Visits a webpage at the given URL and returns its content as markdown."
165
- inputs = {'url': {'type': 'string', 'description': 'The URL of the webpage to visit.'}}
166
  output_type = "string"
167
 
168
  def __init__(self, *args, **kwargs):
169
  self.is_initialized = False
170
 
171
  def forward(self, url: str) -> str:
172
- try:
173
- import requests
174
- from markdownify import markdownify
175
- from requests.exceptions import RequestException
176
- from smolagents.utils import truncate_content
177
- except ImportError as e:
178
- raise ImportError("Install markdownify and requests via pip.") from e
179
  try:
180
  response = requests.get(url, timeout=20)
181
  response.raise_for_status()
@@ -183,13 +154,11 @@ class VisitWebpageTool(Tool):
183
  markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
184
  return truncate_content(markdown_content, 10000)
185
  except requests.exceptions.Timeout:
186
- return "The request timed out. Please try again later."
187
- except RequestException as e:
188
- return f"Error fetching the webpage: {str(e)}"
189
- except Exception as e:
190
- return f"Unexpected error: {str(e)}"
191
 
192
- # --- rAgent Reasoning using Llama mode OpenAI ---
193
 
194
  from openai import OpenAI
195
 
@@ -200,13 +169,11 @@ ragent_client = OpenAI(
200
  )
201
 
202
  SYSTEM_PROMPT = """
203
- "You are an expert assistant who solves tasks using Python code. Follow these steps:
204
- 1. Thought: Explain your reasoning and plan.
205
- 2. Code: Write Python code to implement your solution.
206
- 3. Observation: Analyze the output.
207
- 4. Final Answer: Provide a concise conclusion.
208
-
209
- Task: {task}"
210
  """
211
 
212
  def ragent_reasoning(prompt: str, history: list[dict], max_tokens: int = 2048, temperature: float = 0.7, top_p: float = 0.95):
@@ -219,44 +186,23 @@ def ragent_reasoning(prompt: str, history: list[dict], max_tokens: int = 2048, t
219
  messages.append({"role": "user", "content": prompt})
220
  response = ""
221
  stream = ragent_client.chat.completions.create(
222
- model="meta-llama/Meta-Llama-3.1-8B-Instruct",
223
- max_tokens=max_tokens,
224
- stream=True,
225
- temperature=temperature,
226
- top_p=top_p,
227
- messages=messages,
228
  )
229
  for message in stream:
230
- token = message.choices[0].delta.content
231
- response += token
232
- yield response
233
 
234
- # --- Gradio UI configuration ---
235
-
236
- DESCRIPTION = """
237
- # Agent Dino 🌠
238
- """
239
-
240
- css = '''
241
- h1 {
242
- text-align: center;
243
- display: block;
244
- }
245
- #duplicate-button {
246
- margin: auto;
247
- color: #fff;
248
- background: #1565c0;
249
- border-radius: 100vh;
250
- }
251
- '''
252
-
253
- MAX_MAX_NEW_TOKENS = 2048
254
- DEFAULT_MAX_NEW_TOKENS = 1024
255
- MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
256
 
257
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
258
 
259
- # --- Load Models and Pipelines for Chat, Image, and Multimodal Processing ---
260
  model_id = "prithivMLmods/FastThink-0.5B-Tiny"
261
  tokenizer = AutoTokenizer.from_pretrained(model_id)
262
  model = AutoModelForCausalLM.from_pretrained(
@@ -266,12 +212,8 @@ model = AutoModelForCausalLM.from_pretrained(
266
  )
267
  model.eval()
268
 
269
- TTS_VOICES = [
270
- "en-US-JennyNeural",
271
- "en-US-GuyNeural",
272
- ]
273
-
274
- MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
275
  processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
276
  model_m = Qwen2VLForConditionalGeneration.from_pretrained(
277
  MODEL_ID,
@@ -279,24 +221,20 @@ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
279
  torch_dtype=torch.float16
280
  ).to("cuda").eval()
281
 
282
- async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
283
- communicate = edge_tts.Communicate(text, voice)
284
- await communicate.save(output_file)
285
- return output_file
286
-
287
- def clean_chat_history(chat_history):
288
- cleaned = []
289
- for msg in chat_history:
290
- if isinstance(msg, dict) and isinstance(msg.get("content"), str):
291
- cleaned.append(msg)
292
- return cleaned
293
 
 
294
  MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")
295
- MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
296
- USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
297
- ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
298
- BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1"))
299
-
300
  sd_pipe = StableDiffusionXLPipeline.from_pretrained(
301
  MODEL_ID_SD,
302
  torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
@@ -306,10 +244,33 @@ sd_pipe = StableDiffusionXLPipeline.from_pretrained(
306
  sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
307
  if torch.cuda.is_available():
308
  sd_pipe.text_encoder = sd_pipe.text_encoder.half()
309
- if USE_TORCH_COMPILE:
310
- sd_pipe.compile()
311
- if ENABLE_CPU_OFFLOAD:
312
- sd_pipe.enable_model_cpu_offload()
313
 
314
  def save_image(img: Image.Image) -> str:
315
  unique_name = str(uuid.uuid4()) + ".png"
@@ -346,11 +307,11 @@ def generate_image_fn(
346
  if use_resolution_binning:
347
  options["use_resolution_binning"] = True
348
  images = []
349
- for i in range(0, num_images, BATCH_SIZE):
350
  batch_options = options.copy()
351
- batch_options["prompt"] = options["prompt"][i:i+BATCH_SIZE]
352
- if "negative_prompt" in batch_options and batch_options["negative_prompt"] is not None:
353
- batch_options["negative_prompt"] = options["negative_prompt"][i:i+BATCH_SIZE]
354
  if device.type == "cuda":
355
  with torch.autocast("cuda", dtype=torch.float16):
356
  outputs = sd_pipe(**batch_options)
@@ -373,11 +334,6 @@ def generate_3d_fn(
373
  glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
374
  return glb_path, seed
375
 
376
- YOLO_MODEL_REPO = "strangerzonehf/Flux-Ultimate-LoRA-Collection"
377
- YOLO_CHECKPOINT_NAME = "images/demo.pt"
378
- yolo_model_path = hf_hub_download(repo_id=YOLO_MODEL_REPO, filename=YOLO_CHECKPOINT_NAME)
379
- yolo_detector = YOLODetector(yolo_model_path)
380
-
381
  def detect_objects(image: np.ndarray):
382
  results = yolo_detector(image, verbose=False)[0]
383
  detections = sv.Detections.from_ultralytics(results).with_nms()
@@ -388,57 +344,7 @@ def detect_objects(image: np.ndarray):
388
  annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections)
389
  return Image.fromarray(annotated_image)
390
 
391
- # --- Phi-4 Multimodal Model Setup with Text Streaming ---
392
- phi4_model_path = "microsoft/Phi-4-multimodal-instruct"
393
- phi4_processor = AutoProcessor.from_pretrained(phi4_model_path, trust_remote_code=True)
394
- phi4_model = AutoModelForCausalLM.from_pretrained(
395
- phi4_model_path,
396
- device_map="auto",
397
- torch_dtype="auto",
398
- trust_remote_code=True,
399
- _attn_implementation="eager",
400
- )
401
-
402
- def process_phi4(input_type: str, file: str, question: str, max_new_tokens: int = 200):
403
- """
404
- Process an image or audio input with the Phi-4 multimodal model.
405
- Expects input_type to be either 'image' or 'audio' and file is a file path.
406
- """
407
- user_prompt = '<|user|>'
408
- assistant_prompt = '<|assistant|>'
409
- prompt_suffix = '<|end|>'
410
-
411
- if not file or not question:
412
- yield "Please upload a file and provide a question."
413
- return
414
-
415
- try:
416
- if input_type == "image":
417
- prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
418
- image = load_image(file)
419
- inputs = phi4_processor(text=prompt, images=image, return_tensors='pt').to(phi4_model.device)
420
- elif input_type == "audio":
421
- prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
422
- audio, samplerate = load_audio_file(file)
423
- inputs = phi4_processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(phi4_model.device)
424
- else:
425
- yield "Invalid input type selected. Use 'image' or 'audio'."
426
- return
427
- except Exception as e:
428
- yield f"Error loading file: {str(e)}"
429
- return
430
-
431
- streamer = TextIteratorStreamer(phi4_processor, skip_prompt=True, skip_special_tokens=True)
432
- generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
433
- thread = Thread(target=phi4_model.generate, kwargs=generation_kwargs)
434
- thread.start()
435
- buffer = ""
436
- yield "🤔 Thinking..."
437
- for new_text in streamer:
438
- buffer += new_text
439
- buffer = buffer.replace("<|im_end|>", "")
440
- time.sleep(0.01)
441
- yield buffer
442
 
443
  @spaces.GPU
444
  def generate(
@@ -450,58 +356,13 @@ def generate(
450
  top_k: int = 50,
451
  repetition_penalty: float = 1.2,
452
  ):
453
- """
454
- Generates chatbot responses with support for multimodal input and special commands.
455
- Special commands include:
456
- - "@tts1" or "@tts2": Text-to-speech.
457
- - "@image": Image generation using the SDXL pipeline.
458
- - "@3d": 3D model generation using the ShapE pipeline.
459
- - "@web": Web search or webpage visit.
460
- - "@ragent": Reasoning chain using Llama mode.
461
- - "@yolo": Object detection using YOLO.
462
- - "@phi4": Processes image or audio inputs with the Phi-4 model and streams text output.
463
- """
464
  text = input_dict["text"]
465
  files = input_dict.get("files", [])
466
 
467
- # --- Phi-4 Multimodal branch with text streaming ---
468
- if text.strip().lower().startswith("@phi4"):
469
- parts = text.strip().split(maxsplit=2)
470
- if len(parts) < 3:
471
- yield "Error: Please provide input type and a question. Format: '@phi4 [image|audio] <your question>'"
472
- return
473
- input_type = parts[1].lower()
474
- question = parts[2]
475
-
476
- if not files or len(files) == 0:
477
- yield "Error: Please attach an image or audio file for Phi-4 processing."
478
- return
479
-
480
- if len(files) > 1:
481
- yield "Warning: Multiple files attached. Only the first file will be processed."
482
-
483
- file_input = files[0] # This is a string path from gr.MultimodalTextbox
484
-
485
- extension = os.path.splitext(file_input)[1].lower()
486
- if input_type == "image" and extension not in IMAGE_EXTENSIONS:
487
- yield f"Error: Attached file is not an image. Expected extensions: {', '.join(IMAGE_EXTENSIONS)}"
488
- return
489
- elif input_type == "audio" and extension not in AUDIO_EXTENSIONS:
490
- yield f"Error: Attached file is not an audio file. Expected extensions: {', '.join(AUDIO_EXTENSIONS)}"
491
- return
492
-
493
- yield "🔄 Processing multimodal input with Phi-4..."
494
- try:
495
- for partial in process_phi4(input_type, file_input, question):
496
- yield partial
497
- except Exception as e:
498
- yield f"Error processing file: {str(e)}"
499
- return
500
-
501
- # --- Other branches remain unchanged ---
502
  if text.strip().lower().startswith("@3d"):
503
  prompt = text[len("@3d"):].strip()
504
- yield "🌀 Hold tight, generating a 3D mesh GLB file....."
505
  glb_path, used_seed = generate_3d_fn(
506
  prompt=prompt,
507
  seed=1,
@@ -518,25 +379,20 @@ def generate(
518
  yield gr.File(new_filepath)
519
  return
520
 
 
521
  if text.strip().lower().startswith("@image"):
522
  prompt = text[len("@image"):].strip()
523
  yield "🪧 Generating image..."
524
  image_paths, used_seed = generate_image_fn(
525
  prompt=prompt,
526
- negative_prompt="",
527
- use_negative_prompt=False,
528
  seed=1,
529
- width=1024,
530
- height=1024,
531
- guidance_scale=3,
532
- num_inference_steps=25,
533
  randomize_seed=True,
534
- use_resolution_binning=True,
535
  num_images=1,
536
  )
537
  yield gr.Image(image_paths[0])
538
  return
539
 
 
540
  if text.strip().lower().startswith("@web"):
541
  web_command = text[len("@web"):].strip()
542
  if web_command.lower().startswith("visit"):
@@ -547,30 +403,29 @@ def generate(
547
  yield content
548
  else:
549
  query = web_command
550
- yield "🧤 Performing a web search ..."
551
  searcher = DuckDuckGoSearchTool()
552
  results = searcher.forward(query)
553
  yield results
554
  return
555
 
 
556
  if text.strip().lower().startswith("@ragent"):
557
  prompt = text[len("@ragent"):].strip()
558
- yield "📝 Initiating reasoning chain using Llama mode..."
559
  for partial in ragent_reasoning(prompt, clean_chat_history(chat_history)):
560
  yield partial
561
  return
562
 
 
563
  if text.strip().lower().startswith("@yolo"):
564
- yield "🔍 Running object detection with YOLO..."
565
  if not files or len(files) == 0:
566
- yield "Error: Please attach an image for YOLO object detection."
567
  return
568
  input_file = files[0]
569
  try:
570
- if isinstance(input_file, str):
571
- pil_image = Image.open(input_file)
572
- else:
573
- pil_image = Image.open(input_file)
574
  except Exception as e:
575
  yield f"Error loading image: {str(e)}"
576
  return
@@ -579,9 +434,63 @@ def generate(
579
  yield gr.Image(result_img)
580
  return
581
 
582
  tts_prefix = "@tts"
583
  is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
584
  voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
 
585
  if is_tts and voice_index:
586
  voice = TTS_VOICES[voice_index - 1]
587
  text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
@@ -591,13 +500,9 @@ def generate(
591
  text = text.replace(tts_prefix, "").strip()
592
  conversation = clean_chat_history(chat_history)
593
  conversation.append({"role": "user", "content": text})
 
594
  if files:
595
- if len(files) > 1:
596
- images = [load_image(file) for file in files]
597
- elif len(files) == 1:
598
- images = [load_image(files[0])]
599
- else:
600
- images = []
601
  messages = [{
602
  "role": "user",
603
  "content": [
@@ -611,6 +516,7 @@ def generate(
611
  generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
612
  thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
613
  thread.start()
 
614
  buffer = ""
615
  yield "🤔 Thinking..."
616
  for new_text in streamer:
@@ -622,7 +528,7 @@ def generate(
622
  input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
623
  if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
624
  input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
625
- gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
626
  input_ids = input_ids.to(model.device)
627
  streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
628
  generation_kwargs = {
@@ -638,43 +544,63 @@ def generate(
638
  }
639
  t = Thread(target=model.generate, kwargs=generation_kwargs)
640
  t.start()
 
641
  outputs = []
642
  for new_text in streamer:
643
  outputs.append(new_text)
644
  yield "".join(outputs)
 
645
  final_response = "".join(outputs)
646
  yield final_response
 
647
  if is_tts and voice:
648
  output_file = asyncio.run(text_to_speech(final_response, voice))
649
  yield gr.Audio(output_file, autoplay=True)
650
 
651
  demo = gr.ChatInterface(
652
  fn=generate,
653
  additional_inputs=[
654
  gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS),
655
  gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6),
656
- gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
657
  gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50),
658
  gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
659
  ],
660
  examples=[
661
- [{"text": "@phi4 Solve the problem", "files": ["examples/math.webp"]}],
662
- [{"text": "@phi4 Transcribe the audio to text.", "files": ["examples/harvard.wav"]}],
663
  ["@tts2 What causes rainbows to form?"],
664
  ["@image Chocolate dripping from a donut"],
665
  ["@3d A birthday cupcake with cherry"],
666
  [{"text": "Summarize the letter", "files": ["examples/1.png"]}],
667
  [{"text": "@yolo", "files": ["examples/yolo.jpeg"]}],
668
- ["@ragent Explain how a binary search algorithm works."],
669
- ["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
670
  ["@tts1 Explain Tower of Hanoi"],
 
 
671
  ],
672
  cache_examples=False,
673
  type="messages",
674
  description=DESCRIPTION,
675
  css=css,
676
  fill_height=True,
677
- textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "audio"], file_count="multiple", placeholder="@tts1, @tts2, @image, @3d, @ragent, @web, @yolo, @phi4 - audio, image, or plain text"),
678
  stop_btn="Stop Generation",
679
  multimodal=True,
680
  )
 
9
  import base64
10
  import shutil
11
  import re
 
12
 
13
  import gradio as gr
14
  import spaces
 
17
  from PIL import Image
18
  import edge_tts
19
  import trimesh
20
+ import soundfile as sf # Added for audio processing with Phi-4
21
 
22
  import supervision as sv
23
  from ultralytics import YOLO as YOLODetector
 
36
  from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
37
  from diffusers.utils import export_to_ply
38
 
39
+ # Global constants and helper functions
40
 
41
  MAX_SEED = np.iinfo(np.int32).max
42
 
 
46
  return seed
47
 
48
  def glb_to_data_url(glb_path: str) -> str:
49
  with open(glb_path, "rb") as f:
50
  data = f.read()
51
  b64_data = base64.b64encode(data).decode("utf-8")
52
  return f"data:model/gltf-binary;base64,{b64_data}"
53
 
54
+ # Model class for Text-to-3D Generation (ShapE)
55
 
56
  class Model:
57
  def __init__(self):
 
107
  export_to_ply(images[0], ply_path.name)
108
  return self.to_glb(ply_path.name)
109
 
110
+ # Web Tools using DuckDuckGo and smolagents
111
 
112
  from typing import Any, Optional
113
  from smolagents.tools import Tool
 
115
 
116
  class DuckDuckGoSearchTool(Tool):
117
  name = "web_search"
118
+ description = "Performs a duckduckgo web search and returns the top results."
119
+ inputs = {'query': {'type': 'string', 'description': 'The search query.'}}
120
  output_type = "string"
121
 
122
  def __init__(self, max_results=10, **kwargs):
123
  super().__init__()
124
  self.max_results = max_results
125
+ from duckduckgo_search import DDGS
126
  self.ddgs = DDGS(**kwargs)
127
 
128
  def forward(self, query: str) -> str:
129
  results = self.ddgs.text(query, max_results=self.max_results)
130
  if len(results) == 0:
131
  raise Exception("No results found! Try a less restrictive query.")
132
+ postprocessed_results = [
133
+ f"[{result['title']}]({result['href']})\n{result['body']}" for result in results
134
+ ]
135
  return "## Search Results\n\n" + "\n\n".join(postprocessed_results)
136
 
137
  class VisitWebpageTool(Tool):
138
  name = "visit_webpage"
139
+ description = "Visits a webpage and returns its content as markdown."
140
+ inputs = {'url': {'type': 'string', 'description': 'The URL to visit.'}}
141
  output_type = "string"
142
 
143
  def __init__(self, *args, **kwargs):
144
  self.is_initialized = False
145
 
146
  def forward(self, url: str) -> str:
147
+ import requests
148
+ from markdownify import markdownify
149
+ from smolagents.utils import truncate_content
150
  try:
151
  response = requests.get(url, timeout=20)
152
  response.raise_for_status()
 
154
  markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
155
  return truncate_content(markdown_content, 10000)
156
  except requests.exceptions.Timeout:
157
+ return "The request timed out."
158
+ except requests.exceptions.RequestException as e:
159
+ return f"Error fetching webpage: {str(e)}"
 
 
160
 
161
+ # rAgent Reasoning using Llama mode OpenAI
162
 
163
  from openai import OpenAI
164
 
 
169
  )
170
 
171
  SYSTEM_PROMPT = """
172
+ "You are an expert assistant who solves tasks using Python code. Follow these steps:
173
+ 1. **Thought**: Explain your reasoning and plan.
174
+ 2. **Code**: Write Python code to implement your solution.
175
+ 3. **Observation**: Analyze the output and summarize results.
176
+ 4. **Final Answer**: Provide a concise conclusion."
 
 
177
  """
178
 
179
  def ragent_reasoning(prompt: str, history: list[dict], max_tokens: int = 2048, temperature: float = 0.7, top_p: float = 0.95):
 
186
  messages.append({"role": "user", "content": prompt})
187
  response = ""
188
  stream = ragent_client.chat.completions.create(
189
+ model="meta-llama/Meta-Llama-3.1-8B-Instruct",
190
+ max_tokens=max_tokens,
191
+ stream=True,
192
+ temperature=temperature,
193
+ top_p=top_p,
194
+ messages=messages,
195
  )
196
  for message in stream:
197
+ token = message.choices[0].delta.content
198
+ response += token
199
+ yield response
200
 
201
+ # Load Models
202
 
203
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
204
 
205
+ # Text-only model
206
  model_id = "prithivMLmods/FastThink-0.5B-Tiny"
207
  tokenizer = AutoTokenizer.from_pretrained(model_id)
208
  model = AutoModelForCausalLM.from_pretrained(
 
212
  )
213
  model.eval()
214
 
215
+ # Multimodal model (Qwen2-VL)
216
+ MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
217
  processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
218
  model_m = Qwen2VLForConditionalGeneration.from_pretrained(
219
  MODEL_ID,
 
221
  torch_dtype=torch.float16
222
  ).to("cuda").eval()
223
 
224
+ # Phi-4 Multimodal Model
225
+ phi4_model_path = "microsoft/Phi-4-multimodal-instruct"
226
+ phi4_processor = AutoProcessor.from_pretrained(phi4_model_path, trust_remote_code=True)
227
+ phi4_model = AutoModelForCausalLM.from_pretrained(
228
+ phi4_model_path,
229
+ device_map="auto",
230
+ torch_dtype="auto",
231
+ trust_remote_code=True,
232
+ _attn_implementation="eager",
233
+ )
234
+ phi4_model.eval()
235
 
236
+ # Stable Diffusion XL Pipeline
237
  MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")
238
  sd_pipe = StableDiffusionXLPipeline.from_pretrained(
239
  MODEL_ID_SD,
240
  torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
 
244
  sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
245
  if torch.cuda.is_available():
246
  sd_pipe.text_encoder = sd_pipe.text_encoder.half()
247
+
248
+ # YOLO Object Detection
249
+ YOLO_MODEL_REPO = "strangerzonehf/Flux-Ultimate-LoRA-Collection"
250
+ YOLO_CHECKPOINT_NAME = "images/demo.pt"
251
+ yolo_model_path = hf_hub_download(repo_id=YOLO_MODEL_REPO, filename=YOLO_CHECKPOINT_NAME)
252
+ yolo_detector = YOLODetector(yolo_model_path)
253
+
254
+ # TTS Voices
255
+ TTS_VOICES = ["en-US-JennyNeural", "en-US-GuyNeural"]
256
+
257
+ MAX_MAX_NEW_TOKENS = 2048
258
+ DEFAULT_MAX_NEW_TOKENS = 1024
259
+ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
260
+
261
+ # Utility Functions
262
+
263
+ async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
264
+ communicate = edge_tts.Communicate(text, voice)
265
+ await communicate.save(output_file)
266
+ return output_file
267
+
268
+ def clean_chat_history(chat_history):
269
+ cleaned = []
270
+ for msg in chat_history:
271
+ if isinstance(msg, dict) and isinstance(msg.get("content"), str):
272
+ cleaned.append(msg)
273
+ return cleaned
274
 
275
  def save_image(img: Image.Image) -> str:
276
  unique_name = str(uuid.uuid4()) + ".png"
 
307
  if use_resolution_binning:
308
  options["use_resolution_binning"] = True
309
  images = []
310
+ for i in range(0, num_images, 1): # Simplified batching
311
  batch_options = options.copy()
312
+ batch_options["prompt"] = options["prompt"][i:i+1]
313
+ if "negative_prompt" in batch_options and batch_options["negative_prompt"]:
314
+ batch_options["negative_prompt"] = options["negative_prompt"][i:i+1]
315
  if device.type == "cuda":
316
  with torch.autocast("cuda", dtype=torch.float16):
317
  outputs = sd_pipe(**batch_options)
 
334
  glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
335
  return glb_path, seed
336
 
337
  def detect_objects(image: np.ndarray):
338
  results = yolo_detector(image, verbose=False)[0]
339
  detections = sv.Detections.from_ultralytics(results).with_nms()
 
344
  annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections)
345
  return Image.fromarray(annotated_image)
346
 
347
+ # Chat Generation Function with @phi4 Added
348
 
349
  @spaces.GPU
350
  def generate(
 
356
  top_k: int = 50,
357
  repetition_penalty: float = 1.2,
358
  ):
359
  text = input_dict["text"]
360
  files = input_dict.get("files", [])
361
 
362
+ # --- 3D Generation ---
363
  if text.strip().lower().startswith("@3d"):
364
  prompt = text[len("@3d"):].strip()
365
+ yield "🌀 Generating 3D mesh GLB file..."
366
  glb_path, used_seed = generate_3d_fn(
367
  prompt=prompt,
368
  seed=1,
 
379
  yield gr.File(new_filepath)
380
  return
381
 
382
+ # --- Image Generation ---
383
  if text.strip().lower().startswith("@image"):
384
  prompt = text[len("@image"):].strip()
385
  yield "🪧 Generating image..."
386
  image_paths, used_seed = generate_image_fn(
387
  prompt=prompt,
 
 
388
  seed=1,
389
  randomize_seed=True,
 
390
  num_images=1,
391
  )
392
  yield gr.Image(image_paths[0])
393
  return
394
 
395
+ # --- Web Search/Visit ---
396
  if text.strip().lower().startswith("@web"):
397
  web_command = text[len("@web"):].strip()
398
  if web_command.lower().startswith("visit"):
 
403
  yield content
404
  else:
405
  query = web_command
406
+ yield "🧤 Performing web search..."
407
  searcher = DuckDuckGoSearchTool()
408
  results = searcher.forward(query)
409
  yield results
410
  return
411
 
412
+ # --- rAgent Reasoning ---
413
  if text.strip().lower().startswith("@ragent"):
414
  prompt = text[len("@ragent"):].strip()
415
+ yield "📝 Initiating reasoning chain..."
416
  for partial in ragent_reasoning(prompt, clean_chat_history(chat_history)):
417
  yield partial
418
  return
419
 
420
+ # --- YOLO Object Detection ---
421
  if text.strip().lower().startswith("@yolo"):
422
+ yield "🔍 Running object detection..."
423
  if not files or len(files) == 0:
424
+ yield "Error: Please attach an image for YOLO."
425
  return
426
  input_file = files[0]
427
  try:
428
+ pil_image = Image.open(input_file)
429
  except Exception as e:
430
  yield f"Error loading image: {str(e)}"
431
  return
 
434
  yield gr.Image(result_img)
435
  return
436
 
437
+ # --- Phi-4 Multimodal Branch ---
438
+ if text.strip().lower().startswith("@phi4"):
439
+ parts = text[len("@phi4"):].strip().split(maxsplit=1)
440
+ if len(parts) < 2:
441
+ yield "Error: Specify input type and question, e.g., '@phi4 image What is this?'"
442
+ return
443
+ input_type = parts[0].lower()
444
+ question = parts[1]
445
+
446
+ if input_type not in ["image", "audio"]:
447
+ yield "Error: Input type must be 'image' or 'audio'."
448
+ return
449
+
450
+ if not files or len(files) == 0:
451
+ yield "Error: Please attach a file for Phi-4 processing."
452
+ return
453
+
454
+ if len(files) > 1:
455
+ yield "Warning: Multiple files attached. Using the first one."
456
+
457
+ file_input = files[0]
458
+
459
+ try:
460
+ if input_type == "image":
461
+ prompt = f'<|user|><|image_1|>{question}<|end|><|assistant|>'
462
+ image = Image.open(file_input)
463
+ inputs = phi4_processor(text=prompt, images=image, return_tensors='pt').to(phi4_model.device)
464
+ elif input_type == "audio":
465
+ prompt = f'<|user|><|audio_1|>{question}<|end|><|assistant|>'
466
+ audio, samplerate = sf.read(file_input)
467
+ inputs = phi4_processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(phi4_model.device)
468
+
469
+ streamer = TextIteratorStreamer(phi4_processor, skip_prompt=True, skip_special_tokens=True)
470
+ generation_kwargs = {
471
+ **inputs,
472
+ "streamer": streamer,
473
+ "max_new_tokens": max_new_tokens,
474
+ }
475
+ thread = Thread(target=phi4_model.generate, kwargs=generation_kwargs)
476
+ thread.start()
477
+
478
+ buffer = ""
479
+ yield "🤔 Thinking..."
480
+ for new_text in streamer:
481
+ buffer += new_text
482
+ buffer = buffer.replace("<|im_end|>", "")
483
+ time.sleep(0.01)
484
+ yield buffer
485
+ except Exception as e:
486
+ yield f"Error processing file: {str(e)}"
487
+ return
488
+
489
+ # --- Text and TTS Branch ---
490
  tts_prefix = "@tts"
491
  is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
492
  voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
493
+
494
  if is_tts and voice_index:
495
  voice = TTS_VOICES[voice_index - 1]
496
  text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
 
500
  text = text.replace(tts_prefix, "").strip()
501
  conversation = clean_chat_history(chat_history)
502
  conversation.append({"role": "user", "content": text})
503
+
504
  if files:
505
+ images = [load_image(image) for image in files]
506
  messages = [{
507
  "role": "user",
508
  "content": [
 
516
  generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
517
  thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
518
  thread.start()
519
+
520
  buffer = ""
521
  yield "🤔 Thinking..."
522
  for new_text in streamer:
 
528
  input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
529
  if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
530
  input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
531
+ gr.Warning(f"Trimmed input to {MAX_INPUT_TOKEN_LENGTH} tokens.")
532
  input_ids = input_ids.to(model.device)
533
  streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
534
  generation_kwargs = {
 
544
  }
545
  t = Thread(target=model.generate, kwargs=generation_kwargs)
546
  t.start()
547
+
548
  outputs = []
549
  for new_text in streamer:
550
  outputs.append(new_text)
551
  yield "".join(outputs)
552
+
553
  final_response = "".join(outputs)
554
  yield final_response
555
+
556
  if is_tts and voice:
557
  output_file = asyncio.run(text_to_speech(final_response, voice))
558
  yield gr.Audio(output_file, autoplay=True)
559
 
560
+ # Gradio Interface
561
+
562
+ DESCRIPTION = """
563
+ # Agent Dino 🌠
564
+ Multimodal chatbot with text, image, audio, 3D generation, web search, reasoning, and object detection.
565
+ """
566
+
567
+ css = '''
568
+ h1 { text-align: center; }
569
+ #duplicate-button { margin: auto; color: #fff; background: #1565c0; border-radius: 100vh; }
570
+ '''
571
+
572
  demo = gr.ChatInterface(
573
  fn=generate,
574
  additional_inputs=[
575
  gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS),
576
  gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6),
577
+ gr.Slider(label="Top-p", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
578
  gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50),
579
  gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
580
  ],
581
  examples=[
 
 
582
  ["@tts2 What causes rainbows to form?"],
583
  ["@image Chocolate dripping from a donut"],
584
  ["@3d A birthday cupcake with cherry"],
585
  [{"text": "Summarize the letter", "files": ["examples/1.png"]}],
586
  [{"text": "@yolo", "files": ["examples/yolo.jpeg"]}],
587
+ ["@rAgent Explain how a binary search algorithm works."],
588
+ ["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning?"],
589
  ["@tts1 Explain Tower of Hanoi"],
590
+ [{"text": "@phi4 image What is shown in this image?", "files": ["examples/image.jpg"]}],
591
+ [{"text": "@phi4 audio Transcribe this audio.", "files": ["examples/audio.wav"]}],
592
  ],
593
  cache_examples=False,
594
  type="messages",
595
  description=DESCRIPTION,
596
  css=css,
597
  fill_height=True,
598
+ textbox=gr.MultimodalTextbox(
599
+ label="Query Input",
600
+ file_types=["image", "audio"],
601
+ file_count="multiple",
602
+ placeholder="@tts1-♀, @tts2-♂, @image-image gen, @3d-3d mesh gen, @rAgent-coding, @web-websearch, @yolo-object detection, @phi4-multimodal, default-{text gen}{image-text-text}",
603
+ ),
604
  stop_btn="Stop Generation",
605
  multimodal=True,
606
  )
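
For reference, the @phi4 branch added in this commit parses the command text before building the Phi-4 prompt. Below is a minimal standalone sketch of that parsing and prompt construction; the helper names parse_phi4_command and build_phi4_prompt are hypothetical and do not appear in app.py.

# Standalone sketch with hypothetical helper names; it mirrors the parsing
# in the new @phi4 branch of generate() shown in the diff above.
def parse_phi4_command(text: str):
    # "@phi4 image What is this?" -> ("image", "What is this?")
    parts = text[len("@phi4"):].strip().split(maxsplit=1)
    if len(parts) < 2:
        return None  # missing input type or question
    input_type, question = parts[0].lower(), parts[1]
    if input_type not in ("image", "audio"):
        return None  # only image and audio are supported
    return input_type, question

def build_phi4_prompt(input_type: str, question: str) -> str:
    # The commit hard-codes Phi-4's chat markers around a single media placeholder.
    placeholder = "<|image_1|>" if input_type == "image" else "<|audio_1|>"
    return f"<|user|>{placeholder}{question}<|end|><|assistant|>"

if __name__ == "__main__":
    parsed = parse_phi4_command("@phi4 image What is shown in this image?")
    assert parsed == ("image", "What is shown in this image?")
    print(build_phi4_prompt(*parsed))

In the app itself, the resulting prompt is passed to phi4_processor together with the attached image or audio file, and the output is streamed back through TextIteratorStreamer, as in the branch above.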