prithivMLmods committed
Commit 40825af · verified · 1 Parent(s): 035efc4

Update app.py

Files changed (1)
  1. app.py +176 -250
app.py CHANGED
@@ -9,7 +9,6 @@ from threading import Thread
9
  import base64
10
  import shutil
11
  import re
12
- from io import BytesIO
13
 
14
  import gradio as gr
15
  import spaces
@@ -18,6 +17,7 @@ import numpy as np
18
  from PIL import Image
19
  import edge_tts
20
  import trimesh
 
21
 
22
  import supervision as sv
23
  from ultralytics import YOLO as YOLODetector
@@ -36,17 +36,7 @@ from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
36
  from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
37
  from diffusers.utils import export_to_ply
38
 
39
- # Additional import for Phi-4 multimodality (audio support)
40
- import soundfile as sf
41
-
42
- # Install additional dependencies if needed
43
- os.system('pip install backoff')
44
-
45
- # --- File validation constants ---
46
- IMAGE_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.bmp', '.gif']
47
- AUDIO_EXTENSIONS = ['.wav', '.mp3', '.flac', '.ogg']
48
-
49
- # --- Global constants and helper functions ---
50
 
51
  MAX_SEED = np.iinfo(np.int32).max
52
 
@@ -56,26 +46,12 @@ def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
56
  return seed
57
 
58
  def glb_to_data_url(glb_path: str) -> str:
59
- """
60
- Reads a GLB file from disk and returns a data URL with a base64 encoded representation.
61
- """
62
  with open(glb_path, "rb") as f:
63
  data = f.read()
64
  b64_data = base64.b64encode(data).decode("utf-8")
65
  return f"data:model/gltf-binary;base64,{b64_data}"
66
 
67
- def load_audio_file(file):
68
- """
69
- Loads an audio file. If file is a string path, it reads directly.
70
- Otherwise, assumes file is a file-like object.
71
- """
72
- if isinstance(file, str):
73
- audio, samplerate = sf.read(file)
74
- else:
75
- audio, samplerate = sf.read(BytesIO(file.read()))
76
- return audio, samplerate
77
-
78
- # --- Model class for Text-to-3D Generation (ShapE) ---
79
 
80
  class Model:
81
  def __init__(self):
@@ -131,7 +107,7 @@ class Model:
131
  export_to_ply(images[0], ply_path.name)
132
  return self.to_glb(ply_path.name)
133
 
134
- # --- New Tools for Web Functionality using DuckDuckGo and smolagents ---
135
 
136
  from typing import Any, Optional
137
  from smolagents.tools import Tool
@@ -139,43 +115,38 @@ import duckduckgo_search
139
 
140
  class DuckDuckGoSearchTool(Tool):
141
  name = "web_search"
142
- description = "Performs a duckduckgo web search based on your query then returns the top search results."
143
- inputs = {'query': {'type': 'string', 'description': 'The search query to perform.'}}
144
  output_type = "string"
145
 
146
  def __init__(self, max_results=10, **kwargs):
147
  super().__init__()
148
  self.max_results = max_results
149
- try:
150
- from duckduckgo_search import DDGS
151
- except ImportError as e:
152
- raise ImportError("Install duckduckgo-search via pip.") from e
153
  self.ddgs = DDGS(**kwargs)
154
 
155
  def forward(self, query: str) -> str:
156
  results = self.ddgs.text(query, max_results=self.max_results)
157
  if len(results) == 0:
158
  raise Exception("No results found! Try a less restrictive query.")
159
- postprocessed_results = [f"[{result['title']}]({result['href']})\n{result['body']}" for result in results]
 
 
160
  return "## Search Results\n\n" + "\n\n".join(postprocessed_results)
161
 
162
  class VisitWebpageTool(Tool):
163
  name = "visit_webpage"
164
- description = "Visits a webpage at the given URL and returns its content as markdown."
165
- inputs = {'url': {'type': 'string', 'description': 'The URL of the webpage to visit.'}}
166
  output_type = "string"
167
 
168
  def __init__(self, *args, **kwargs):
169
  self.is_initialized = False
170
 
171
  def forward(self, url: str) -> str:
172
- try:
173
- import requests
174
- from markdownify import markdownify
175
- from requests.exceptions import RequestException
176
- from smolagents.utils import truncate_content
177
- except ImportError as e:
178
- raise ImportError("Install markdownify and requests via pip.") from e
179
  try:
180
  response = requests.get(url, timeout=20)
181
  response.raise_for_status()
@@ -183,13 +154,11 @@ class VisitWebpageTool(Tool):
183
  markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
184
  return truncate_content(markdown_content, 10000)
185
  except requests.exceptions.Timeout:
186
- return "The request timed out. Please try again later."
187
- except RequestException as e:
188
- return f"Error fetching the webpage: {str(e)}"
189
- except Exception as e:
190
- return f"Unexpected error: {str(e)}"
191
 
192
- # --- rAgent Reasoning using Llama mode OpenAI ---
193
 
194
  from openai import OpenAI
195
 
@@ -200,13 +169,11 @@ ragent_client = OpenAI(
200
  )
201
 
202
  SYSTEM_PROMPT = """
203
- "You are an expert assistant who solves tasks using Python code. Follow these steps:
204
- 1. Thought: Explain your reasoning and plan.
205
- 2. Code: Write Python code to implement your solution.
206
- 3. Observation: Analyze the output.
207
- 4. Final Answer: Provide a concise conclusion.
208
-
209
- Task: {task}"
210
  """
211
 
212
  def ragent_reasoning(prompt: str, history: list[dict], max_tokens: int = 2048, temperature: float = 0.7, top_p: float = 0.95):
@@ -219,44 +186,23 @@ def ragent_reasoning(prompt: str, history: list[dict], max_tokens: int = 2048, t
219
  messages.append({"role": "user", "content": prompt})
220
  response = ""
221
  stream = ragent_client.chat.completions.create(
222
- model="meta-llama/Meta-Llama-3.1-8B-Instruct",
223
- max_tokens=max_tokens,
224
- stream=True,
225
- temperature=temperature,
226
- top_p=top_p,
227
- messages=messages,
228
  )
229
  for message in stream:
230
- token = message.choices[0].delta.content
231
- response += token
232
- yield response
233
 
234
- # --- Gradio UI configuration ---
235
-
236
- DESCRIPTION = """
237
- # Agent Dino 🌠
238
- """
239
-
240
- css = '''
241
- h1 {
242
- text-align: center;
243
- display: block;
244
- }
245
- #duplicate-button {
246
- margin: auto;
247
- color: #fff;
248
- background: #1565c0;
249
- border-radius: 100vh;
250
- }
251
- '''
252
-
253
- MAX_MAX_NEW_TOKENS = 2048
254
- DEFAULT_MAX_NEW_TOKENS = 1024
255
- MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
256
 
257
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
258
 
259
- # --- Load Models and Pipelines for Chat, Image, and Multimodal Processing ---
260
  model_id = "prithivMLmods/FastThink-0.5B-Tiny"
261
  tokenizer = AutoTokenizer.from_pretrained(model_id)
262
  model = AutoModelForCausalLM.from_pretrained(
@@ -266,12 +212,8 @@ model = AutoModelForCausalLM.from_pretrained(
266
  )
267
  model.eval()
268
 
269
- TTS_VOICES = [
270
- "en-US-JennyNeural",
271
- "en-US-GuyNeural",
272
- ]
273
-
274
- MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
275
  processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
276
  model_m = Qwen2VLForConditionalGeneration.from_pretrained(
277
  MODEL_ID,
@@ -279,24 +221,20 @@ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
279
  torch_dtype=torch.float16
280
  ).to("cuda").eval()
281
 
282
- async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
283
- communicate = edge_tts.Communicate(text, voice)
284
- await communicate.save(output_file)
285
- return output_file
286
-
287
- def clean_chat_history(chat_history):
288
- cleaned = []
289
- for msg in chat_history:
290
- if isinstance(msg, dict) and isinstance(msg.get("content"), str):
291
- cleaned.append(msg)
292
- return cleaned
293
 
 
294
  MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")
295
- MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
296
- USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
297
- ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
298
- BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1"))
299
-
300
  sd_pipe = StableDiffusionXLPipeline.from_pretrained(
301
  MODEL_ID_SD,
302
  torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
@@ -306,10 +244,33 @@ sd_pipe = StableDiffusionXLPipeline.from_pretrained(
306
  sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
307
  if torch.cuda.is_available():
308
  sd_pipe.text_encoder = sd_pipe.text_encoder.half()
309
- if USE_TORCH_COMPILE:
310
- sd_pipe.compile()
311
- if ENABLE_CPU_OFFLOAD:
312
- sd_pipe.enable_model_cpu_offload()
313
 
314
  def save_image(img: Image.Image) -> str:
315
  unique_name = str(uuid.uuid4()) + ".png"
@@ -346,11 +307,11 @@ def generate_image_fn(
346
  if use_resolution_binning:
347
  options["use_resolution_binning"] = True
348
  images = []
349
- for i in range(0, num_images, BATCH_SIZE):
350
  batch_options = options.copy()
351
- batch_options["prompt"] = options["prompt"][i:i+BATCH_SIZE]
352
- if "negative_prompt" in batch_options and batch_options["negative_prompt"] is not None:
353
- batch_options["negative_prompt"] = options["negative_prompt"][i:i+BATCH_SIZE]
354
  if device.type == "cuda":
355
  with torch.autocast("cuda", dtype=torch.float16):
356
  outputs = sd_pipe(**batch_options)
@@ -373,11 +334,6 @@ def generate_3d_fn(
373
  glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
374
  return glb_path, seed
375
 
376
- YOLO_MODEL_REPO = "strangerzonehf/Flux-Ultimate-LoRA-Collection"
377
- YOLO_CHECKPOINT_NAME = "images/demo.pt"
378
- yolo_model_path = hf_hub_download(repo_id=YOLO_MODEL_REPO, filename=YOLO_CHECKPOINT_NAME)
379
- yolo_detector = YOLODetector(yolo_model_path)
380
-
381
  def detect_objects(image: np.ndarray):
382
  results = yolo_detector(image, verbose=False)[0]
383
  detections = sv.Detections.from_ultralytics(results).with_nms()
@@ -388,57 +344,7 @@ def detect_objects(image: np.ndarray):
388
  annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections)
389
  return Image.fromarray(annotated_image)
390
 
391
- # --- Phi-4 Multimodal Model Setup with Text Streaming ---
392
- phi4_model_path = "microsoft/Phi-4-multimodal-instruct"
393
- phi4_processor = AutoProcessor.from_pretrained(phi4_model_path, trust_remote_code=True)
394
- phi4_model = AutoModelForCausalLM.from_pretrained(
395
- phi4_model_path,
396
- device_map="auto",
397
- torch_dtype="auto",
398
- trust_remote_code=True,
399
- _attn_implementation="eager",
400
- )
401
-
402
- def process_phi4(input_type: str, file: str, question: str, max_new_tokens: int = 200):
403
- """
404
- Process an image or audio input with the Phi-4 multimodal model.
405
- Expects input_type to be either 'image' or 'audio' and file is a file path.
406
- """
407
- user_prompt = '<|user|>'
408
- assistant_prompt = '<|assistant|>'
409
- prompt_suffix = '<|end|>'
410
-
411
- if not file or not question:
412
- yield "Please upload a file and provide a question."
413
- return
414
-
415
- try:
416
- if input_type == "image":
417
- prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
418
- image = load_image(file)
419
- inputs = phi4_processor(text=prompt, images=image, return_tensors='pt').to(phi4_model.device)
420
- elif input_type == "audio":
421
- prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
422
- audio, samplerate = load_audio_file(file)
423
- inputs = phi4_processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(phi4_model.device)
424
- else:
425
- yield "Invalid input type selected. Use 'image' or 'audio'."
426
- return
427
- except Exception as e:
428
- yield f"Error loading file: {str(e)}"
429
- return
430
-
431
- streamer = TextIteratorStreamer(phi4_processor, skip_prompt=True, skip_special_tokens=True)
432
- generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
433
- thread = Thread(target=phi4_model.generate, kwargs=generation_kwargs)
434
- thread.start()
435
- buffer = ""
436
- yield "🤔 Thinking..."
437
- for new_text in streamer:
438
- buffer += new_text
439
- buffer = buffer.replace("<|im_end|>", "")
440
- time.sleep(0.01)
441
- yield buffer
442
 
443
  @spaces.GPU
444
  def generate(
@@ -450,58 +356,13 @@ def generate(
450
  top_k: int = 50,
451
  repetition_penalty: float = 1.2,
452
  ):
453
- """
454
- Generates chatbot responses with support for multimodal input and special commands.
455
- Special commands include:
456
- - "@tts1" or "@tts2": Text-to-speech.
457
- - "@image": Image generation using the SDXL pipeline.
458
- - "@3d": 3D model generation using the ShapE pipeline.
459
- - "@web": Web search or webpage visit.
460
- - "@ragent": Reasoning chain using Llama mode.
461
- - "@yolo": Object detection using YOLO.
462
- - "@phi4": Processes image or audio inputs with the Phi-4 model and streams text output.
463
- """
464
  text = input_dict["text"]
465
  files = input_dict.get("files", [])
466
 
467
- # --- Phi-4 Multimodal branch with text streaming ---
468
- if text.strip().lower().startswith("@phi4"):
469
- parts = text.strip().split(maxsplit=2)
470
- if len(parts) < 3:
471
- yield "Error: Please provide input type and a question. Format: '@phi4 [image|audio] <your question>'"
472
- return
473
- input_type = parts[1].lower()
474
- question = parts[2]
475
-
476
- if not files or len(files) == 0:
477
- yield "Error: Please attach an image or audio file for Phi-4 processing."
478
- return
479
-
480
- if len(files) > 1:
481
- yield "Warning: Multiple files attached. Only the first file will be processed."
482
-
483
- file_input = files[0] # This is a string path from gr.MultimodalTextbox
484
-
485
- extension = os.path.splitext(file_input)[1].lower()
486
- if input_type == "image" and extension not in IMAGE_EXTENSIONS:
487
- yield f"Error: Attached file is not an image. Expected extensions: {', '.join(IMAGE_EXTENSIONS)}"
488
- return
489
- elif input_type == "audio" and extension not in AUDIO_EXTENSIONS:
490
- yield f"Error: Attached file is not an audio file. Expected extensions: {', '.join(AUDIO_EXTENSIONS)}"
491
- return
492
-
493
- yield "🔄 Processing multimodal input with Phi-4..."
494
- try:
495
- for partial in process_phi4(input_type, file_input, question):
496
- yield partial
497
- except Exception as e:
498
- yield f"Error processing file: {str(e)}"
499
- return
500
-
501
- # --- Other branches remain unchanged ---
502
  if text.strip().lower().startswith("@3d"):
503
  prompt = text[len("@3d"):].strip()
504
- yield "🌀 Hold tight, generating a 3D mesh GLB file....."
505
  glb_path, used_seed = generate_3d_fn(
506
  prompt=prompt,
507
  seed=1,
@@ -518,25 +379,20 @@ def generate(
518
  yield gr.File(new_filepath)
519
  return
520
 
 
521
  if text.strip().lower().startswith("@image"):
522
  prompt = text[len("@image"):].strip()
523
  yield "🪧 Generating image..."
524
  image_paths, used_seed = generate_image_fn(
525
  prompt=prompt,
526
- negative_prompt="",
527
- use_negative_prompt=False,
528
  seed=1,
529
- width=1024,
530
- height=1024,
531
- guidance_scale=3,
532
- num_inference_steps=25,
533
  randomize_seed=True,
534
- use_resolution_binning=True,
535
  num_images=1,
536
  )
537
  yield gr.Image(image_paths[0])
538
  return
539
 
 
540
  if text.strip().lower().startswith("@web"):
541
  web_command = text[len("@web"):].strip()
542
  if web_command.lower().startswith("visit"):
@@ -547,30 +403,29 @@ def generate(
547
  yield content
548
  else:
549
  query = web_command
550
- yield "🧤 Performing a web search ..."
551
  searcher = DuckDuckGoSearchTool()
552
  results = searcher.forward(query)
553
  yield results
554
  return
555
 
 
556
  if text.strip().lower().startswith("@ragent"):
557
  prompt = text[len("@ragent"):].strip()
558
- yield "📝 Initiating reasoning chain using Llama mode..."
559
  for partial in ragent_reasoning(prompt, clean_chat_history(chat_history)):
560
  yield partial
561
  return
562
 
 
563
  if text.strip().lower().startswith("@yolo"):
564
- yield "🔍 Running object detection with YOLO..."
565
  if not files or len(files) == 0:
566
- yield "Error: Please attach an image for YOLO object detection."
567
  return
568
  input_file = files[0]
569
  try:
570
- if isinstance(input_file, str):
571
- pil_image = Image.open(input_file)
572
- else:
573
- pil_image = Image.open(input_file)
574
  except Exception as e:
575
  yield f"Error loading image: {str(e)}"
576
  return
@@ -579,9 +434,63 @@ def generate(
579
  yield gr.Image(result_img)
580
  return
581
 
582
  tts_prefix = "@tts"
583
  is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
584
  voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
 
585
  if is_tts and voice_index:
586
  voice = TTS_VOICES[voice_index - 1]
587
  text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
@@ -591,13 +500,9 @@ def generate(
591
  text = text.replace(tts_prefix, "").strip()
592
  conversation = clean_chat_history(chat_history)
593
  conversation.append({"role": "user", "content": text})
 
594
  if files:
595
- if len(files) > 1:
596
- images = [load_image(file) for file in files]
597
- elif len(files) == 1:
598
- images = [load_image(files[0])]
599
- else:
600
- images = []
601
  messages = [{
602
  "role": "user",
603
  "content": [
@@ -611,6 +516,7 @@ def generate(
611
  generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
612
  thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
613
  thread.start()
 
614
  buffer = ""
615
  yield "🤔 Thinking..."
616
  for new_text in streamer:
@@ -622,7 +528,7 @@ def generate(
622
  input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
623
  if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
624
  input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
625
- gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
626
  input_ids = input_ids.to(model.device)
627
  streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
628
  generation_kwargs = {
@@ -638,43 +544,63 @@ def generate(
638
  }
639
  t = Thread(target=model.generate, kwargs=generation_kwargs)
640
  t.start()
 
641
  outputs = []
642
  for new_text in streamer:
643
  outputs.append(new_text)
644
  yield "".join(outputs)
 
645
  final_response = "".join(outputs)
646
  yield final_response
 
647
  if is_tts and voice:
648
  output_file = asyncio.run(text_to_speech(final_response, voice))
649
  yield gr.Audio(output_file, autoplay=True)
650
 
651
  demo = gr.ChatInterface(
652
  fn=generate,
653
  additional_inputs=[
654
  gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS),
655
  gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6),
656
- gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
657
  gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50),
658
  gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
659
  ],
660
  examples=[
661
- [{"text": "@phi4 Solve the problem", "files": ["examples/math.webp"]}],
662
- [{"text": "@phi4 Transcribe the audio to text.", "files": ["examples/harvard.wav"]}],
663
  ["@tts2 What causes rainbows to form?"],
664
  ["@image Chocolate dripping from a donut"],
665
  ["@3d A birthday cupcake with cherry"],
666
  [{"text": "Summarize the letter", "files": ["examples/1.png"]}],
667
  [{"text": "@yolo", "files": ["examples/yolo.jpeg"]}],
668
- ["@ragent Explain how a binary search algorithm works."],
669
- ["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
670
  ["@tts1 Explain Tower of Hanoi"],
 
 
671
  ],
672
  cache_examples=False,
673
  type="messages",
674
  description=DESCRIPTION,
675
  css=css,
676
  fill_height=True,
677
- textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "audio"], file_count="multiple", placeholder="@tts1, @tts2, @image, @3d, @ragent, @web, @yolo, @phi4 - audio, image, or plain text"),
678
  stop_btn="Stop Generation",
679
  multimodal=True,
680
  )
 
9
  import base64
10
  import shutil
11
  import re
 
12
 
13
  import gradio as gr
14
  import spaces
 
17
  from PIL import Image
18
  import edge_tts
19
  import trimesh
20
+ import soundfile as sf # Added for audio processing with Phi-4
21
 
22
  import supervision as sv
23
  from ultralytics import YOLO as YOLODetector
 
36
  from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
37
  from diffusers.utils import export_to_ply
38
 
39
+ # Global constants and helper functions
40
 
41
  MAX_SEED = np.iinfo(np.int32).max
42
 
 
46
  return seed
47
 
48
  def glb_to_data_url(glb_path: str) -> str:
49
  with open(glb_path, "rb") as f:
50
  data = f.read()
51
  b64_data = base64.b64encode(data).decode("utf-8")
52
  return f"data:model/gltf-binary;base64,{b64_data}"
53
 
54
+ # Model class for Text-to-3D Generation (ShapE)
55
 
56
  class Model:
57
  def __init__(self):
 
107
  export_to_ply(images[0], ply_path.name)
108
  return self.to_glb(ply_path.name)
109
 
110
+ # Web Tools using DuckDuckGo and smolagents
111
 
112
  from typing import Any, Optional
113
  from smolagents.tools import Tool
 
115
 
116
  class DuckDuckGoSearchTool(Tool):
117
  name = "web_search"
118
+ description = "Performs a duckduckgo web search and returns the top results."
119
+ inputs = {'query': {'type': 'string', 'description': 'The search query.'}}
120
  output_type = "string"
121
 
122
  def __init__(self, max_results=10, **kwargs):
123
  super().__init__()
124
  self.max_results = max_results
125
+ from duckduckgo_search import DDGS
126
  self.ddgs = DDGS(**kwargs)
127
 
128
  def forward(self, query: str) -> str:
129
  results = self.ddgs.text(query, max_results=self.max_results)
130
  if len(results) == 0:
131
  raise Exception("No results found! Try a less restrictive query.")
132
+ postprocessed_results = [
133
+ f"[{result['title']}]({result['href']})\n{result['body']}" for result in results
134
+ ]
135
  return "## Search Results\n\n" + "\n\n".join(postprocessed_results)
136
 
137
  class VisitWebpageTool(Tool):
138
  name = "visit_webpage"
139
+ description = "Visits a webpage and returns its content as markdown."
140
+ inputs = {'url': {'type': 'string', 'description': 'The URL to visit.'}}
141
  output_type = "string"
142
 
143
  def __init__(self, *args, **kwargs):
144
  self.is_initialized = False
145
 
146
  def forward(self, url: str) -> str:
147
+ import requests
148
+ from markdownify import markdownify
149
+ from smolagents.utils import truncate_content
150
  try:
151
  response = requests.get(url, timeout=20)
152
  response.raise_for_status()
 
154
  markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
155
  return truncate_content(markdown_content, 10000)
156
  except requests.exceptions.Timeout:
157
+ return "The request timed out."
158
+ except requests.exceptions.RequestException as e:
159
+ return f"Error fetching webpage: {str(e)}"
 
 
160
 
161
+ # rAgent Reasoning using Llama mode OpenAI
162
 
163
  from openai import OpenAI
164
 
 
169
  )
170
 
171
  SYSTEM_PROMPT = """
172
+ "You are an expert assistant who solves tasks using Python code. Follow these steps:
173
+ 1. **Thought**: Explain your reasoning and plan.
174
+ 2. **Code**: Write Python code to implement your solution.
175
+ 3. **Observation**: Analyze the output and summarize results.
176
+ 4. **Final Answer**: Provide a concise conclusion."
 
 
177
  """
178
 
179
  def ragent_reasoning(prompt: str, history: list[dict], max_tokens: int = 2048, temperature: float = 0.7, top_p: float = 0.95):
 
186
  messages.append({"role": "user", "content": prompt})
187
  response = ""
188
  stream = ragent_client.chat.completions.create(
189
+ model="meta-llama/Meta-Llama-3.1-8B-Instruct",
190
+ max_tokens=max_tokens,
191
+ stream=True,
192
+ temperature=temperature,
193
+ top_p=top_p,
194
+ messages=messages,
195
  )
196
  for message in stream:
197
+ token = message.choices[0].delta.content
198
+ response += token
199
+ yield response
200
 
201
+ # Load Models
202
 
203
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
204
 
205
+ # Text-only model
206
  model_id = "prithivMLmods/FastThink-0.5B-Tiny"
207
  tokenizer = AutoTokenizer.from_pretrained(model_id)
208
  model = AutoModelForCausalLM.from_pretrained(
 
212
  )
213
  model.eval()
214
 
215
+ # Multimodal model (Qwen2-VL)
216
+ MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
217
  processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
218
  model_m = Qwen2VLForConditionalGeneration.from_pretrained(
219
  MODEL_ID,
 
221
  torch_dtype=torch.float16
222
  ).to("cuda").eval()
223
 
224
+ # Phi-4 Multimodal Model
225
+ phi4_model_path = "microsoft/Phi-4-multimodal-instruct"
226
+ phi4_processor = AutoProcessor.from_pretrained(phi4_model_path, trust_remote_code=True)
227
+ phi4_model = AutoModelForCausalLM.from_pretrained(
228
+ phi4_model_path,
229
+ device_map="auto",
230
+ torch_dtype="auto",
231
+ trust_remote_code=True,
232
+ _attn_implementation="eager",
233
+ )
234
+ phi4_model.eval()
235
 
236
+ # Stable Diffusion XL Pipeline
237
  MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")
238
  sd_pipe = StableDiffusionXLPipeline.from_pretrained(
239
  MODEL_ID_SD,
240
  torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
 
244
  sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
245
  if torch.cuda.is_available():
246
  sd_pipe.text_encoder = sd_pipe.text_encoder.half()
247
+
248
+ # YOLO Object Detection
249
+ YOLO_MODEL_REPO = "strangerzonehf/Flux-Ultimate-LoRA-Collection"
250
+ YOLO_CHECKPOINT_NAME = "images/demo.pt"
251
+ yolo_model_path = hf_hub_download(repo_id=YOLO_MODEL_REPO, filename=YOLO_CHECKPOINT_NAME)
252
+ yolo_detector = YOLODetector(yolo_model_path)
253
+
254
+ # TTS Voices
255
+ TTS_VOICES = ["en-US-JennyNeural", "en-US-GuyNeural"]
256
+
257
+ MAX_MAX_NEW_TOKENS = 2048
258
+ DEFAULT_MAX_NEW_TOKENS = 1024
259
+ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
260
+
261
+ # Utility Functions
262
+
263
+ async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
264
+ communicate = edge_tts.Communicate(text, voice)
265
+ await communicate.save(output_file)
266
+ return output_file
267
+
268
+ def clean_chat_history(chat_history):
269
+ cleaned = []
270
+ for msg in chat_history:
271
+ if isinstance(msg, dict) and isinstance(msg.get("content"), str):
272
+ cleaned.append(msg)
273
+ return cleaned
274
 
275
  def save_image(img: Image.Image) -> str:
276
  unique_name = str(uuid.uuid4()) + ".png"
 
307
  if use_resolution_binning:
308
  options["use_resolution_binning"] = True
309
  images = []
310
+ for i in range(0, num_images, 1): # Simplified batching
311
  batch_options = options.copy()
312
+ batch_options["prompt"] = options["prompt"][i:i+1]
313
+ if "negative_prompt" in batch_options and batch_options["negative_prompt"]:
314
+ batch_options["negative_prompt"] = options["negative_prompt"][i:i+1]
315
  if device.type == "cuda":
316
  with torch.autocast("cuda", dtype=torch.float16):
317
  outputs = sd_pipe(**batch_options)
 
334
  glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
335
  return glb_path, seed
336
 
337
  def detect_objects(image: np.ndarray):
338
  results = yolo_detector(image, verbose=False)[0]
339
  detections = sv.Detections.from_ultralytics(results).with_nms()
 
344
  annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections)
345
  return Image.fromarray(annotated_image)
346
 
347
+ # Chat Generation Function with @phi4 Added
348
 
349
  @spaces.GPU
350
  def generate(
 
356
  top_k: int = 50,
357
  repetition_penalty: float = 1.2,
358
  ):
359
  text = input_dict["text"]
360
  files = input_dict.get("files", [])
361
 
362
+ # --- 3D Generation ---
363
  if text.strip().lower().startswith("@3d"):
364
  prompt = text[len("@3d"):].strip()
365
+ yield "🌀 Generating 3D mesh GLB file..."
366
  glb_path, used_seed = generate_3d_fn(
367
  prompt=prompt,
368
  seed=1,
 
379
  yield gr.File(new_filepath)
380
  return
381
 
382
+ # --- Image Generation ---
383
  if text.strip().lower().startswith("@image"):
384
  prompt = text[len("@image"):].strip()
385
  yield "🪧 Generating image..."
386
  image_paths, used_seed = generate_image_fn(
387
  prompt=prompt,
 
 
388
  seed=1,
389
  randomize_seed=True,
 
390
  num_images=1,
391
  )
392
  yield gr.Image(image_paths[0])
393
  return
394
 
395
+ # --- Web Search/Visit ---
396
  if text.strip().lower().startswith("@web"):
397
  web_command = text[len("@web"):].strip()
398
  if web_command.lower().startswith("visit"):
 
403
  yield content
404
  else:
405
  query = web_command
406
+ yield "🧤 Performing web search..."
407
  searcher = DuckDuckGoSearchTool()
408
  results = searcher.forward(query)
409
  yield results
410
  return
411
 
412
+ # --- rAgent Reasoning ---
413
  if text.strip().lower().startswith("@ragent"):
414
  prompt = text[len("@ragent"):].strip()
415
+ yield "📝 Initiating reasoning chain..."
416
  for partial in ragent_reasoning(prompt, clean_chat_history(chat_history)):
417
  yield partial
418
  return
419
 
420
+ # --- YOLO Object Detection ---
421
  if text.strip().lower().startswith("@yolo"):
422
+ yield "🔍 Running object detection..."
423
  if not files or len(files) == 0:
424
+ yield "Error: Please attach an image for YOLO."
425
  return
426
  input_file = files[0]
427
  try:
428
+ pil_image = Image.open(input_file)
429
  except Exception as e:
430
  yield f"Error loading image: {str(e)}"
431
  return
 
434
  yield gr.Image(result_img)
435
  return
436
 
437
+ # --- Phi-4 Multimodal Branch ---
438
+ if text.strip().lower().startswith("@phi4"):
439
+ parts = text[len("@phi4"):].strip().split(maxsplit=1)
440
+ if len(parts) < 2:
441
+ yield "Error: Specify input type and question, e.g., '@phi4 image What is this?'"
442
+ return
443
+ input_type = parts[0].lower()
444
+ question = parts[1]
445
+
446
+ if input_type not in ["image", "audio"]:
447
+ yield "Error: Input type must be 'image' or 'audio'."
448
+ return
449
+
450
+ if not files or len(files) == 0:
451
+ yield "Error: Please attach a file for Phi-4 processing."
452
+ return
453
+
454
+ if len(files) > 1:
455
+ yield "Warning: Multiple files attached. Using the first one."
456
+
457
+ file_input = files[0]
458
+
459
+ try:
460
+ if input_type == "image":
461
+ prompt = f'<|user|><|image_1|>{question}<|end|><|assistant|>'
462
+ image = Image.open(file_input)
463
+ inputs = phi4_processor(text=prompt, images=image, return_tensors='pt').to(phi4_model.device)
464
+ elif input_type == "audio":
465
+ prompt = f'<|user|><|audio_1|>{question}<|end|><|assistant|>'
466
+ audio, samplerate = sf.read(file_input)
467
+ inputs = phi4_processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(phi4_model.device)
468
+
469
+ streamer = TextIteratorStreamer(phi4_processor, skip_prompt=True, skip_special_tokens=True)
470
+ generation_kwargs = {
471
+ **inputs,
472
+ "streamer": streamer,
473
+ "max_new_tokens": max_new_tokens,
474
+ }
475
+ thread = Thread(target=phi4_model.generate, kwargs=generation_kwargs)
476
+ thread.start()
477
+
478
+ buffer = ""
479
+ yield "🤔 Thinking..."
480
+ for new_text in streamer:
481
+ buffer += new_text
482
+ buffer = buffer.replace("<|im_end|>", "")
483
+ time.sleep(0.01)
484
+ yield buffer
485
+ except Exception as e:
486
+ yield f"Error processing file: {str(e)}"
487
+ return
488
+
489
+ # --- Text and TTS Branch ---
490
  tts_prefix = "@tts"
491
  is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
492
  voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
493
+
494
  if is_tts and voice_index:
495
  voice = TTS_VOICES[voice_index - 1]
496
  text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
 
500
  text = text.replace(tts_prefix, "").strip()
501
  conversation = clean_chat_history(chat_history)
502
  conversation.append({"role": "user", "content": text})
503
+
504
  if files:
505
+ images = [load_image(image) for image in files]
506
  messages = [{
507
  "role": "user",
508
  "content": [
 
516
  generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
517
  thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
518
  thread.start()
519
+
520
  buffer = ""
521
  yield "🤔 Thinking..."
522
  for new_text in streamer:
 
528
  input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
529
  if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
530
  input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
531
+ gr.Warning(f"Trimmed input to {MAX_INPUT_TOKEN_LENGTH} tokens.")
532
  input_ids = input_ids.to(model.device)
533
  streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
534
  generation_kwargs = {
 
544
  }
545
  t = Thread(target=model.generate, kwargs=generation_kwargs)
546
  t.start()
547
+
548
  outputs = []
549
  for new_text in streamer:
550
  outputs.append(new_text)
551
  yield "".join(outputs)
552
+
553
  final_response = "".join(outputs)
554
  yield final_response
555
+
556
  if is_tts and voice:
557
  output_file = asyncio.run(text_to_speech(final_response, voice))
558
  yield gr.Audio(output_file, autoplay=True)
559
 
560
+ # Gradio Interface
561
+
562
+ DESCRIPTION = """
563
+ # Agent Dino 🌠
564
+ Multimodal chatbot with text, image, audio, 3D generation, web search, reasoning, and object detection.
565
+ """
566
+
567
+ css = '''
568
+ h1 { text-align: center; }
569
+ #duplicate-button { margin: auto; color: #fff; background: #1565c0; border-radius: 100vh; }
570
+ '''
571
+
572
  demo = gr.ChatInterface(
573
  fn=generate,
574
  additional_inputs=[
575
  gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS),
576
  gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6),
577
+ gr.Slider(label="Top-p", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
578
  gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50),
579
  gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
580
  ],
581
  examples=[
 
 
582
  ["@tts2 What causes rainbows to form?"],
583
  ["@image Chocolate dripping from a donut"],
584
  ["@3d A birthday cupcake with cherry"],
585
  [{"text": "Summarize the letter", "files": ["examples/1.png"]}],
586
  [{"text": "@yolo", "files": ["examples/yolo.jpeg"]}],
587
+ ["@rAgent Explain how a binary search algorithm works."],
588
+ ["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning?"],
589
  ["@tts1 Explain Tower of Hanoi"],
590
+ [{"text": "@phi4 image What is shown in this image?", "files": ["examples/image.jpg"]}],
591
+ [{"text": "@phi4 audio Transcribe this audio.", "files": ["examples/audio.wav"]}],
592
  ],
593
  cache_examples=False,
594
  type="messages",
595
  description=DESCRIPTION,
596
  css=css,
597
  fill_height=True,
598
+ textbox=gr.MultimodalTextbox(
599
+ label="Query Input",
600
+ file_types=["image", "audio"],
601
+ file_count="multiple",
602
+ placeholder="@tts1-♀, @tts2-♂, @image-image gen, @3d-3d mesh gen, @rAgent-coding, @web-websearch, @yolo-object detection, @phi4-multimodal, default-{text gen}{image-text-text}",
603
+ ),
604
  stop_btn="Stop Generation",
605
  multimodal=True,
606
  )
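
For reference, the @phi4 branch added in this commit parses the command text before building the Phi-4 prompt. Below is a minimal standalone sketch of that parsing and prompt construction; the helper names parse_phi4_command and build_phi4_prompt are hypothetical and do not appear in app.py.

# Standalone sketch with hypothetical helper names; it mirrors the parsing
# in the new @phi4 branch of generate() shown in the diff above.
def parse_phi4_command(text: str):
    # "@phi4 image What is this?" -> ("image", "What is this?")
    parts = text[len("@phi4"):].strip().split(maxsplit=1)
    if len(parts) < 2:
        return None  # missing input type or question
    input_type, question = parts[0].lower(), parts[1]
    if input_type not in ("image", "audio"):
        return None  # only image and audio are supported
    return input_type, question

def build_phi4_prompt(input_type: str, question: str) -> str:
    # The commit hard-codes Phi-4's chat markers around a single media placeholder.
    placeholder = "<|image_1|>" if input_type == "image" else "<|audio_1|>"
    return f"<|user|>{placeholder}{question}<|end|><|assistant|>"

if __name__ == "__main__":
    parsed = parse_phi4_command("@phi4 image What is shown in this image?")
    assert parsed == ("image", "What is shown in this image?")
    print(build_phi4_prompt(*parsed))

In the app itself, the resulting prompt is passed to phi4_processor together with the attached image or audio file, and the output is streamed back through TextIteratorStreamer, as in the branch above.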