prithivMLmods committed · verified
Commit 563a556 · 1 Parent(s): e5aa5e4

Update app.py

Files changed (1):
1. app.py +30 -23
app.py CHANGED

```diff
@@ -36,8 +36,6 @@ from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
 from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
 from diffusers.utils import export_to_ply

-os.system('pip install backoff')
-
 # Global constants and helper functions

 MAX_SEED = np.iinfo(np.int32).max
@@ -259,7 +257,15 @@ phi4_model = AutoModelForCausalLM.from_pretrained(
 # ------------------------------------------------------------------------------

 DESCRIPTION = """
-# Agent Dino 🌠
+# Agent Dino 🌠
+This chatbot supports various commands:
+- **@tts1 / @tts2:** text-to-speech
+- **@image:** image generation
+- **@3d:** 3D mesh generation
+- **@web:** web search/visit
+- **@rAgent:** reasoning chain
+- **@yolo:** object detection
+- **@phi4:** multimodal (image/audio) question answering
 """

 css = '''
@@ -469,7 +475,7 @@ def generate(
     - "@web": triggers a web search or webpage visit.
     - "@rAgent": initiates a reasoning chain using Llama mode.
     - "@yolo": triggers object detection using YOLO.
-    - **"@phi4": triggers multimodal (image/audio) processing using the Phi-4 model with streaming output.**
+    - **"@phi4": triggers multimodal (image/audio) processing using the Phi-4 model.**
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
@@ -565,7 +571,7 @@ def generate(
         yield gr.Image(result_img)
         return

-    # --- Phi-4 Multimodal branch (Image/Audio) with streaming ---
+    # --- Phi-4 Multimodal branch (Image/Audio) ---
     if text.strip().lower().startswith("@phi4"):
         question = text[len("@phi4"):].strip()
         if not files:
@@ -574,15 +580,14 @@ def generate(
         if not question:
             yield "Error: Please provide a question after @phi4."
             return
+
         # Determine input type (Image or Audio) from the first file
         input_file = files[0]
         try:
-            # If file is already a PIL Image, treat as image
             if isinstance(input_file, Image.Image):
                 input_type = "Image"
                 file_for_phi4 = input_file
             else:
-                # Try opening as image; if it fails, assume audio
                 try:
                     file_for_phi4 = Image.open(input_file)
                     input_type = "Image"
@@ -592,7 +597,7 @@
                 except Exception:
                     input_type = "Audio"
                     file_for_phi4 = input_file
-
+
             if input_type == "Image":
                 phi4_prompt = f'{phi4_user_prompt}<|image_1|>{question}{phi4_prompt_suffix}{phi4_assistant_prompt}'
                 inputs = phi4_processor(text=phi4_prompt, images=file_for_phi4, return_tensors='pt').to(phi4_model.device)
@@ -603,19 +608,22 @@
             else:
                 yield "Invalid file type for @phi4 multimodal processing."
                 return
+
+            with torch.no_grad():
+                generate_ids = phi4_model.generate(
+                    **inputs,
+                    max_new_tokens=200,
+                    num_logits_to_keep=0,
+                    streamer=streamer  # Adding text streamer
+                )
+
+            buffer = "⚛️ phi4 multimodal is initiated, hold tight"
+            for new_text in streamer:
+                buffer += new_text
+                buffer = buffer.replace("<|im_end|>", "")
+                time.sleep(0.01)
+                yield buffer

-            # Set up a streamer for the phi4 model
-            streamer_phi4 = TextIteratorStreamer(phi4_processor, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
-            generation_kwargs_phi4 = {**inputs, "streamer": streamer_phi4, "max_new_tokens": 200}
-            thread_phi4 = Thread(target=phi4_model.generate, kwargs=generation_kwargs_phi4)
-            thread_phi4.start()
-
-            outputs_phi4 = []
-            yield "🤔 Thinking..."
-            for new_text in streamer_phi4:
-                outputs_phi4.append(new_text)
-                yield "".join(outputs_phi4)
-            return

     # --- Text and TTS branch ---
     tts_prefix = "@tts"
@@ -705,16 +713,15 @@ demo = gr.ChatInterface(
         gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
     ],
     examples=[
-        [{"text": "@phi4 Solve the problem", "files": ["examples/math.webp"]}],
-        [{"text": "@phi4 Transcribe the audio to text.", "files": ["examples/harvard.wav"]}],
         ["@tts2 What causes rainbows to form?"],
         ["@image Chocolate dripping from a donut"],
         ["@3d A birthday cupcake with cherry"],
         [{"text": "Summarize the letter", "files": ["examples/1.png"]}],
         [{"text": "@yolo", "files": ["examples/yolo.jpeg"]}],
-        ["@ragent Explain how a binary search algorithm works."],
+        ["@rAgent Explain how a binary search algorithm works."],
         ["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
         ["@tts1 Explain Tower of Hanoi"],
+        ["@phi4 What is depicted in this image?"],  # Example for @phi4
     ],
     cache_examples=False,
     type="messages",
```