KasKniesmeijer committed
Commit ff6b5fc · 1 Parent(s): cab1df1

improved code

Files changed (1)
  1. app.py +49 -9
app.py CHANGED
@@ -1,31 +1,71 @@
-import gradio as gr
 import torch
+from PIL import Image
 from transformers import AutoProcessor, AutoModelForVision2Seq
+from transformers.image_utils import load_image
+import numpy as np
+import gradio as gr
 
-# Set the device (CPU or CUDA)
+# Set the device (GPU or CPU)
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
 # Initialize processor and model
 processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
 model = AutoModelForVision2Seq.from_pretrained(
     "HuggingFaceTB/SmolVLM-Instruct",
-    torch_dtype=torch.bfloat16,
+    torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
     _attn_implementation="flash_attention_2" if DEVICE == "cuda" else "eager",
 ).to(DEVICE)
 
 
 # Define the function to answer questions
 def answer_question(image, question):
-    inputs = processor(images=image, text=question, return_tensors="pt").to(DEVICE)
-    outputs = model.generate(**inputs)
-    answer = processor.batch_decode(outputs, skip_special_tokens=True)[0]
-    return answer
+    # Check that an image is provided
+    if image is None:
+        return "Error: Please upload an image."
+
+    # Convert a NumPy array to a PIL Image if necessary
+    try:
+        if isinstance(image, np.ndarray):
+            image = Image.fromarray(image)
+    except Exception as e:
+        return f"Error: Unable to process the image. {str(e)}"
+
+    # Ensure a question is provided
+    if not question.strip():
+        return "Error: Please provide a question."
+
+    # Build the chat-style input message for the model
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": question},
+            ],
+        },
+    ]
+
+    # Apply the chat template (assumes the processor supports chat-based input)
+    try:
+        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+        inputs = processor(text=prompt, images=[image], return_tensors="pt").to(DEVICE)
+    except Exception as e:
+        return f"Error: Failed to prepare inputs. {str(e)}"
+
+    # Generate the output
+    try:
+        generated_ids = model.generate(**inputs, max_new_tokens=500)
+        generated_texts = processor.batch_decode(
+            generated_ids, skip_special_tokens=True
+        )
+        return generated_texts[0]
+    except Exception as e:
+        return f"Error: Failed to generate output. {str(e)}"
 
 
-# Gradio interface
 interface = gr.Interface(
     fn=answer_question,
-    inputs=["image", "text"],
+    inputs=["image", "text"],  # Image and text inputs
     outputs="text",
     title="SmolVLM - Vision-Language Question Answering",
     description="Upload an image and ask a question to get an answer powered by SmolVLM.",