KasKniesmeijer committed
Commit ff6b5fc · 1 Parent(s): cab1df1

improved code

Files changed (1)
  1. app.py +49 -9
app.py CHANGED
@@ -1,31 +1,71 @@
-import gradio as gr
 import torch
+from PIL import Image
 from transformers import AutoProcessor, AutoModelForVision2Seq
+from transformers.image_utils import load_image
+import numpy as np
+import gradio as gr
 
-# Set the device (CPU or CUDA)
+# Set the device (GPU or CPU)
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
 # Initialize processor and model
 processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
 model = AutoModelForVision2Seq.from_pretrained(
     "HuggingFaceTB/SmolVLM-Instruct",
-    torch_dtype=torch.bfloat16,
+    torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
     _attn_implementation="flash_attention_2" if DEVICE == "cuda" else "eager",
 ).to(DEVICE)
 
 
 # Define the function to answer questions
 def answer_question(image, question):
-    inputs = processor(images=image, text=question, return_tensors="pt").to(DEVICE)
-    outputs = model.generate(**inputs)
-    answer = processor.batch_decode(outputs, skip_special_tokens=True)[0]
-    return answer
+    # Check that an image is provided
+    if image is None:
+        return "Error: Please upload an image."
+
+    # Convert a NumPy array to a PIL Image if necessary
+    try:
+        if isinstance(image, np.ndarray):
+            image = Image.fromarray(image)
+    except Exception as e:
+        return f"Error: Unable to process the image. {str(e)}"
+
+    # Ensure a question is provided
+    if not question.strip():
+        return "Error: Please provide a question."
+
+    # Build the chat-style input message for the model
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": question},
+            ],
+        },
+    ]
+
+    # Apply the chat template (assumes the processor supports chat-based input)
+    try:
+        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+        inputs = processor(text=prompt, images=[image], return_tensors="pt").to(DEVICE)
+    except Exception as e:
+        return f"Error: Failed to prepare inputs. {str(e)}"
+
+    # Generate the output
+    try:
+        generated_ids = model.generate(**inputs, max_new_tokens=500)
+        generated_texts = processor.batch_decode(
+            generated_ids, skip_special_tokens=True
+        )
+        return generated_texts[0]
+    except Exception as e:
+        return f"Error: Failed to generate output. {str(e)}"
 
 
-# Gradio interface
 interface = gr.Interface(
     fn=answer_question,
-    inputs=["image", "text"],
+    inputs=["image", "text"],  # Image and text inputs
     outputs="text",
     title="SmolVLM - Vision-Language Question Answering",
     description="Upload an image and ask a question to get an answer powered by SmolVLM.",