SkalskiP committed
Commit
af5888a
1 Parent(s): aa009f7

prompting with boxes added

Files changed (3)
  1. app.py +72 -47
  2. requirements.txt +1 -1
  3. utils/models.py +18 -7
app.py CHANGED
@@ -5,9 +5,10 @@ import numpy as np
 import supervision as sv
 import torch
 from PIL import Image
-from sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator
+from gradio_image_prompter import ImagePrompter

-from utils.models import load_models, CHECKPOINT_NAMES
+from utils.models import load_models, CHECKPOINT_NAMES, MODE_NAMES, \
+    MASK_GENERATION_MODE, BOX_PROMPT_MODE

 MARKDOWN = """
 # Segment Anything Model 2 🔥
@@ -27,35 +28,50 @@ MARKDOWN = """
 </div>

 Segment Anything Model 2 (SAM 2) is a foundation model designed to address promptable
-visual segmentation in both images and videos. The model extends its functionality to
-video by treating images as single-frame videos. Its design, a simple transformer
-architecture with streaming memory, enables real-time video processing. A
-model-in-the-loop data engine, which enhances the model and data through user
-interaction, was built to collect the SA-V dataset, the largest video segmentation
-dataset to date. SAM 2, trained on this extensive dataset, delivers robust performance
-across diverse tasks and visual domains.
+visual segmentation in both images and videos. **Video segmentation will be available
+soon.**
 """
-EXAMPLES = [
-    ["tiny", "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", 16],
-    ["small", "https://media.roboflow.com/notebooks/examples/dog-3.jpeg", 16],
-    ["large", "https://media.roboflow.com/notebooks/examples/dog-3.jpeg", 16],
-    ["large", "https://media.roboflow.com/notebooks/examples/dog-3.jpeg", 64],
-]

 DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 MASK_ANNOTATOR = sv.MaskAnnotator(color_lookup=sv.ColorLookup.INDEX)
-MODELS = load_models(device=DEVICE)
+IMAGE_PREDICTORS, MASK_GENERATORS = load_models(device=DEVICE)


-def process(checkpoint_dropdown, image_input, points_per_side) -> Optional[Image.Image]:
-    model = MODELS[checkpoint_dropdown]
-    mask_generator = SAM2AutomaticMaskGenerator(
-        model=model,
-        points_per_side=points_per_side)
-    image = np.array(image_input.convert("RGB"))
-    sam_result = mask_generator.generate(image)
-    detections = sv.Detections.from_sam(sam_result=sam_result)
-    return MASK_ANNOTATOR.annotate(scene=image_input, detections=detections)
+def process(
+    checkpoint_dropdown,
+    mode_dropdown,
+    image_input,
+    image_prompter_input
+) -> Optional[Image.Image]:
+    if mode_dropdown == BOX_PROMPT_MODE:
+        image_input = image_prompter_input["image"]
+        prompt = image_prompter_input["points"]
+        if len(prompt) == 0:
+            return image_input
+
+        model = IMAGE_PREDICTORS[checkpoint_dropdown]
+        image = np.array(image_input.convert("RGB"))
+        box = np.array([[x1, y1, x2, y2] for x1, y1, _, x2, y2, _ in prompt])
+
+        model.set_image(image)
+        masks, _, _ = model.predict(box=box, multimask_output=False)
+
+        # dirty fix; remove this later
+        if len(masks.shape) == 4:
+            masks = np.squeeze(masks)
+
+        detections = sv.Detections(
+            xyxy=sv.mask_to_xyxy(masks=masks),
+            mask=masks.astype(bool)
+        )
+        return MASK_ANNOTATOR.annotate(image_input, detections)
+
+    if mode_dropdown == MASK_GENERATION_MODE:
+        model = MASK_GENERATORS[checkpoint_dropdown]
+        image = np.array(image_input.convert("RGB"))
+        result = model.generate(image)
+        detections = sv.Detections.from_sam(result)
+        return MASK_ANNOTATOR.annotate(image_input, detections)


 with gr.Blocks() as demo:
@@ -67,39 +83,48 @@ with gr.Blocks() as demo:
         label="Checkpoint", info="Select a SAM2 checkpoint to use.",
         interactive=True
     )
-    points_per_side_component = gr.Slider(
-        minimum=16,
-        maximum=64,
-        value=16,
-        step=16,
-        label="Points per side",
-        info="the number of points to be sampled along one side of the image."
+    mode_dropdown_component = gr.Dropdown(
+        choices=MODE_NAMES,
+        value=MODE_NAMES[0],
+        label="Mode",
+        info="Select a mode to use. `box prompt` if you want to generate masks for "
+             "selected objects, `mask generation` if you want to generate masks "
+             "for the whole image.",
+        interactive=True
     )
     with gr.Row():
         with gr.Column():
-            image_input_component = gr.Image(type='pil', label='Upload image')
-            submit_button_component = gr.Button(value='Submit', variant='primary')
+            image_input_component = gr.Image(
+                type='pil', label='Upload image', visible=False)
+            image_prompter_input_component = ImagePrompter(
+                type='pil', label='Image prompt')
+            submit_button_component = gr.Button(
+                value='Submit', variant='primary')
         with gr.Column():
             image_output_component = gr.Image(type='pil', label='Image Output')
-    with gr.Row():
-        gr.Examples(
-            fn=process,
-            examples=EXAMPLES,
-            inputs=[
-                checkpoint_dropdown_component,
-                image_input_component,
-                points_per_side_component
-            ],
-            outputs=[image_output_component],
-            run_on_click=True
-        )

+
+    def on_mode_dropdown_change(text):
+        return [
+            gr.Image(visible=text == MASK_GENERATION_MODE),
+            ImagePrompter(visible=text == BOX_PROMPT_MODE)
+        ]
+
+    mode_dropdown_component.change(
+        on_mode_dropdown_change,
+        inputs=[mode_dropdown_component],
+        outputs=[
+            image_input_component,
+            image_prompter_input_component
+        ]
+    )
     submit_button_component.click(
         fn=process,
         inputs=[
             checkpoint_dropdown_component,
+            mode_dropdown_component,
             image_input_component,
+            image_prompter_input_component,
         ],
         outputs=[image_output_component]
     )
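Note on the new box-prompt path: the ImagePrompter value is a dict with "image" and "points" keys, and each points row appears to pack two box corners plus per-corner click labels, i.e. [x1, y1, label, x2, y2, label] (inferred from the unpacking in process). A minimal sketch of the conversion step, with that row layout as an assumption:

import numpy as np

def prompt_to_xyxy(points):
    # Keep only the coordinates from each [x1, y1, label, x2, y2, label]
    # row, yielding an (N, 4) array of xyxy boxes for SAM2ImagePredictor.
    return np.array([[x1, y1, x2, y2] for x1, y1, _, x2, y2, _ in points])

print(prompt_to_xyxy([[10, 20, 2, 110, 220, 3]]))  # [[ 10  20 110 220]]

The squeeze behind the "dirty fix" comment compensates for predict() returning masks shaped (num_boxes, 1, H, W) when several boxes are passed; dropping the singleton axis restores the (num_masks, H, W) layout sv.Detections expects.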
requirements.txt CHANGED
@@ -1,5 +1,5 @@
 samv2
 gradio
 supervision
-gradio_image_annotation
+gradio_image_prompter
 opencv-python
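The dependency swap mirrors the UI change: gradio_image_prompter supplies the box-drawing widget that replaces the old annotation component. A quick standalone smoke test, assuming the package exposes ImagePrompter exactly as app.py uses it:

import gradio as gr
from gradio_image_prompter import ImagePrompter

def echo(value):
    # value arrives as {"image": <PIL.Image>, "points": [[x1, y1, t, x2, y2, t], ...]}
    return value["image"]

gr.Interface(fn=echo, inputs=ImagePrompter(type="pil"), outputs=gr.Image(type="pil")).launch()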
utils/models.py CHANGED
@@ -1,10 +1,16 @@
-import torch
+from typing import Dict, Tuple

-from typing import Dict, Any
+import torch
+from sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator
 from sam2.build_sam import build_sam2
+from sam2.sam2_image_predictor import SAM2ImagePredictor

-CHECKPOINT_NAMES = ["tiny", "small", "base_plus", "large"]
+BOX_PROMPT_MODE = "box prompt"
+MASK_GENERATION_MODE = "mask generation"
+VIDEO_SEGMENTATION_MODE = "video segmentation"
+MODE_NAMES = [BOX_PROMPT_MODE, MASK_GENERATION_MODE]

+CHECKPOINT_NAMES = ["tiny", "small", "base_plus", "large"]
 CHECKPOINTS = {
     "tiny": ["sam2_hiera_t.yaml", "checkpoints/sam2_hiera_tiny.pt"],
     "small": ["sam2_hiera_s.yaml", "checkpoints/sam2_hiera_small.pt"],
@@ -13,8 +19,13 @@ CHECKPOINTS = {
 }


-def load_models(device: torch.device) -> Dict[str, Any]:
-    models = {}
+def load_models(
+    device: torch.device
+) -> Tuple[Dict[str, SAM2ImagePredictor], Dict[str, SAM2AutomaticMaskGenerator]]:
+    image_predictors = {}
+    mask_generators = {}
     for key, (config, checkpoint) in CHECKPOINTS.items():
-        models[key] = build_sam2(config, checkpoint, device=device, apply_postprocessing=False)
-    return models
+        model = build_sam2(config, checkpoint, device=device)
+        image_predictors[key] = SAM2ImagePredictor(sam_model=model)
+        mask_generators[key] = SAM2AutomaticMaskGenerator(model=model)
+    return image_predictors, mask_generators
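load_models now hands back two parallel dicts keyed by checkpoint name, so each checkpoint is built once and serves both modes. A short usage sketch mirroring app.py (note the new build_sam2 call also drops apply_postprocessing=False, so the library default applies):

import torch
from utils.models import load_models

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
image_predictors, mask_generators = load_models(device=device)

predictor = image_predictors["tiny"]   # SAM2ImagePredictor, for box prompts
generator = mask_generators["tiny"]    # SAM2AutomaticMaskGenerator, for whole-image masks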