SkalskiP committed on
Commit
7761031
1 Parent(s): aabd771

Revert "working on video inference"

This reverts commit aabd7712744df069cda860abd140284cf78b5f6d. It removes the
work-in-progress video path: the `process_video` handler and the video UI
components in app.py, the `utils/video.py` helpers, the `tqdm` dependency,
and the `video segmentation` entry in MODE_NAMES.

Files changed (4):
  1. app.py +17 -133
  2. requirements.txt +0 -1
  3. utils/models.py +1 -1
  4. utils/video.py +0 -14
app.py CHANGED
@@ -1,19 +1,14 @@
-import os
 from typing import Optional
 
-import cv2
 import gradio as gr
 import numpy as np
 import supervision as sv
 import torch
 from PIL import Image
-from tqdm import tqdm
 from gradio_image_prompter import ImagePrompter
 
 from utils.models import load_models, CHECKPOINT_NAMES, MODE_NAMES, \
-    MASK_GENERATION_MODE, BOX_PROMPT_MODE, VIDEO_SEGMENTATION_MODE
-from utils.video import create_directory, generate_unique_name
-from sam2.build_sam import build_sam2_video_predictor
+    MASK_GENERATION_MODE, BOX_PROMPT_MODE
 
 MARKDOWN = """
 # Segment Anything Model 2 🔥
@@ -36,7 +31,6 @@ Segment Anything Model 2 (SAM 2) is a foundation model designed to address promptable
 visual segmentation in both images and videos. **Video segmentation will be available
 soon.**
 """
-
 EXAMPLES = [
     ["tiny", MASK_GENERATION_MODE, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
     ["tiny", MASK_GENERATION_MODE, "https://media.roboflow.com/notebooks/examples/dog-3.jpeg", None],
@@ -47,37 +41,8 @@ DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 MASK_ANNOTATOR = sv.MaskAnnotator(color_lookup=sv.ColorLookup.INDEX)
 IMAGE_PREDICTORS, MASK_GENERATORS = load_models(device=DEVICE)
 
-SCALE_FACTOR = 0.5
-TARGET_DIRECTORY = "tmp"
-# creating video results directory
-create_directory(directory_path=TARGET_DIRECTORY)
-
-
-def on_mode_dropdown_change(text):
-    return [
-        gr.Image(visible=text == MASK_GENERATION_MODE),
-        ImagePrompter(visible=text == BOX_PROMPT_MODE),
-        gr.Video(visible=text == VIDEO_SEGMENTATION_MODE),
-        ImagePrompter(visible=text == VIDEO_SEGMENTATION_MODE),
-        gr.Button(visible=text != VIDEO_SEGMENTATION_MODE),
-        gr.Button(visible=text == VIDEO_SEGMENTATION_MODE),
-        gr.Image(visible=text != VIDEO_SEGMENTATION_MODE),
-        gr.Video(visible=text == VIDEO_SEGMENTATION_MODE)
-    ]
-
-
-def on_video_input_change(video_input):
-    if not video_input:
-        return None
-    frames_generator = sv.get_video_frames_generator(video_input)
-    frame = next(frames_generator)
-    frame = sv.scale_image(frame, SCALE_FACTOR)
-    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-    frame = Image.fromarray(frame)
-    return {'image': frame, 'points': []}
-
-
-def process_image(
+
+def process(
     checkpoint_dropdown,
     mode_dropdown,
     image_input,
@@ -114,64 +79,6 @@ def process_image(
     return MASK_ANNOTATOR.annotate(image_input, detections)
 
 
-def process_video(
-    checkpoint_dropdown,
-    mode_dropdown,
-    video_input,
-    video_prompter_input,
-    progress=gr.Progress(track_tqdm=True)
-) -> str:
-    if mode_dropdown != VIDEO_SEGMENTATION_MODE:
-        return str(video_input)
-
-    name = generate_unique_name()
-    frame_directory_path = os.path.join(TARGET_DIRECTORY, name)
-    frames_sink = sv.ImageSink(
-        target_dir_path=frame_directory_path,
-        image_name_pattern="{:05d}.jpeg"
-    )
-
-    video_info = sv.VideoInfo.from_video_path(video_input)
-    frames_generator = sv.get_video_frames_generator(video_input)
-    with frames_sink:
-        for frame in tqdm(
-            frames_generator,
-            total=video_info.total_frames,
-            desc="splitting video into frames"
-        ):
-            frame = sv.scale_image(frame, SCALE_FACTOR)
-            frames_sink.save_image(frame)
-
-    model = build_sam2_video_predictor(
-        "sam2_hiera_t.yaml",
-        "checkpoints/sam2_hiera_tiny.pt",
-        device=DEVICE
-    )
-    inference_state = model.init_state(
-        video_path=frame_directory_path,
-        offload_video_to_cpu=DEVICE == torch.device('cpu'),
-        offload_state_to_cpu=DEVICE == torch.device('cpu'),
-    )
-
-    prompt = video_prompter_input["points"]
-    points = np.array([[x1, y1] for x1, y1, _, _, _, _ in prompt])
-    labels = np.ones(len(points))
-
-    _, object_ids, mask_logits = model.add_new_points(
-        inference_state=inference_state,
-        frame_idx=0,
-        obj_id=1,
-        points=points,
-        labels=labels,
-    )
-
-    del inference_state
-    del model
-
-    video_path = os.path.join(TARGET_DIRECTORY, f"{name}.mp4")
-    return str(video_input)
-
-
 with gr.Blocks() as demo:
     gr.Markdown(MARKDOWN)
     with gr.Row():
@@ -187,8 +94,7 @@
             label="Mode",
             info="Select a mode to use. `box prompt` if you want to generate masks for "
                  "selected objects, `mask generation` if you want to generate masks "
-                 "for the whole image, and `video segmentation` if you want to track "
-                 "object on video.",
+                 "for the whole image.",
             interactive=True
         )
     with gr.Row():
@@ -196,22 +102,14 @@
            image_input_component = gr.Image(
                type='pil', label='Upload image', visible=False)
            image_prompter_input_component = ImagePrompter(
-                type='pil', label='Prompt image')
-            video_input_component = gr.Video(
-                label='Step 1: Upload video', visible=False)
-            video_prompter_input_component = ImagePrompter(
-                type='pil', label='Step 2: Prompt frame', visible=False)
-            submit_image_button_component = gr.Button(
+                type='pil', label='Image prompt')
+            submit_button_component = gr.Button(
                value='Submit', variant='primary')
-            submit_video_button_component = gr.Button(
-                value='Submit', variant='primary', visible=False)
        with gr.Column():
-            image_output_component = gr.Image(type='pil', label='Image output')
-            video_output_component = gr.Video(
-                label='Step 2: Video output', visible=False)
+            image_output_component = gr.Image(type='pil', label='Image Output')
    with gr.Row():
        gr.Examples(
-            fn=process_image,
+            fn=process,
            examples=EXAMPLES,
            inputs=[
                checkpoint_dropdown_component,
@@ -223,27 +121,23 @@
        run_on_click=True
    )
 
+
+    def on_mode_dropdown_change(text):
+        return [
+            gr.Image(visible=text == MASK_GENERATION_MODE),
+            ImagePrompter(visible=text == BOX_PROMPT_MODE)
+        ]
+
    mode_dropdown_component.change(
        on_mode_dropdown_change,
        inputs=[mode_dropdown_component],
        outputs=[
            image_input_component,
-            image_prompter_input_component,
-            video_input_component,
-            video_prompter_input_component,
-            submit_image_button_component,
-            submit_video_button_component,
-            image_output_component,
-            video_output_component
+            image_prompter_input_component
        ]
    )
-    video_input_component.change(
-        fn=on_video_input_change,
-        inputs=[video_input_component],
-        outputs=[video_prompter_input_component]
-    )
-    submit_image_button_component.click(
-        fn=process_image,
+    submit_button_component.click(
+        fn=process,
        inputs=[
            checkpoint_dropdown_component,
            mode_dropdown_component,
@@ -252,15 +146,5 @@ with gr.Blocks() as demo:
        ],
        outputs=[image_output_component]
    )
-    submit_video_button_component.click(
-        fn=process_video,
-        inputs=[
-            checkpoint_dropdown_component,
-            mode_dropdown_component,
-            video_input_component,
-            video_prompter_input_component,
-        ],
-        outputs=[video_output_component]
-    )
 
 demo.launch(debug=False, show_error=True, max_threads=1)
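Note on the reverted `process_video`: it split the clip into frames, built the `sam2_hiera_tiny` video predictor, and registered the frame-0 point prompt via `add_new_points`, but then returned the input video unchanged; the mask propagation and rendering step was never written, so the feature was still incomplete when it was reverted. A minimal sketch of that missing step, assuming the reverted function's local names (`model`, `inference_state`, `video_input`, `video_path`, `SCALE_FACTOR`, `MASK_ANNOTATOR`) and SAM 2's `propagate_in_video` generator:

# Hypothetical completion sketch -- not part of this commit. Assumes the
# reverted function's locals plus supervision's VideoSink and mask_to_xyxy.
import supervision as sv

video_info = sv.VideoInfo.from_video_path(video_input)
# frames were scaled by SCALE_FACTOR before init_state, so match that here
video_info.width = int(video_info.width * SCALE_FACTOR)
video_info.height = int(video_info.height * SCALE_FACTOR)

frames_generator = sv.get_video_frames_generator(video_input)
with sv.VideoSink(video_path, video_info=video_info) as sink:
    for (_, object_ids, mask_logits), frame in zip(
        model.propagate_in_video(inference_state), frames_generator
    ):
        frame = sv.scale_image(frame, SCALE_FACTOR)
        # (num_objects, 1, H, W) logits -> (num_objects, H, W) boolean masks
        masks = (mask_logits > 0.0).cpu().numpy()[:, 0, :, :]
        detections = sv.Detections(
            xyxy=sv.mask_to_xyxy(masks=masks),
            mask=masks
        )
        sink.write_frame(MASK_ANNOTATOR.annotate(frame, detections))
# ...and return video_path instead of echoing video_input back.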
requirements.txt CHANGED
@@ -1,4 +1,3 @@
-tqdm
 samv2
 gradio
 supervision
utils/models.py CHANGED
@@ -8,7 +8,7 @@ from sam2.sam2_image_predictor import SAM2ImagePredictor
 BOX_PROMPT_MODE = "box prompt"
 MASK_GENERATION_MODE = "mask generation"
 VIDEO_SEGMENTATION_MODE = "video segmentation"
-MODE_NAMES = [BOX_PROMPT_MODE, MASK_GENERATION_MODE, VIDEO_SEGMENTATION_MODE]
+MODE_NAMES = [BOX_PROMPT_MODE, MASK_GENERATION_MODE]
 
 CHECKPOINT_NAMES = ["tiny", "small", "base_plus", "large"]
 CHECKPOINTS = {
utils/video.py DELETED
@@ -1,14 +0,0 @@
-import os
-import uuid
-import datetime
-
-
-def create_directory(directory_path: str) -> None:
-    if not os.path.exists(directory_path):
-        os.makedirs(directory_path)
-
-
-def generate_unique_name():
-    current_datetime = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
-    unique_id = uuid.uuid4()
-    return f"{current_datetime}_{unique_id}"
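Aside on the deleted helper: `create_directory` duplicates the standard library and its check-then-create pattern is racy. Should `utils/video.py` return with the video feature, a one-line equivalent (using the helper's own parameter name):

import os

# race-free equivalent of the deleted create_directory helper
os.makedirs(directory_path, exist_ok=True)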