SkalskiP committed
Commit aabd771
1 Parent(s): 16d828f

working on video inference

Files changed (4):
  1. app.py +133 -17
  2. requirements.txt +1 -0
  3. utils/models.py +1 -1
  4. utils/video.py +14 -0
app.py CHANGED
@@ -1,14 +1,19 @@
+import os
 from typing import Optional
 
+import cv2
 import gradio as gr
 import numpy as np
 import supervision as sv
 import torch
 from PIL import Image
+from tqdm import tqdm
 from gradio_image_prompter import ImagePrompter
 
 from utils.models import load_models, CHECKPOINT_NAMES, MODE_NAMES, \
-    MASK_GENERATION_MODE, BOX_PROMPT_MODE
+    MASK_GENERATION_MODE, BOX_PROMPT_MODE, VIDEO_SEGMENTATION_MODE
+from utils.video import create_directory, generate_unique_name
+from sam2.build_sam import build_sam2_video_predictor
 
 MARKDOWN = """
 # Segment Anything Model 2 🔥
@@ -31,6 +36,7 @@ Segment Anything Model 2 (SAM 2) is a foundation model designed to address promp
 visual segmentation in both images and videos. **Video segmentation will be available
 soon.**
 """
+
 EXAMPLES = [
     ["tiny", MASK_GENERATION_MODE, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
     ["tiny", MASK_GENERATION_MODE, "https://media.roboflow.com/notebooks/examples/dog-3.jpeg", None],
@@ -41,8 +47,37 @@ DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 MASK_ANNOTATOR = sv.MaskAnnotator(color_lookup=sv.ColorLookup.INDEX)
 IMAGE_PREDICTORS, MASK_GENERATORS = load_models(device=DEVICE)
 
+SCALE_FACTOR = 0.5
+TARGET_DIRECTORY = "tmp"
+# creating video results directory
+create_directory(directory_path=TARGET_DIRECTORY)
+
+
+def on_mode_dropdown_change(text):
+    return [
+        gr.Image(visible=text == MASK_GENERATION_MODE),
+        ImagePrompter(visible=text == BOX_PROMPT_MODE),
+        gr.Video(visible=text == VIDEO_SEGMENTATION_MODE),
+        ImagePrompter(visible=text == VIDEO_SEGMENTATION_MODE),
+        gr.Button(visible=text != VIDEO_SEGMENTATION_MODE),
+        gr.Button(visible=text == VIDEO_SEGMENTATION_MODE),
+        gr.Image(visible=text != VIDEO_SEGMENTATION_MODE),
+        gr.Video(visible=text == VIDEO_SEGMENTATION_MODE)
+    ]
+
 
+def on_video_input_change(video_input):
+    if not video_input:
+        return None
+    frames_generator = sv.get_video_frames_generator(video_input)
+    frame = next(frames_generator)
+    frame = sv.scale_image(frame, SCALE_FACTOR)
+    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+    frame = Image.fromarray(frame)
+    return {'image': frame, 'points': []}
+
+
-def process(
+def process_image(
     checkpoint_dropdown,
     mode_dropdown,
     image_input,
@@ -79,6 +114,64 @@ def process(
     return MASK_ANNOTATOR.annotate(image_input, detections)
 
 
+def process_video(
+    checkpoint_dropdown,
+    mode_dropdown,
+    video_input,
+    video_prompter_input,
+    progress=gr.Progress(track_tqdm=True)
+) -> str:
+    if mode_dropdown != VIDEO_SEGMENTATION_MODE:
+        return str(video_input)
+
+    name = generate_unique_name()
+    frame_directory_path = os.path.join(TARGET_DIRECTORY, name)
+    frames_sink = sv.ImageSink(
+        target_dir_path=frame_directory_path,
+        image_name_pattern="{:05d}.jpeg"
+    )
+
+    video_info = sv.VideoInfo.from_video_path(video_input)
+    frames_generator = sv.get_video_frames_generator(video_input)
+    with frames_sink:
+        for frame in tqdm(
+            frames_generator,
+            total=video_info.total_frames,
+            desc="splitting video into frames"
+        ):
+            frame = sv.scale_image(frame, SCALE_FACTOR)
+            frames_sink.save_image(frame)
+
+    model = build_sam2_video_predictor(
+        "sam2_hiera_t.yaml",
+        "checkpoints/sam2_hiera_tiny.pt",
+        device=DEVICE
+    )
+    inference_state = model.init_state(
+        video_path=frame_directory_path,
+        offload_video_to_cpu=DEVICE == torch.device('cpu'),
+        offload_state_to_cpu=DEVICE == torch.device('cpu'),
+    )
+
+    prompt = video_prompter_input["points"]
+    points = np.array([[x1, y1] for x1, y1, _, _, _, _ in prompt])
+    labels = np.ones(len(points))
+
+    _, object_ids, mask_logits = model.add_new_points(
+        inference_state=inference_state,
+        frame_idx=0,
+        obj_id=1,
+        points=points,
+        labels=labels,
+    )
+
+    del inference_state
+    del model
+
+    video_path = os.path.join(TARGET_DIRECTORY, f"{name}.mp4")
+    return str(video_input)
+
+
 with gr.Blocks() as demo:
     gr.Markdown(MARKDOWN)
     with gr.Row():
@@ -94,7 +187,8 @@ with gr.Blocks() as demo:
             label="Mode",
             info="Select a mode to use. `box prompt` if you want to generate masks for "
                  "selected objects, `mask generation` if you want to generate masks "
-                 "for the whole image.",
+                 "for the whole image, and `video segmentation` if you want to track "
+                 "object on video.",
             interactive=True
         )
     with gr.Row():
@@ -102,14 +196,22 @@ with gr.Blocks() as demo:
             image_input_component = gr.Image(
                 type='pil', label='Upload image', visible=False)
             image_prompter_input_component = ImagePrompter(
-                type='pil', label='Image prompt')
-            submit_button_component = gr.Button(
+                type='pil', label='Prompt image')
+            video_input_component = gr.Video(
+                label='Step 1: Upload video', visible=False)
+            video_prompter_input_component = ImagePrompter(
+                type='pil', label='Step 2: Prompt frame', visible=False)
+            submit_image_button_component = gr.Button(
                 value='Submit', variant='primary')
+            submit_video_button_component = gr.Button(
+                value='Submit', variant='primary', visible=False)
         with gr.Column():
-            image_output_component = gr.Image(type='pil', label='Image Output')
+            image_output_component = gr.Image(type='pil', label='Image output')
+            video_output_component = gr.Video(
+                label='Step 2: Video output', visible=False)
     with gr.Row():
         gr.Examples(
-            fn=process,
+            fn=process_image,
             examples=EXAMPLES,
             inputs=[
                 checkpoint_dropdown_component,
@@ -121,23 +223,27 @@ with gr.Blocks() as demo:
             run_on_click=True
         )
 
-
-    def on_mode_dropdown_change(text):
-        return [
-            gr.Image(visible=text == MASK_GENERATION_MODE),
-            ImagePrompter(visible=text == BOX_PROMPT_MODE)
-        ]
-
     mode_dropdown_component.change(
         on_mode_dropdown_change,
         inputs=[mode_dropdown_component],
        outputs=[
            image_input_component,
-            image_prompter_input_component
+            image_prompter_input_component,
+            video_input_component,
+            video_prompter_input_component,
+            submit_image_button_component,
+            submit_video_button_component,
+            image_output_component,
+            video_output_component
        ]
    )
+    video_input_component.change(
+        fn=on_video_input_change,
+        inputs=[video_input_component],
+        outputs=[video_prompter_input_component]
+    )
-    submit_button_component.click(
-        fn=process,
+    submit_image_button_component.click(
+        fn=process_image,
        inputs=[
            checkpoint_dropdown_component,
            mode_dropdown_component,
@@ -146,5 +252,15 @@ with gr.Blocks() as demo:
        ],
        outputs=[image_output_component]
    )
+    submit_video_button_component.click(
+        fn=process_video,
+        inputs=[
+            checkpoint_dropdown_component,
+            mode_dropdown_component,
+            video_input_component,
+            video_prompter_input_component,
+        ],
+        outputs=[video_output_component]
+    )
 
 demo.launch(debug=False, show_error=True, max_threads=1)
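
Note on process_video: as committed, it stops right after registering the first-frame prompt (each ImagePrompter row is a six-value tuple, of which only the leading x, y pair is used), then frees the predictor and returns the unmodified input path; video_path is computed but never written, which is consistent with the work-in-progress commit message. Below is a minimal sketch of the propagation and rendering step this appears to be building toward, slotting in before the del statements. It assumes SAM 2's propagate_in_video generator and supervision's VideoSink; the annotation details are an assumption, not part of this commit.

    # hypothetical continuation of process_video (not part of this commit):
    # propagate the first-frame prompt through the video and render the masks.
    # model, inference_state, video_info, video_path, video_input, SCALE_FACTOR,
    # and MASK_ANNOTATOR are the names defined in the code above.
    video_info.width = int(video_info.width * SCALE_FACTOR)    # frames were saved scaled down
    video_info.height = int(video_info.height * SCALE_FACTOR)

    frames_generator = sv.get_video_frames_generator(video_input)
    masks_generator = model.propagate_in_video(inference_state)
    with sv.VideoSink(video_path, video_info=video_info) as sink:
        for frame, (_, tracker_ids, mask_logits) in zip(frames_generator, masks_generator):
            frame = sv.scale_image(frame, SCALE_FACTOR)
            masks = (mask_logits > 0.0).cpu().numpy()  # (num_objects, 1, H, W) logits -> bool
            masks = np.squeeze(masks, axis=1)          # -> (num_objects, H, W)
            detections = sv.Detections(
                xyxy=sv.mask_to_xyxy(masks=masks),
                mask=masks,
                tracker_id=np.array(tracker_ids)
            )
            sink.write_frame(MASK_ANNOTATOR.annotate(frame.copy(), detections))
    return video_path  # instead of echoing the raw input back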
requirements.txt CHANGED
@@ -1,3 +1,4 @@
+tqdm
 samv2
 gradio
 supervision
utils/models.py CHANGED
@@ -8,7 +8,7 @@ from sam2.sam2_image_predictor import SAM2ImagePredictor
 BOX_PROMPT_MODE = "box prompt"
 MASK_GENERATION_MODE = "mask generation"
 VIDEO_SEGMENTATION_MODE = "video segmentation"
-MODE_NAMES = [BOX_PROMPT_MODE, MASK_GENERATION_MODE]
+MODE_NAMES = [BOX_PROMPT_MODE, MASK_GENERATION_MODE, VIDEO_SEGMENTATION_MODE]
 
 CHECKPOINT_NAMES = ["tiny", "small", "base_plus", "large"]
 CHECKPOINTS = {
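
Adding VIDEO_SEGMENTATION_MODE to MODE_NAMES is the one-line change that surfaces the new mode in the app's mode dropdown, since the dropdown takes this list as its choices. A sketch of the consuming side (the exact dropdown arguments in app.py sit outside this diff, so treat them as assumptions):

    import gradio as gr

    from utils.models import MODE_NAMES, BOX_PROMPT_MODE

    # the dropdown picks up "video segmentation" automatically from MODE_NAMES
    mode_dropdown_component = gr.Dropdown(
        choices=MODE_NAMES,
        value=BOX_PROMPT_MODE,
        label="Mode",
        interactive=True
    )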
utils/video.py ADDED
@@ -0,0 +1,14 @@
+import os
+import uuid
+import datetime
+
+
+def create_directory(directory_path: str) -> None:
+    if not os.path.exists(directory_path):
+        os.makedirs(directory_path)
+
+
+def generate_unique_name():
+    current_datetime = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
+    unique_id = uuid.uuid4()
+    return f"{current_datetime}_{unique_id}"