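# data_mining/tool/segmentor.py
# Web-page residue from the file-hosting view (kept for provenance, not part of the module):
#   "aikenml's picture" / "Upload folder using huggingface_hub" / 92bea3e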
import torch
import cv2
import numpy as np
from sam.segment_anything import sam_model_registry, SamPredictor, SamAutomaticMaskGenerator
class Segmentor:
    """Wrapper around a SAM model offering click- and box-prompted segmentation.

    One SAM model backs both an automatic mask generator
    (``everything_generator``) and an interactive predictor
    (``interactive_predictor``); the predictor is the generator's own
    predictor object, so both share the same image embedding state.
    """

    def __init__(self, sam_args):
        """Load the SAM checkpoint and build the generator and predictor.

        sam_args:
            model_type: key into ``sam_model_registry`` selecting the model
            sam_checkpoint: path of SAM checkpoint
            generator_args: keyword args forwarded to SamAutomaticMaskGenerator
            gpu_id: device the model is moved to
        """
        self.device = sam_args["gpu_id"]
        self.sam = sam_model_registry[sam_args["model_type"]](
            checkpoint=sam_args["sam_checkpoint"]
        )
        self.sam.to(device=self.device)
        self.everything_generator = SamAutomaticMaskGenerator(
            model=self.sam, **sam_args["generator_args"]
        )
        # Reuse the generator's predictor instead of building a second one.
        self.interactive_predictor = self.everything_generator.predictor
        # True once an image has been embedded through set_image(); nothing in
        # this class sets it back to False, so callers must clear it
        # themselves before moving on to a different frame.
        self.have_embedded = False

    @torch.no_grad()
    def set_image(self, image):
        """Embed ``image`` in the predictor unless an embedding is cached.

        The call is a no-op while ``have_embedded`` is True, so the embedding
        is computed only once per frame.
        """
        if not self.have_embedded:
            self.interactive_predictor.set_image(image)
            self.have_embedded = True

    @torch.no_grad()
    def interactive_predict(self, prompts, mode, multimask=True):
        """Run the predictor with the prompt combination selected by ``mode``.

        prompts: dict of prompt arrays; the keys read depend on ``mode``:
            'point'      -> 'point_coords', 'point_modes'
            'mask'       -> 'mask_prompt'
            'point_mask' -> 'point_coords', 'point_modes', 'mask_prompt'
        mode: one of 'point', 'mask', 'point_mask'
        multimask: forwarded to the predictor as ``multimask_output``

        return:
            (masks, scores, logits) exactly as returned by the predictor

        raises:
            AssertionError: if no image has been embedded via set_image()
            ValueError: if ``mode`` is not one of the supported values
        """
        assert self.have_embedded, 'image embedding for sam need be set before predict.'

        if mode == 'point':
            return self.interactive_predictor.predict(
                point_coords=prompts['point_coords'],
                point_labels=prompts['point_modes'],
                multimask_output=multimask,
            )
        if mode == 'mask':
            return self.interactive_predictor.predict(
                mask_input=prompts['mask_prompt'],
                multimask_output=multimask,
            )
        if mode == 'point_mask':
            return self.interactive_predictor.predict(
                point_coords=prompts['point_coords'],
                point_labels=prompts['point_modes'],
                mask_input=prompts['mask_prompt'],
                multimask_output=multimask,
            )
        # Fail loudly instead of hitting an UnboundLocalError further down.
        raise ValueError(
            f"unsupported mode {mode!r}; expected 'point', 'mask' or 'point_mask'"
        )

    @torch.no_grad()
    def segment_with_click(self, origin_frame, coords, modes, multimask=True):
        """Segment ``origin_frame`` from point prompts using two passes.

        The first pass uses the points alone; the logits of its best-scoring
        candidate are then fed back as a mask prompt together with the same
        points for a second pass.

        coords: point coordinates, passed to the predictor as ``point_coords``
        modes: per-point labels, passed to the predictor as ``point_labels``
        multimask: forwarded as ``multimask_output`` in both passes

        return:
            mask: the best-scoring mask of the second pass, cast to uint8
        """
        self.set_image(origin_frame)

        point_prompts = {
            'point_coords': coords,
            'point_modes': modes,
        }
        _, scores, logits = self.interactive_predict(point_prompts, 'point', multimask)
        best_logit = logits[np.argmax(scores), :, :]

        refine_prompts = {
            'point_coords': coords,
            'point_modes': modes,
            # Leading axis added so the 2-D logit map becomes a 1xHxW prompt.
            'mask_prompt': best_logit[None, :, :],
        }
        masks, scores, _ = self.interactive_predict(refine_prompts, 'point_mask', multimask)
        return masks[np.argmax(scores)].astype(np.uint8)

    @torch.no_grad()
    def segment_with_box(self, origin_frame, bbox, reset_image=False):
        """Segment ``origin_frame`` from a box prompt using two passes.

        The first pass uses the box alone; the logits of its best-scoring
        candidate are then fed back as a mask prompt together with the box.

        bbox: two corner points indexed as bbox[0][0], bbox[0][1], bbox[1][0],
            bbox[1][1] (presumably [[x0, y0], [x1, y1]] - confirm with caller)
        reset_image: if True, embed ``origin_frame`` unconditionally;
            otherwise go through set_image(), which reuses a cached embedding.
            Note the forced path does not update ``have_embedded``.

        return:
            single-element list holding the best-scoring mask of the second
            pass, as returned by the predictor (no dtype cast is applied)
        """
        if reset_image:
            self.interactive_predictor.set_image(origin_frame)
        else:
            self.set_image(origin_frame)

        box = np.array([bbox[0][0], bbox[0][1], bbox[1][0], bbox[1][1]])

        _, scores, logits = self.interactive_predictor.predict(
            point_coords=None,
            point_labels=None,
            box=box,
            multimask_output=True,
        )
        best_logit = logits[np.argmax(scores), :, :]

        masks, scores, _ = self.interactive_predictor.predict(
            point_coords=None,
            point_labels=None,
            # Same box with a leading axis (shape (1, 4)) for the refinement pass.
            box=box[None, :],
            mask_input=best_logit[None, :, :],
            multimask_output=True,
        )
        return [masks[np.argmax(scores)]]