Spaces:
Runtime error
Runtime error
# Copyright (c) OpenMMLab. All rights reserved. | |
from typing import List, Tuple | |
import numpy as np | |
import torch | |
import torch.nn as nn | |
import torch.nn.functional as F | |
from mmcv.cnn import ConvModule, build_conv_layer, build_upsample_layer | |
from mmcv.ops.carafe import CARAFEPack | |
from mmengine.config import ConfigDict | |
from mmengine.model import BaseModule, ModuleList | |
from mmengine.structures import InstanceData | |
from torch import Tensor | |
from torch.nn.modules.utils import _pair | |
from mmdet.models.task_modules.samplers import SamplingResult | |
from mmdet.models.utils import empty_instances | |
from mmdet.registry import MODELS | |
from mmdet.structures.mask import mask_target | |
from mmdet.utils import ConfigType, InstanceList, OptConfigType, OptMultiConfig | |
# Size in bytes of one float32 value; used when estimating how much memory
# the pasted masks need.
BYTES_PER_FLOAT = 4

# Upper bound (in bytes) on the memory used per chunk when pasting masks on
# GPU: 2**30 bytes == 1 GiB.
# TODO: This memory limit may be too much or too little. It would be better to
# determine it based on available resources.
GPU_MEM_LIMIT = 2**30
class FCNMaskHead(BaseModule):
    """Fully convolutional mask head.

    The head stacks ``num_convs`` ``ConvModule`` layers, an optional
    upsampling layer and a final kernel-size-1 conv that outputs the mask
    logits.

    Args:
        num_convs (int): Number of ``ConvModule`` layers applied before the
            upsampling layer. Defaults to 4.
        roi_feat_size (int): Reserved and not used by this head; only stored
            as a pair. Defaults to 14.
        in_channels (int): Number of input channels of the first conv.
            Defaults to 256.
        conv_kernel_size (int): Kernel size of the stacked convs.
            Defaults to 3.
        conv_out_channels (int): Number of output channels of the stacked
            convs (and of the deconv upsampling layer). Defaults to 256.
        num_classes (int): Number of mask channels predicted when
            ``class_agnostic`` is False. Defaults to 80.
        class_agnostic (bool): If True, a single mask channel is predicted
            regardless of ``num_classes``. Defaults to False.
        upsample_cfg (dict): Config of the upsampling layer. ``type`` must
            be one of None, 'deconv', 'nearest', 'bilinear' or 'carafe';
            ``scale_factor`` is popped from the config and used as the
            upsampling ratio.
        conv_cfg (dict, optional): ``conv_cfg`` passed to every
            ``ConvModule``. Defaults to None.
        norm_cfg (dict, optional): ``norm_cfg`` passed to every
            ``ConvModule``. Defaults to None.
        predictor_cfg (dict): Config passed to ``build_conv_layer`` to
            create the logits conv. Defaults to ``dict(type='Conv')``.
        loss_mask (dict): Config of the mask loss, built with
            ``MODELS.build``.
        init_cfg (dict or list[dict], optional): Must be None; an assertion
            fails otherwise.
    """

    def __init__(self,
                 num_convs: int = 4,
                 roi_feat_size: int = 14,
                 in_channels: int = 256,
                 conv_kernel_size: int = 3,
                 conv_out_channels: int = 256,
                 num_classes: int = 80,
                 class_agnostic: bool = False,
                 upsample_cfg: ConfigType = dict(
                     type='deconv', scale_factor=2),
                 conv_cfg: OptConfigType = None,
                 norm_cfg: OptConfigType = None,
                 predictor_cfg: ConfigType = dict(type='Conv'),
                 loss_mask: ConfigType = dict(
                     type='CrossEntropyLoss', use_mask=True, loss_weight=1.0),
                 init_cfg: OptMultiConfig = None) -> None:
        assert init_cfg is None, 'To prevent abnormal initialization ' \
                                 'behavior, init_cfg is not allowed to be set'
        super().__init__(init_cfg=init_cfg)
        # Work on a copy so that popping ``scale_factor`` below never mutates
        # the (shared, default) config dict passed by the caller.
        self.upsample_cfg = upsample_cfg.copy()
        if self.upsample_cfg['type'] not in [
                None, 'deconv', 'nearest', 'bilinear', 'carafe'
        ]:
            raise ValueError(
                f'Invalid upsample method {self.upsample_cfg["type"]}, '
                'accepted methods are "deconv", "nearest", "bilinear", '
                '"carafe"')
        self.num_convs = num_convs
        # WARN: roi_feat_size is reserved and not used
        self.roi_feat_size = _pair(roi_feat_size)
        self.in_channels = in_channels
        self.conv_kernel_size = conv_kernel_size
        self.conv_out_channels = conv_out_channels
        self.upsample_method = self.upsample_cfg.get('type')
        self.scale_factor = self.upsample_cfg.pop('scale_factor', None)
        self.num_classes = num_classes
        self.class_agnostic = class_agnostic
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.predictor_cfg = predictor_cfg
        self.loss_mask = MODELS.build(loss_mask)

        # Stacked convs: only the first one consumes ``self.in_channels``.
        self.convs = ModuleList()
        for i in range(self.num_convs):
            in_channels = (
                self.in_channels if i == 0 else self.conv_out_channels)
            padding = (self.conv_kernel_size - 1) // 2
            self.convs.append(
                ConvModule(
                    in_channels,
                    self.conv_out_channels,
                    self.conv_kernel_size,
                    padding=padding,
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg))

        # When there are no convs the loop above never rebinds
        # ``in_channels``, so it still holds the constructor argument here.
        upsample_in_channels = (
            self.conv_out_channels if self.num_convs > 0 else in_channels)
        upsample_cfg_ = self.upsample_cfg.copy()
        if self.upsample_method is None:
            self.upsample = None
        elif self.upsample_method == 'deconv':
            upsample_cfg_.update(
                in_channels=upsample_in_channels,
                out_channels=self.conv_out_channels,
                kernel_size=self.scale_factor,
                stride=self.scale_factor)
            self.upsample = build_upsample_layer(upsample_cfg_)
        elif self.upsample_method == 'carafe':
            upsample_cfg_.update(
                channels=upsample_in_channels, scale_factor=self.scale_factor)
            self.upsample = build_upsample_layer(upsample_cfg_)
        else:
            # suppress warnings
            align_corners = (None
                             if self.upsample_method == 'nearest' else False)
            upsample_cfg_.update(
                scale_factor=self.scale_factor,
                mode=self.upsample_method,
                align_corners=align_corners)
            self.upsample = build_upsample_layer(upsample_cfg_)

        out_channels = 1 if self.class_agnostic else self.num_classes
        # Only the deconv layer changes the channel count; the other
        # upsampling methods keep ``upsample_in_channels``.
        logits_in_channel = (
            self.conv_out_channels
            if self.upsample_method == 'deconv' else upsample_in_channels)
        self.conv_logits = build_conv_layer(self.predictor_cfg,
                                            logits_in_channel, out_channels, 1)
        self.relu = nn.ReLU(inplace=True)
        self.debug_imgs = None

    def init_weights(self) -> None:
        """Initialize the weights of the upsample and logits layers."""
        super().init_weights()
        for m in [self.upsample, self.conv_logits]:
            if m is None:
                continue
            elif isinstance(m, CARAFEPack):
                m.init_weights()
            elif hasattr(m, 'weight') and hasattr(m, 'bias'):
                nn.init.kaiming_normal_(
                    m.weight, mode='fan_out', nonlinearity='relu')
                # A layer built with ``bias=False`` exposes ``bias`` as None;
                # there is nothing to initialize in that case.
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def forward(self, x: Tensor) -> Tensor:
        """Forward features from the upstream network.

        Args:
            x (Tensor): Extract mask RoI features.

        Returns:
            Tensor: Predicted foreground masks.
        """
        for conv in self.convs:
            x = conv(x)
        if self.upsample is not None:
            x = self.upsample(x)
            # ReLU is applied only after the deconv upsampling layer.
            if self.upsample_method == 'deconv':
                x = self.relu(x)
        mask_preds = self.conv_logits(x)
        return mask_preds

    def get_targets(self, sampling_results: List[SamplingResult],
                    batch_gt_instances: InstanceList,
                    rcnn_train_cfg: ConfigDict) -> Tensor:
        """Calculate the ground truth for all samples in a batch according to
        the sampling_results.

        Args:
            sampling_results (List[obj:SamplingResult]): Assign results of
                all images in a batch after sampling.
            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
                gt_instance. It usually includes ``bboxes``, ``labels``, and
                ``masks`` attributes.
            rcnn_train_cfg (obj:ConfigDict): `train_cfg` of RCNN.

        Returns:
            Tensor: Mask target of each positive proposals in the image.
        """
        pos_proposals = [res.pos_priors for res in sampling_results]
        pos_assigned_gt_inds = [
            res.pos_assigned_gt_inds for res in sampling_results
        ]
        gt_masks = [res.masks for res in batch_gt_instances]
        mask_targets = mask_target(pos_proposals, pos_assigned_gt_inds,
                                   gt_masks, rcnn_train_cfg)
        return mask_targets

    def loss_and_target(self, mask_preds: Tensor,
                        sampling_results: List[SamplingResult],
                        batch_gt_instances: InstanceList,
                        rcnn_train_cfg: ConfigDict) -> dict:
        """Calculate the loss based on the features extracted by the mask head.

        Args:
            mask_preds (Tensor): Predicted foreground masks, has shape
                (num_pos, num_classes, h, w).
            sampling_results (List[obj:SamplingResult]): Assign results of
                all images in a batch after sampling.
            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
                gt_instance. It usually includes ``bboxes``, ``labels``, and
                ``masks`` attributes.
            rcnn_train_cfg (obj:ConfigDict): `train_cfg` of RCNN.

        Returns:
            dict: A dictionary of loss and targets components. The key
            ``loss_mask`` holds a dict with the single entry ``loss_mask``
            and the key ``mask_targets`` holds the computed targets.
        """
        mask_targets = self.get_targets(
            sampling_results=sampling_results,
            batch_gt_instances=batch_gt_instances,
            rcnn_train_cfg=rcnn_train_cfg)

        pos_labels = torch.cat([res.pos_gt_labels for res in sampling_results])

        loss = dict()
        if mask_preds.size(0) == 0:
            # No positive sample: the sum of an empty tensor is zero but is
            # still computed from ``mask_preds``.
            loss_mask = mask_preds.sum()
        else:
            if self.class_agnostic:
                # All-zero labels are passed; presumably the loss uses them
                # to select the single mask channel.
                loss_mask = self.loss_mask(mask_preds, mask_targets,
                                           torch.zeros_like(pos_labels))
            else:
                loss_mask = self.loss_mask(mask_preds, mask_targets,
                                           pos_labels)
        loss['loss_mask'] = loss_mask
        # TODO: which algorithm requires mask_targets?
        return dict(loss_mask=loss, mask_targets=mask_targets)

    def predict_by_feat(self,
                        mask_preds: Tuple[Tensor],
                        results_list: List[InstanceData],
                        batch_img_metas: List[dict],
                        rcnn_test_cfg: ConfigDict,
                        rescale: bool = False,
                        activate_map: bool = False) -> InstanceList:
        """Transform a batch of output features extracted from the head into
        mask results.

        Args:
            mask_preds (tuple[Tensor]): Tuple of predicted foreground masks,
                each has shape (n, num_classes, h, w).
            results_list (list[:obj:`InstanceData`]): Detection results of
                each image.
            batch_img_metas (list[dict]): List of image information.
            rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of Bbox Head.
            rescale (bool): If True, return boxes in original image space.
                Defaults to False.
            activate_map (bool): Whether get results with augmentations test.
                If True, the `mask_preds` will not process with sigmoid.
                Defaults to False.

        Returns:
            list[:obj:`InstanceData`]: Detection results of each image
            after the post process. Each item usually contains following keys.

                - scores (Tensor): Classification scores, has a shape
                  (num_instance, )
                - labels (Tensor): Labels of bboxes, has a shape
                  (num_instances, ).
                - bboxes (Tensor): Has a shape (num_instances, 4),
                  the last dimension 4 arrange as (x1, y1, x2, y2).
                - masks (Tensor): Has a shape (num_instances, H, W).
        """
        assert len(mask_preds) == len(results_list) == len(batch_img_metas)

        for img_id, (img_meta, results) in enumerate(
                zip(batch_img_metas, results_list)):
            bboxes = results.bboxes
            if bboxes.shape[0] == 0:
                # No detection in this image: replace the entry with an
                # empty mask result.
                results_list[img_id] = empty_instances(
                    [img_meta],
                    bboxes.device,
                    task_type='mask',
                    instance_results=[results],
                    mask_thr_binary=rcnn_test_cfg.mask_thr_binary)[0]
            else:
                im_mask = self._predict_by_feat_single(
                    mask_preds=mask_preds[img_id],
                    bboxes=bboxes,
                    labels=results.labels,
                    img_meta=img_meta,
                    rcnn_test_cfg=rcnn_test_cfg,
                    rescale=rescale,
                    activate_map=activate_map)
                results.masks = im_mask
        return results_list

    def _predict_by_feat_single(self,
                                mask_preds: Tensor,
                                bboxes: Tensor,
                                labels: Tensor,
                                img_meta: dict,
                                rcnn_test_cfg: ConfigDict,
                                rescale: bool = False,
                                activate_map: bool = False) -> Tensor:
        """Get segmentation masks from mask_preds and bboxes.

        Args:
            mask_preds (Tensor): Predicted foreground masks, has shape
                (n, num_classes, h, w).
            bboxes (Tensor): Predicted bboxes, has shape (n, 4). When
                ``rescale`` is True they are divided by the scale factor
                IN PLACE.
            labels (Tensor): Labels of bboxes, has shape (n, )
            img_meta (dict): image information. The keys ``scale_factor``
                and ``ori_shape`` are read.
            rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of Bbox Head.
                ``mask_thr_binary`` is read: a value >= 0 yields boolean
                masks, a negative value yields uint8 masks scaled to
                [0, 255].
            rescale (bool): If True, the boxes are rescaled by the scale
                factor and masks are pasted on a canvas of ``ori_shape``.
                Otherwise the canvas is ``ori_shape`` multiplied by the
                scale factor. Defaults to False.
            activate_map (bool): Whether get results with augmentations test.
                If True, the `mask_preds` will not process with sigmoid.
                Defaults to False.

        Returns:
            Tensor: Encoded masks, has shape (n, img_h, img_w)

        Example:
            >>> from mmengine.config import Config
            >>> from mmdet.models.roi_heads.mask_heads.fcn_mask_head import *  # NOQA
            >>> N = 7  # N = number of extracted ROIs
            >>> C, H, W = 11, 32, 32
            >>> # Create example instance of FCN Mask Head.
            >>> self = FCNMaskHead(num_classes=C, num_convs=0)
            >>> inputs = torch.rand(N, self.in_channels, H, W)
            >>> mask_preds = self.forward(inputs)
            >>> # Each input is associated with some bounding box
            >>> bboxes = torch.Tensor([[1, 1, 42, 42 ]] * N)
            >>> labels = torch.randint(0, C, size=(N,))
            >>> rcnn_test_cfg = Config({'mask_thr_binary': 0, })
            >>> ori_shape = (H * 4, W * 4)
            >>> scale_factor = (1, 1)
            >>> rescale = False
            >>> img_meta = {'scale_factor': scale_factor,
            ...             'ori_shape': ori_shape}
            >>> # Encoded masks are a list for each category.
            >>> encoded_masks = self._predict_by_feat_single(
            ...     mask_preds, bboxes, labels,
            ...     img_meta, rcnn_test_cfg, rescale)
            >>> assert encoded_masks.size()[0] == N
            >>> assert encoded_masks.size()[1:] == ori_shape
        """
        # (w_scale, h_scale) repeated twice so it lines up with
        # (x1, y1, x2, y2) boxes.
        scale_factor = bboxes.new_tensor(img_meta['scale_factor']).repeat(
            (1, 2))
        img_h, img_w = img_meta['ori_shape'][:2]
        device = bboxes.device

        if not activate_map:
            mask_preds = mask_preds.sigmoid()
        else:
            # In AugTest, has been activated before
            mask_preds = bboxes.new_tensor(mask_preds)

        if rescale:  # in-placed rescale the bboxes
            bboxes /= scale_factor
        else:
            w_scale, h_scale = scale_factor[0, 0], scale_factor[0, 1]
            img_h = np.round(img_h * h_scale.item()).astype(np.int32)
            img_w = np.round(img_w * w_scale.item()).astype(np.int32)

        N = len(mask_preds)
        threshold = rcnn_test_cfg.mask_thr_binary
        mask_dtype = torch.bool if threshold >= 0 else torch.uint8

        if N == 0:
            # Without any RoI the chunk count below would be 0, which
            # ``torch.chunk`` rejects; return an empty mask stack instead.
            return torch.zeros(
                0, img_h, img_w, device=device, dtype=mask_dtype)

        # The actual implementation split the input into chunks,
        # and paste them chunk by chunk.
        if device.type == 'cpu':
            # CPU is most efficient when they are pasted one by one with
            # skip_empty=True, so that it performs minimal number of
            # operations.
            num_chunks = N
        else:
            # GPU benefits from parallelism for larger chunks,
            # but may have memory issue
            # the types of img_w and img_h are np.int32,
            # when the image resolution is large,
            # the calculation of num_chunks will overflow.
            # so we need to change the types of img_w and img_h to int.
            # See https://github.com/open-mmlab/mmdetection/pull/5191
            num_chunks = int(
                np.ceil(N * int(img_h) * int(img_w) * BYTES_PER_FLOAT /
                        GPU_MEM_LIMIT))
            assert (num_chunks <=
                    N), 'Default GPU_MEM_LIMIT is too small; try increasing it'
        chunks = torch.chunk(torch.arange(N, device=device), num_chunks)

        im_mask = torch.zeros(
            N, img_h, img_w, device=device, dtype=mask_dtype)

        if not self.class_agnostic:
            # Keep only the channel of the predicted class for every RoI.
            mask_preds = mask_preds[range(N), labels][:, None]

        for inds in chunks:
            masks_chunk, spatial_inds = _do_paste_mask(
                mask_preds[inds],
                bboxes[inds],
                img_h,
                img_w,
                skip_empty=device.type == 'cpu')

            if threshold >= 0:
                masks_chunk = (masks_chunk >= threshold).to(dtype=torch.bool)
            else:
                # for visualization and debugging
                masks_chunk = (masks_chunk * 255).to(dtype=torch.uint8)

            # ``spatial_inds`` is empty when the whole image was pasted and
            # a (row-slice, col-slice) pair when only a sub-region was.
            im_mask[(inds, ) + spatial_inds] = masks_chunk
        return im_mask
def _do_paste_mask(masks: Tensor, | |
boxes: Tensor, | |
img_h: int, | |
img_w: int, | |
skip_empty: bool = True) -> tuple: | |
"""Paste instance masks according to boxes. | |
This implementation is modified from | |
https://github.com/facebookresearch/detectron2/ | |
Args: | |
masks (Tensor): N, 1, H, W | |
boxes (Tensor): N, 4 | |
img_h (int): Height of the image to be pasted. | |
img_w (int): Width of the image to be pasted. | |
skip_empty (bool): Only paste masks within the region that | |
tightly bound all boxes, and returns the results this region only. | |
An important optimization for CPU. | |
Returns: | |
tuple: (Tensor, tuple). The first item is mask tensor, the second one | |
is the slice object. | |
If skip_empty == False, the whole image will be pasted. It will | |
return a mask of shape (N, img_h, img_w) and an empty tuple. | |
If skip_empty == True, only area around the mask will be pasted. | |
A mask of shape (N, h', w') and its start and end coordinates | |
in the original image will be returned. | |
""" | |
# On GPU, paste all masks together (up to chunk size) | |
# by using the entire image to sample the masks | |
# Compared to pasting them one by one, | |
# this has more operations but is faster on COCO-scale dataset. | |
device = masks.device | |
if skip_empty: | |
x0_int, y0_int = torch.clamp( | |
boxes.min(dim=0).values.floor()[:2] - 1, | |
min=0).to(dtype=torch.int32) | |
x1_int = torch.clamp( | |
boxes[:, 2].max().ceil() + 1, max=img_w).to(dtype=torch.int32) | |
y1_int = torch.clamp( | |
boxes[:, 3].max().ceil() + 1, max=img_h).to(dtype=torch.int32) | |
else: | |
x0_int, y0_int = 0, 0 | |
x1_int, y1_int = img_w, img_h | |
x0, y0, x1, y1 = torch.split(boxes, 1, dim=1) # each is Nx1 | |
N = masks.shape[0] | |
img_y = torch.arange(y0_int, y1_int, device=device).to(torch.float32) + 0.5 | |
img_x = torch.arange(x0_int, x1_int, device=device).to(torch.float32) + 0.5 | |
img_y = (img_y - y0) / (y1 - y0) * 2 - 1 | |
img_x = (img_x - x0) / (x1 - x0) * 2 - 1 | |
# img_x, img_y have shapes (N, w), (N, h) | |
# IsInf op is not supported with ONNX<=1.7.0 | |
if not torch.onnx.is_in_onnx_export(): | |
if torch.isinf(img_x).any(): | |
inds = torch.where(torch.isinf(img_x)) | |
img_x[inds] = 0 | |
if torch.isinf(img_y).any(): | |
inds = torch.where(torch.isinf(img_y)) | |
img_y[inds] = 0 | |
gx = img_x[:, None, :].expand(N, img_y.size(1), img_x.size(1)) | |
gy = img_y[:, :, None].expand(N, img_y.size(1), img_x.size(1)) | |
grid = torch.stack([gx, gy], dim=3) | |
img_masks = F.grid_sample( | |
masks.to(dtype=torch.float32), grid, align_corners=False) | |
if skip_empty: | |
return img_masks[:, 0], (slice(y0_int, y1_int), slice(x0_int, x1_int)) | |
else: | |
return img_masks[:, 0], () | |