# Copyright (c) OpenMMLab. All rights reserved.
from typing import Dict, List, Optional, Tuple
import torch
import torch.nn as nn
from mmcv.cnn import ConvModule
from mmcv.ops import DeformConv2d
from mmengine.config import ConfigDict
from mmengine.model import BaseModule
from mmengine.structures import InstanceData
from torch import Tensor
from mmdet.registry import MODELS
from mmdet.utils import InstanceList, OptInstanceList, OptMultiConfig
from ..utils import filter_scores_and_topk, multi_apply
from .anchor_free_head import AnchorFreeHead
INF = 1e8
class FeatureAlign(BaseModule):
"""Feature Align Module.
    Feature Align Module is implemented based on DCN v1. It predicts the
    offsets of the deformable conv layer from the bbox (shape) prediction
    rather than from the feature map itself.
Args:
in_channels (int): Number of channels in the input feature map.
out_channels (int): Number of channels in the output feature map.
        kernel_size (int): Size of the convolution kernel. Defaults to 3.
        deform_groups (int): Group number of the deformable convolution.
            Defaults to 4.
init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
dict], optional): Initialization config dict.
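    Example:
        >>> # A minimal usage sketch; the channel sizes are illustrative and
        >>> # DeformConv2d may require a CUDA-enabled build of mmcv.
        >>> import torch
        >>> feat_align = FeatureAlign(in_channels=256, out_channels=256)
        >>> x = torch.rand(2, 256, 32, 32)
        >>> shape = torch.rand(2, 4, 32, 32)  # exp of bbox predictions
        >>> out = feat_align(x, shape)
        >>> assert out.shape == (2, 256, 32, 32)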
"""
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: int = 3,
deform_groups: int = 4,
init_cfg: OptMultiConfig = dict(
type='Normal',
layer='Conv2d',
std=0.1,
override=dict(type='Normal', name='conv_adaption', std=0.01))
) -> None:
super().__init__(init_cfg=init_cfg)
offset_channels = kernel_size * kernel_size * 2
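        # Predict one (x, y) offset per kernel sampling location and per
        # deform group from the 4-channel bbox shape tensor.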
self.conv_offset = nn.Conv2d(
4, deform_groups * offset_channels, 1, bias=False)
self.conv_adaption = DeformConv2d(
in_channels,
out_channels,
kernel_size=kernel_size,
padding=(kernel_size - 1) // 2,
deform_groups=deform_groups)
self.relu = nn.ReLU(inplace=True)
def forward(self, x: Tensor, shape: Tensor) -> Tensor:
"""Forward function of feature align module.
Args:
x (Tensor): Features from the upstream network.
shape (Tensor): Exponential of bbox predictions.
Returns:
x (Tensor): The aligned features.
"""
offset = self.conv_offset(shape)
x = self.relu(self.conv_adaption(x, offset))
return x
@MODELS.register_module()
class FoveaHead(AnchorFreeHead):
"""Detection Head of `FoveaBox: Beyond Anchor-based Object Detector.
<https://arxiv.org/abs/1904.03797>`_.
Args:
num_classes (int): Number of categories excluding the background
category.
in_channels (int): Number of channels in the input feature map.
        base_edge_list (list[int]): Base edge length of each pyramid level,
            used to normalize the bbox regression targets.
        scale_ranges (list[tuple]): Range of object scales (sqrt of box
            area) assigned to each pyramid level.
        sigma (float): Shrink factor that defines the fovea (positive)
            region inside each gt box. Defaults to 0.4.
        with_deform (bool): Whether to use deformable convolution in the
            classification branch. Defaults to False.
        deform_groups (int): Group number of the deformable convolution.
            Defaults to 4.
init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
dict], optional): Initialization config dict.
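    Example:
        >>> # A minimal usage sketch; the channel sizes and feature shapes
        >>> # below are illustrative assumptions, not values from a config.
        >>> import torch
        >>> self = FoveaHead(num_classes=4, in_channels=32, feat_channels=32)
        >>> feats = [torch.rand(1, 32, s, s) for s in [64, 32, 16, 8, 4]]
        >>> cls_scores, bbox_preds = self.forward(feats)
        >>> assert len(cls_scores) == len(bbox_preds) == 5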
"""
def __init__(self,
num_classes: int,
in_channels: int,
base_edge_list: List[int] = (16, 32, 64, 128, 256),
scale_ranges: List[tuple] = ((8, 32), (16, 64), (32, 128),
(64, 256), (128, 512)),
sigma: float = 0.4,
with_deform: bool = False,
deform_groups: int = 4,
init_cfg: OptMultiConfig = dict(
type='Normal',
layer='Conv2d',
std=0.01,
override=dict(
type='Normal',
name='conv_cls',
std=0.01,
bias_prob=0.01)),
**kwargs) -> None:
self.base_edge_list = base_edge_list
self.scale_ranges = scale_ranges
self.sigma = sigma
self.with_deform = with_deform
self.deform_groups = deform_groups
super().__init__(
num_classes=num_classes,
in_channels=in_channels,
init_cfg=init_cfg,
**kwargs)
def _init_layers(self) -> None:
"""Initialize layers of the head."""
# box branch
super()._init_reg_convs()
self.conv_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1)
# cls branch
if not self.with_deform:
super()._init_cls_convs()
self.conv_cls = nn.Conv2d(
self.feat_channels, self.cls_out_channels, 3, padding=1)
else:
self.cls_convs = nn.ModuleList()
self.cls_convs.append(
ConvModule(
self.feat_channels, (self.feat_channels * 4),
3,
stride=1,
padding=1,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg,
bias=self.norm_cfg is None))
self.cls_convs.append(
ConvModule((self.feat_channels * 4), (self.feat_channels * 4),
1,
stride=1,
padding=0,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg,
bias=self.norm_cfg is None))
self.feature_adaption = FeatureAlign(
self.feat_channels,
self.feat_channels,
kernel_size=3,
deform_groups=self.deform_groups)
self.conv_cls = nn.Conv2d(
int(self.feat_channels * 4),
self.cls_out_channels,
3,
padding=1)
def forward_single(self, x: Tensor) -> Tuple[Tensor, Tensor]:
"""Forward features of a single scale level.
Args:
            x (Tensor): FPN feature map of a single scale level.
        Returns:
            tuple: Classification scores and bbox predictions of the input
            feature map.
"""
cls_feat = x
reg_feat = x
for reg_layer in self.reg_convs:
reg_feat = reg_layer(reg_feat)
bbox_pred = self.conv_reg(reg_feat)
if self.with_deform:
cls_feat = self.feature_adaption(cls_feat, bbox_pred.exp())
for cls_layer in self.cls_convs:
cls_feat = cls_layer(cls_feat)
cls_score = self.conv_cls(cls_feat)
return cls_score, bbox_pred
def loss_by_feat(
self,
cls_scores: List[Tensor],
bbox_preds: List[Tensor],
batch_gt_instances: InstanceList,
batch_img_metas: List[dict],
batch_gt_instances_ignore: OptInstanceList = None
) -> Dict[str, Tensor]:
"""Calculate the loss based on the features extracted by the detection
head.
Args:
cls_scores (list[Tensor]): Box scores for each scale level,
each is a 4D-tensor, the channel number is
num_priors * num_classes.
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level, each is a 4D-tensor, the channel number is
num_priors * 4.
batch_gt_instances (list[:obj:`InstanceData`]): Batch of
gt_instance. It usually includes ``bboxes`` and ``labels``
attributes.
batch_img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
Batch of gt_instances_ignore. It includes ``bboxes`` attribute
data that is ignored during training and testing.
Defaults to None.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
assert len(cls_scores) == len(bbox_preds)
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
priors = self.prior_generator.grid_priors(
featmap_sizes,
dtype=bbox_preds[0].dtype,
device=bbox_preds[0].device)
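        # Each level's priors are the (x, y) image-plane centers of its
        # feature-map cells.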
num_imgs = cls_scores[0].size(0)
flatten_cls_scores = [
cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels)
for cls_score in cls_scores
]
flatten_bbox_preds = [
bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4)
for bbox_pred in bbox_preds
]
flatten_cls_scores = torch.cat(flatten_cls_scores)
flatten_bbox_preds = torch.cat(flatten_bbox_preds)
flatten_labels, flatten_bbox_targets = self.get_targets(
batch_gt_instances, featmap_sizes, priors)
# FG cat_id: [0, num_classes -1], BG cat_id: num_classes
pos_inds = ((flatten_labels >= 0)
& (flatten_labels < self.num_classes)).nonzero().view(-1)
num_pos = len(pos_inds)
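        # Average over num_pos + num_imgs so the classification loss stays
        # finite even when the batch contains no positive priors.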
loss_cls = self.loss_cls(
flatten_cls_scores, flatten_labels, avg_factor=num_pos + num_imgs)
if num_pos > 0:
pos_bbox_preds = flatten_bbox_preds[pos_inds]
pos_bbox_targets = flatten_bbox_targets[pos_inds]
pos_weights = pos_bbox_targets.new_ones(pos_bbox_targets.size())
loss_bbox = self.loss_bbox(
pos_bbox_preds,
pos_bbox_targets,
pos_weights,
avg_factor=num_pos)
else:
loss_bbox = torch.tensor(
0,
dtype=flatten_bbox_preds.dtype,
device=flatten_bbox_preds.device)
return dict(loss_cls=loss_cls, loss_bbox=loss_bbox)
def get_targets(
self, batch_gt_instances: InstanceList, featmap_sizes: List[tuple],
            priors_list: List[Tensor]) -> Tuple[Tensor, Tensor]:
        """Compute regression and classification targets for priors in
        multiple images.
Args:
batch_gt_instances (list[:obj:`InstanceData`]): Batch of
gt_instance. It usually includes ``bboxes`` and ``labels``
attributes.
featmap_sizes (list[tuple]): Size tuple of feature maps.
priors_list (list[Tensor]): Priors list of each fpn level, each has
shape (num_priors, 2).
Returns:
            tuple: Flattened targets of all levels and images.
            - flatten_labels (Tensor): Flattened labels.
            - flatten_bbox_targets (Tensor): Flattened bbox targets of
              shape (n, 4).
"""
label_list, bbox_target_list = multi_apply(
self._get_targets_single,
batch_gt_instances,
featmap_size_list=featmap_sizes,
priors_list=priors_list)
flatten_labels = [
torch.cat([
labels_level_img.flatten() for labels_level_img in labels_level
]) for labels_level in zip(*label_list)
]
flatten_bbox_targets = [
torch.cat([
bbox_targets_level_img.reshape(-1, 4)
for bbox_targets_level_img in bbox_targets_level
]) for bbox_targets_level in zip(*bbox_target_list)
]
flatten_labels = torch.cat(flatten_labels)
flatten_bbox_targets = torch.cat(flatten_bbox_targets)
return flatten_labels, flatten_bbox_targets
def _get_targets_single(self,
gt_instances: InstanceData,
featmap_size_list: List[tuple] = None,
priors_list: List[Tensor] = None) -> tuple:
"""Compute regression and classification targets for a single image.
Args:
gt_instances (:obj:`InstanceData`): Ground truth of instance
annotations. It usually includes ``bboxes`` and ``labels``
attributes.
featmap_size_list (list[tuple]): Size tuple of feature maps.
priors_list (list[Tensor]): Priors of each fpn level, each has
shape (num_priors, 2).
Returns:
tuple:
            - label_list (list[Tensor]): Labels of all priors in the image,
              one tensor per pyramid level.
            - box_target_list (list[Tensor]): BBox targets of all priors in
              the image, one tensor per pyramid level.
"""
gt_bboxes_raw = gt_instances.bboxes
gt_labels_raw = gt_instances.labels
gt_areas = torch.sqrt((gt_bboxes_raw[:, 2] - gt_bboxes_raw[:, 0]) *
(gt_bboxes_raw[:, 3] - gt_bboxes_raw[:, 1]))
label_list = []
bbox_target_list = []
# for each pyramid, find the cls and box target
for base_len, (lower_bound, upper_bound), stride, featmap_size, \
priors in zip(self.base_edge_list, self.scale_ranges,
self.strides, featmap_size_list, priors_list):
# FG cat_id: [0, num_classes -1], BG cat_id: num_classes
priors = priors.view(*featmap_size, 2)
x, y = priors[..., 0], priors[..., 1]
labels = gt_labels_raw.new_full(featmap_size, self.num_classes)
bbox_targets = gt_bboxes_raw.new_ones(featmap_size[0],
featmap_size[1], 4)
# scale assignment
hit_indices = ((gt_areas >= lower_bound) &
(gt_areas <= upper_bound)).nonzero().flatten()
if len(hit_indices) == 0:
label_list.append(labels)
bbox_target_list.append(torch.log(bbox_targets))
continue
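            # Assign larger objects first so that smaller objects, written
            # later, take precedence in overlapping fovea regions.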
_, hit_index_order = torch.sort(-gt_areas[hit_indices])
hit_indices = hit_indices[hit_index_order]
gt_bboxes = gt_bboxes_raw[hit_indices, :] / stride
gt_labels = gt_labels_raw[hit_indices]
half_w = 0.5 * (gt_bboxes[:, 2] - gt_bboxes[:, 0])
half_h = 0.5 * (gt_bboxes[:, 3] - gt_bboxes[:, 1])
# valid fovea area: left, right, top, down
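            # The fovea is the central part of the gt box, shrunk by the
            # factor sigma around the box center (in feature-map cells).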
pos_left = torch.ceil(
gt_bboxes[:, 0] + (1 - self.sigma) * half_w - 0.5).long(). \
clamp(0, featmap_size[1] - 1)
pos_right = torch.floor(
gt_bboxes[:, 0] + (1 + self.sigma) * half_w - 0.5).long(). \
clamp(0, featmap_size[1] - 1)
pos_top = torch.ceil(
gt_bboxes[:, 1] + (1 - self.sigma) * half_h - 0.5).long(). \
clamp(0, featmap_size[0] - 1)
pos_down = torch.floor(
gt_bboxes[:, 1] + (1 + self.sigma) * half_h - 0.5).long(). \
clamp(0, featmap_size[0] - 1)
for px1, py1, px2, py2, label, (gt_x1, gt_y1, gt_x2, gt_y2) in \
zip(pos_left, pos_top, pos_right, pos_down, gt_labels,
gt_bboxes_raw[hit_indices, :]):
labels[py1:py2 + 1, px1:px2 + 1] = label
bbox_targets[py1:py2 + 1, px1:px2 + 1, 0] = \
(x[py1:py2 + 1, px1:px2 + 1] - gt_x1) / base_len
bbox_targets[py1:py2 + 1, px1:px2 + 1, 1] = \
(y[py1:py2 + 1, px1:px2 + 1] - gt_y1) / base_len
bbox_targets[py1:py2 + 1, px1:px2 + 1, 2] = \
(gt_x2 - x[py1:py2 + 1, px1:px2 + 1]) / base_len
bbox_targets[py1:py2 + 1, px1:px2 + 1, 3] = \
(gt_y2 - y[py1:py2 + 1, px1:px2 + 1]) / base_len
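            # Targets are clamped to [1/16, 16] and regressed in log space
            # (see torch.log below), matching the exp() applied at decode.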
bbox_targets = bbox_targets.clamp(min=1. / 16, max=16.)
label_list.append(labels)
bbox_target_list.append(torch.log(bbox_targets))
return label_list, bbox_target_list
# Same as base_dense_head/_predict_by_feat_single except self._bbox_decode
def _predict_by_feat_single(self,
cls_score_list: List[Tensor],
bbox_pred_list: List[Tensor],
score_factor_list: List[Tensor],
mlvl_priors: List[Tensor],
img_meta: dict,
cfg: Optional[ConfigDict] = None,
rescale: bool = False,
with_nms: bool = True) -> InstanceData:
"""Transform a single image's features extracted from the head into
bbox results.
Args:
cls_score_list (list[Tensor]): Box scores from all scale
levels of a single image, each item has shape
(num_priors * num_classes, H, W).
bbox_pred_list (list[Tensor]): Box energies / deltas from
all scale levels of a single image, each item has shape
(num_priors * 4, H, W).
score_factor_list (list[Tensor]): Score factor from all scale
levels of a single image, each item has shape
(num_priors * 1, H, W).
mlvl_priors (list[Tensor]): Each element in the list is
the priors of a single level in feature pyramid, has shape
(num_priors, 2).
img_meta (dict): Image meta info.
cfg (ConfigDict, optional): Test / postprocessing
configuration, if None, test_cfg would be used.
Defaults to None.
rescale (bool): If True, return boxes in original image space.
Defaults to False.
with_nms (bool): If True, do nms before return boxes.
Defaults to True.
Returns:
:obj:`InstanceData`: Detection results of each image
after the post process.
Each item usually contains following keys.
- scores (Tensor): Classification scores, has a shape
(num_instance, )
- labels (Tensor): Labels of bboxes, has a shape
(num_instances, ).
- bboxes (Tensor): Has a shape (num_instances, 4),
the last dimension 4 arrange as (x1, y1, x2, y2).
"""
cfg = self.test_cfg if cfg is None else cfg
assert len(cls_score_list) == len(bbox_pred_list)
img_shape = img_meta['img_shape']
nms_pre = cfg.get('nms_pre', -1)
mlvl_bboxes = []
mlvl_scores = []
mlvl_labels = []
for level_idx, (cls_score, bbox_pred, stride, base_len, priors) in \
enumerate(zip(cls_score_list, bbox_pred_list, self.strides,
self.base_edge_list, mlvl_priors)):
assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
scores = cls_score.permute(1, 2, 0).reshape(
-1, self.cls_out_channels).sigmoid()
# After https://github.com/open-mmlab/mmdetection/pull/6268/,
# this operation keeps fewer bboxes under the same `nms_pre`.
# There is no difference in performance for most models. If you
# find a slight drop in performance, you can set a larger
# `nms_pre` than before.
results = filter_scores_and_topk(
scores, cfg.score_thr, nms_pre,
dict(bbox_pred=bbox_pred, priors=priors))
scores, labels, _, filtered_results = results
bbox_pred = filtered_results['bbox_pred']
priors = filtered_results['priors']
bboxes = self._bbox_decode(priors, bbox_pred, base_len, img_shape)
mlvl_bboxes.append(bboxes)
mlvl_scores.append(scores)
mlvl_labels.append(labels)
results = InstanceData()
results.bboxes = torch.cat(mlvl_bboxes)
results.scores = torch.cat(mlvl_scores)
results.labels = torch.cat(mlvl_labels)
return self._bbox_post_process(
results=results,
cfg=cfg,
rescale=rescale,
with_nms=with_nms,
img_meta=img_meta)
def _bbox_decode(self, priors: Tensor, bbox_pred: Tensor, base_len: int,
                     max_shape: tuple) -> Tensor:
        """Decode bbox predictions into bboxes.
        Args:
            priors (Tensor): Center priors of an image, has shape
                (num_instances, 2).
            bbox_pred (Tensor): Box energies / deltas for all instances,
                has shape (num_instances, 4).
            base_len (int): The base edge length of the current level.
            max_shape (tuple): The shape (h, w, ...) of the input image,
                used to clamp the decoded bboxes.
        Returns:
            Tensor: Decoded bboxes in (tl_x, tl_y, br_x, br_y) format. Has
            shape (num_instances, 4).
"""
bbox_pred = bbox_pred.exp()
y = priors[:, 1]
x = priors[:, 0]
x1 = (x - base_len * bbox_pred[:, 0]). \
clamp(min=0, max=max_shape[1] - 1)
y1 = (y - base_len * bbox_pred[:, 1]). \
clamp(min=0, max=max_shape[0] - 1)
x2 = (x + base_len * bbox_pred[:, 2]). \
clamp(min=0, max=max_shape[1] - 1)
y2 = (y + base_len * bbox_pred[:, 3]). \
clamp(min=0, max=max_shape[0] - 1)
decoded_bboxes = torch.stack([x1, y1, x2, y2], -1)
return decoded_bboxes