# Copyright (c) OpenMMLab. All rights reserved. import copy from typing import List, Optional, Tuple import torch import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import ConvModule from mmcv.ops import batched_nms from mmengine.config import ConfigDict from mmengine.structures import InstanceData from torch import Tensor from mmdet.registry import MODELS from mmdet.structures.bbox import (cat_boxes, empty_box_as, get_box_tensor, get_box_wh, scale_boxes) from mmdet.utils import InstanceList, MultiConfig, OptInstanceList from .anchor_head import AnchorHead @MODELS.register_module() class RPNHead(AnchorHead): """Implementation of RPN head. Args: in_channels (int): Number of channels in the input feature map. num_classes (int): Number of categories excluding the background category. Defaults to 1. init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or \ list[dict]): Initialization config dict. num_convs (int): Number of convolution layers in the head. Defaults to 1. """ # noqa: W605 def __init__(self, in_channels: int, num_classes: int = 1, init_cfg: MultiConfig = dict( type='Normal', layer='Conv2d', std=0.01), num_convs: int = 1, **kwargs) -> None: self.num_convs = num_convs assert num_classes == 1 super().__init__( num_classes=num_classes, in_channels=in_channels, init_cfg=init_cfg, **kwargs) def _init_layers(self) -> None: """Initialize layers of the head.""" if self.num_convs > 1: rpn_convs = [] for i in range(self.num_convs): if i == 0: in_channels = self.in_channels else: in_channels = self.feat_channels # use ``inplace=False`` to avoid error: one of the variables # needed for gradient computation has been modified by an # inplace operation. rpn_convs.append( ConvModule( in_channels, self.feat_channels, 3, padding=1, inplace=False)) self.rpn_conv = nn.Sequential(*rpn_convs) else: self.rpn_conv = nn.Conv2d( self.in_channels, self.feat_channels, 3, padding=1) self.rpn_cls = nn.Conv2d(self.feat_channels, self.num_base_priors * self.cls_out_channels, 1) reg_dim = self.bbox_coder.encode_size self.rpn_reg = nn.Conv2d(self.feat_channels, self.num_base_priors * reg_dim, 1) def forward_single(self, x: Tensor) -> Tuple[Tensor, Tensor]: """Forward feature of a single scale level. Args: x (Tensor): Features of a single scale level. Returns: tuple: cls_score (Tensor): Cls scores for a single scale level \ the channels number is num_base_priors * num_classes. bbox_pred (Tensor): Box energies / deltas for a single scale \ level, the channels number is num_base_priors * 4. """ x = self.rpn_conv(x) x = F.relu(x) rpn_cls_score = self.rpn_cls(x) rpn_bbox_pred = self.rpn_reg(x) return rpn_cls_score, rpn_bbox_pred def loss_by_feat(self, cls_scores: List[Tensor], bbox_preds: List[Tensor], batch_gt_instances: InstanceList, batch_img_metas: List[dict], batch_gt_instances_ignore: OptInstanceList = None) \ -> dict: """Calculate the loss based on the features extracted by the detection head. Args: cls_scores (list[Tensor]): Box scores for each scale level, has shape (N, num_anchors * num_classes, H, W). bbox_preds (list[Tensor]): Box energies / deltas for each scale level with shape (N, num_anchors * 4, H, W). batch_gt_instances (list[obj:InstanceData]): Batch of gt_instance. It usually includes ``bboxes`` and ``labels`` attributes. batch_img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. batch_gt_instances_ignore (list[obj:InstanceData], Optional): Batch of gt_instances_ignore. It includes ``bboxes`` attribute data that is ignored during training and testing. Returns: dict[str, Tensor]: A dictionary of loss components. """ losses = super().loss_by_feat( cls_scores, bbox_preds, batch_gt_instances, batch_img_metas, batch_gt_instances_ignore=batch_gt_instances_ignore) return dict( loss_rpn_cls=losses['loss_cls'], loss_rpn_bbox=losses['loss_bbox']) def _predict_by_feat_single(self, cls_score_list: List[Tensor], bbox_pred_list: List[Tensor], score_factor_list: List[Tensor], mlvl_priors: List[Tensor], img_meta: dict, cfg: ConfigDict, rescale: bool = False, with_nms: bool = True) -> InstanceData: """Transform a single image's features extracted from the head into bbox results. Args: cls_score_list (list[Tensor]): Box scores from all scale levels of a single image, each item has shape (num_priors * num_classes, H, W). bbox_pred_list (list[Tensor]): Box energies / deltas from all scale levels of a single image, each item has shape (num_priors * 4, H, W). score_factor_list (list[Tensor]): Be compatible with BaseDenseHead. Not used in RPNHead. mlvl_priors (list[Tensor]): Each element in the list is the priors of a single level in feature pyramid. In all anchor-based methods, it has shape (num_priors, 4). In all anchor-free methods, it has shape (num_priors, 2) when `with_stride=True`, otherwise it still has shape (num_priors, 4). img_meta (dict): Image meta info. cfg (ConfigDict, optional): Test / postprocessing configuration, if None, test_cfg would be used. rescale (bool): If True, return boxes in original image space. Defaults to False. Returns: :obj:`InstanceData`: Detection results of each image after the post process. Each item usually contains following keys. - scores (Tensor): Classification scores, has a shape (num_instance, ) - labels (Tensor): Labels of bboxes, has a shape (num_instances, ). - bboxes (Tensor): Has a shape (num_instances, 4), the last dimension 4 arrange as (x1, y1, x2, y2). """ cfg = self.test_cfg if cfg is None else cfg cfg = copy.deepcopy(cfg) img_shape = img_meta['img_shape'] nms_pre = cfg.get('nms_pre', -1) mlvl_bbox_preds = [] mlvl_valid_priors = [] mlvl_scores = [] level_ids = [] for level_idx, (cls_score, bbox_pred, priors) in \ enumerate(zip(cls_score_list, bbox_pred_list, mlvl_priors)): assert cls_score.size()[-2:] == bbox_pred.size()[-2:] reg_dim = self.bbox_coder.encode_size bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, reg_dim) cls_score = cls_score.permute(1, 2, 0).reshape(-1, self.cls_out_channels) if self.use_sigmoid_cls: scores = cls_score.sigmoid() else: # remind that we set FG labels to [0] since mmdet v2.0 # BG cat_id: 1 scores = cls_score.softmax(-1)[:, :-1] scores = torch.squeeze(scores) if 0 < nms_pre < scores.shape[0]: # sort is faster than topk # _, topk_inds = scores.topk(cfg.nms_pre) ranked_scores, rank_inds = scores.sort(descending=True) topk_inds = rank_inds[:nms_pre] scores = ranked_scores[:nms_pre] bbox_pred = bbox_pred[topk_inds, :] priors = priors[topk_inds] mlvl_bbox_preds.append(bbox_pred) mlvl_valid_priors.append(priors) mlvl_scores.append(scores) # use level id to implement the separate level nms level_ids.append( scores.new_full((scores.size(0), ), level_idx, dtype=torch.long)) bbox_pred = torch.cat(mlvl_bbox_preds) priors = cat_boxes(mlvl_valid_priors) bboxes = self.bbox_coder.decode(priors, bbox_pred, max_shape=img_shape) results = InstanceData() results.bboxes = bboxes results.scores = torch.cat(mlvl_scores) results.level_ids = torch.cat(level_ids) return self._bbox_post_process( results=results, cfg=cfg, rescale=rescale, img_meta=img_meta) def _bbox_post_process(self, results: InstanceData, cfg: ConfigDict, rescale: bool = False, with_nms: bool = True, img_meta: Optional[dict] = None) -> InstanceData: """bbox post-processing method. The boxes would be rescaled to the original image scale and do the nms operation. Args: results (:obj:`InstaceData`): Detection instance results, each item has shape (num_bboxes, ). cfg (ConfigDict): Test / postprocessing configuration. rescale (bool): If True, return boxes in original image space. Defaults to False. with_nms (bool): If True, do nms before return boxes. Default to True. img_meta (dict, optional): Image meta info. Defaults to None. Returns: :obj:`InstanceData`: Detection results of each image after the post process. Each item usually contains following keys. - scores (Tensor): Classification scores, has a shape (num_instance, ) - labels (Tensor): Labels of bboxes, has a shape (num_instances, ). - bboxes (Tensor): Has a shape (num_instances, 4), the last dimension 4 arrange as (x1, y1, x2, y2). """ assert with_nms, '`with_nms` must be True in RPNHead' if rescale: assert img_meta.get('scale_factor') is not None scale_factor = [1 / s for s in img_meta['scale_factor']] results.bboxes = scale_boxes(results.bboxes, scale_factor) # filter small size bboxes if cfg.get('min_bbox_size', -1) >= 0: w, h = get_box_wh(results.bboxes) valid_mask = (w > cfg.min_bbox_size) & (h > cfg.min_bbox_size) if not valid_mask.all(): results = results[valid_mask] if results.bboxes.numel() > 0: bboxes = get_box_tensor(results.bboxes) det_bboxes, keep_idxs = batched_nms(bboxes, results.scores, results.level_ids, cfg.nms) results = results[keep_idxs] # some nms would reweight the score, such as softnms results.scores = det_bboxes[:, -1] results = results[:cfg.max_per_img] # TODO: This would unreasonably show the 0th class label # in visualization results.labels = results.scores.new_zeros( len(results), dtype=torch.long) del results.level_ids else: # To avoid some potential error results_ = InstanceData() results_.bboxes = empty_box_as(results.bboxes) results_.scores = results.scores.new_zeros(0) results_.labels = results.scores.new_zeros(0) results = results_ return results