Spaces:
Runtime error
Runtime error
# Copyright (c) OpenMMLab. All rights reserved. | |
from typing import Dict, List, Optional, Sequence, Tuple | |
import torch | |
import torch.nn as nn | |
import torch.nn.functional as F | |
from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule | |
from torch import Tensor | |
from mmdet.registry import MODELS, TASK_UTILS | |
from mmdet.utils import ConfigType, InstanceList, MultiConfig, OptInstanceList | |
from ..losses import smooth_l1_loss | |
from ..task_modules.samplers import PseudoSampler | |
from ..utils import multi_apply | |
from .anchor_head import AnchorHead | |
# TODO: add loss evaluator for SSD | |
class SSDHead(AnchorHead):
    """Implementation of `SSD head <https://arxiv.org/abs/1512.02325>`_

    The head owns one classification branch and one regression branch per
    input feature level. Every level may use a different number of base
    priors, so ``num_base_priors`` is a list here rather than a single int.

    Args:
        num_classes (int): Number of categories excluding the background
            category.
        in_channels (Sequence[int]): Number of channels in the input feature
            map.
        stacked_convs (int): Number of conv layers in cls and reg tower.
            Defaults to 0.
        feat_channels (int): Number of hidden channels when stacked_convs
            > 0. Defaults to 256.
        use_depthwise (bool): Whether to use DepthwiseSeparableConv.
            Defaults to False.
        conv_cfg (:obj:`ConfigDict` or dict, Optional): Dictionary to construct
            and config conv layer. Defaults to None.
        norm_cfg (:obj:`ConfigDict` or dict, Optional): Dictionary to construct
            and config norm layer. Defaults to None.
        act_cfg (:obj:`ConfigDict` or dict, Optional): Dictionary to construct
            and config activation layer. Defaults to None.
        anchor_generator (:obj:`ConfigDict` or dict): Config dict for anchor
            generator.
        bbox_coder (:obj:`ConfigDict` or dict): Config of bounding box coder.
        reg_decoded_bbox (bool): If true, the regression loss would be
            applied directly on decoded bounding boxes, converting both
            the predicted boxes and regression targets to absolute
            coordinates format. Defaults to False. It should be `True` when
            using `IoULoss`, `GIoULoss`, or `DIoULoss` in the bbox head.
        train_cfg (:obj:`ConfigDict` or dict, Optional): Training config of
            anchor head. The loss reads ``neg_pos_ratio`` and
            ``smoothl1_beta`` from it; ``assigner`` (required) and
            ``sampler`` (optional) are used to build the target assigner
            and sampler. Defaults to None.
        test_cfg (:obj:`ConfigDict` or dict, Optional): Testing config of
            anchor head. Defaults to None.
        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
            dict], Optional): Initialization config dict.
    """  # noqa: W605

    def __init__(
        self,
        num_classes: int = 80,
        in_channels: Sequence[int] = (512, 1024, 512, 256, 256, 256),
        stacked_convs: int = 0,
        feat_channels: int = 256,
        use_depthwise: bool = False,
        conv_cfg: Optional[ConfigType] = None,
        norm_cfg: Optional[ConfigType] = None,
        act_cfg: Optional[ConfigType] = None,
        anchor_generator: ConfigType = dict(
            type='SSDAnchorGenerator',
            scale_major=False,
            input_size=300,
            strides=[8, 16, 32, 64, 100, 300],
            ratios=([2], [2, 3], [2, 3], [2, 3], [2], [2]),
            basesize_ratio_range=(0.1, 0.9)),
        bbox_coder: ConfigType = dict(
            type='DeltaXYWHBBoxCoder',
            clip_border=True,
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0],
        ),
        reg_decoded_bbox: bool = False,
        train_cfg: Optional[ConfigType] = None,
        test_cfg: Optional[ConfigType] = None,
        init_cfg: MultiConfig = dict(
            type='Xavier', layer='Conv2d', distribution='uniform', bias=0)
    ) -> None:
        # ``super(AnchorHead, self)`` deliberately starts the MRO lookup
        # *after* ``AnchorHead``, so ``AnchorHead.__init__`` is skipped and
        # every attribute the head needs is set up explicitly below.
        super(AnchorHead, self).__init__(init_cfg=init_cfg)
        self.num_classes = num_classes
        self.in_channels = in_channels
        self.stacked_convs = stacked_convs
        self.feat_channels = feat_channels
        self.use_depthwise = use_depthwise
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.act_cfg = act_cfg

        self.cls_out_channels = num_classes + 1  # add background class
        self.prior_generator = TASK_UTILS.build(anchor_generator)

        # Usually the numbers of anchors for each level are the same
        # except SSD detectors. So it is an int in the most dense
        # heads but a list of int in SSDHead
        self.num_base_priors = self.prior_generator.num_base_priors

        # Layers depend on ``num_base_priors``, so build them only now.
        self._init_layers()

        self.bbox_coder = TASK_UTILS.build(bbox_coder)
        self.reg_decoded_bbox = reg_decoded_bbox
        self.use_sigmoid_cls = False
        self.cls_focal_loss = False
        self.train_cfg = train_cfg
        self.test_cfg = test_cfg
        if self.train_cfg:
            self.assigner = TASK_UTILS.build(self.train_cfg['assigner'])
            if self.train_cfg.get('sampler', None) is not None:
                self.sampler = TASK_UTILS.build(
                    self.train_cfg['sampler'], default_args=dict(context=self))
            else:
                # No sampler configured: fall back to ``PseudoSampler``.
                self.sampler = PseudoSampler(context=self)

    def _init_layers(self) -> None:
        """Initialize layers of the head.

        Builds ``self.cls_convs`` and ``self.reg_convs``, each holding one
        ``nn.Sequential`` per input level. A branch consists of
        ``stacked_convs`` optional 3x3 conv modules, an optional depthwise
        3x3 conv module (when ``use_depthwise`` is set) and a final plain
        ``nn.Conv2d`` predictor.
        """
        self.cls_convs = nn.ModuleList()
        self.reg_convs = nn.ModuleList()
        # TODO: Use registry to choose ConvModule type
        conv = DepthwiseSeparableConvModule \
            if self.use_depthwise else ConvModule

        for channel, num_base_priors in zip(self.in_channels,
                                            self.num_base_priors):
            cls_layers = []
            reg_layers = []
            in_channel = channel
            # build stacked conv tower, not used in default ssd
            for _ in range(self.stacked_convs):
                cls_layers.append(
                    conv(
                        in_channel,
                        self.feat_channels,
                        3,
                        padding=1,
                        conv_cfg=self.conv_cfg,
                        norm_cfg=self.norm_cfg,
                        act_cfg=self.act_cfg))
                reg_layers.append(
                    conv(
                        in_channel,
                        self.feat_channels,
                        3,
                        padding=1,
                        conv_cfg=self.conv_cfg,
                        norm_cfg=self.norm_cfg,
                        act_cfg=self.act_cfg))
                # After the first tower layer all later layers consume
                # ``feat_channels`` channels.
                in_channel = self.feat_channels
            # SSD-Lite head
            if self.use_depthwise:
                # ``groups=in_channel`` makes this a depthwise 3x3 conv; the
                # 1x1 predictor appended below acts as the pointwise part.
                cls_layers.append(
                    ConvModule(
                        in_channel,
                        in_channel,
                        3,
                        padding=1,
                        groups=in_channel,
                        conv_cfg=self.conv_cfg,
                        norm_cfg=self.norm_cfg,
                        act_cfg=self.act_cfg))
                reg_layers.append(
                    ConvModule(
                        in_channel,
                        in_channel,
                        3,
                        padding=1,
                        groups=in_channel,
                        conv_cfg=self.conv_cfg,
                        norm_cfg=self.norm_cfg,
                        act_cfg=self.act_cfg))
            # Final predictors: 1x1 without padding for the depthwise
            # variant, 3x3 with padding 1 otherwise.
            cls_layers.append(
                nn.Conv2d(
                    in_channel,
                    num_base_priors * self.cls_out_channels,
                    kernel_size=1 if self.use_depthwise else 3,
                    padding=0 if self.use_depthwise else 1))
            reg_layers.append(
                nn.Conv2d(
                    in_channel,
                    num_base_priors * 4,
                    kernel_size=1 if self.use_depthwise else 3,
                    padding=0 if self.use_depthwise else 1))
            self.cls_convs.append(nn.Sequential(*cls_layers))
            self.reg_convs.append(nn.Sequential(*reg_layers))

    def forward(self, x: Tuple[Tensor]) -> Tuple[List[Tensor], List[Tensor]]:
        """Forward features from the upstream network.

        Args:
            x (tuple[Tensor]): Features from the upstream network, each is
                a 4D-tensor.

        Returns:
            tuple[list[Tensor], list[Tensor]]: A tuple of cls_scores list and
            bbox_preds list.

            - cls_scores (list[Tensor]): Classification scores for all scale
              levels, each is a 4D-tensor, the channels number is
              num_anchors * (num_classes + 1); the extra channel per anchor
              is the background class.
            - bbox_preds (list[Tensor]): Box energies / deltas for all scale
              levels, each is a 4D-tensor, the channels number is
              num_anchors * 4.
        """
        cls_scores = []
        bbox_preds = []
        # ``zip`` stops at the shortest input, so extra feature maps or
        # extra branches are silently ignored.
        for feat, reg_conv, cls_conv in zip(x, self.reg_convs, self.cls_convs):
            cls_scores.append(cls_conv(feat))
            bbox_preds.append(reg_conv(feat))
        return cls_scores, bbox_preds

    def loss_by_feat_single(self, cls_score: Tensor, bbox_pred: Tensor,
                            anchor: Tensor, labels: Tensor,
                            label_weights: Tensor, bbox_targets: Tensor,
                            bbox_weights: Tensor,
                            avg_factor: int) -> Tuple[Tensor, Tensor]:
        """Compute loss of a single image.

        The classification loss uses hard negative mining: all positive
        anchors are kept, and among the background anchors only the
        highest-loss ones are kept, at most ``train_cfg['neg_pos_ratio']``
        negatives per positive.

        Args:
            cls_score (Tensor): Box scores for each image.
                Has shape (num_total_anchors, num_classes + 1).
            bbox_pred (Tensor): Box energies / deltas for each image
                level with shape (num_total_anchors, 4).
            anchor (Tensor): Box reference for each scale level with shape
                (num_total_anchors, 4).
            labels (Tensor): Labels of each anchors with shape
                (num_total_anchors,).
            label_weights (Tensor): Label weights of each anchor with shape
                (num_total_anchors,)
            bbox_targets (Tensor): BBox regression targets of each anchor
                with shape (num_total_anchors, 4).
            bbox_weights (Tensor): BBox regression loss weights of each anchor
                with shape (num_total_anchors, 4).
            avg_factor (int): Average factor that is used to average
                the loss. When using sampling method, avg_factor is usually
                the sum of positive and negative priors. When using
                `PseudoSampler`, `avg_factor` is usually equal to the number
                of positive priors.

        Returns:
            Tuple[Tensor, Tensor]: A tuple of cls loss and bbox loss of one
            image. The cls loss is returned with shape (1,).
        """
        # Per-anchor loss so that negatives can be ranked individually.
        loss_cls_all = F.cross_entropy(
            cls_score, labels, reduction='none') * label_weights
        # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
        pos_inds = ((labels >= 0) & (labels < self.num_classes)).nonzero(
            as_tuple=False).reshape(-1)
        neg_inds = (labels == self.num_classes).nonzero(
            as_tuple=False).view(-1)

        num_pos_samples = pos_inds.size(0)
        # ``topk`` requires an integer ``k``. The ``int`` cast keeps a float
        # ``neg_pos_ratio`` (e.g. 3.0 or 1.5) from raising a TypeError; for
        # integer ratios the value is unchanged. The count is also capped
        # by the number of available negatives.
        num_neg_samples = min(
            int(self.train_cfg['neg_pos_ratio'] * num_pos_samples),
            neg_inds.size(0))
        topk_loss_cls_neg, _ = loss_cls_all[neg_inds].topk(num_neg_samples)
        loss_cls_pos = loss_cls_all[pos_inds].sum()
        loss_cls_neg = topk_loss_cls_neg.sum()
        loss_cls = (loss_cls_pos + loss_cls_neg) / avg_factor

        if self.reg_decoded_bbox:
            # When the regression loss (e.g. `IouLoss`, `GIouLoss`)
            # is applied directly on the decoded bounding boxes, it
            # decodes the already encoded coordinates to absolute format.
            bbox_pred = self.bbox_coder.decode(anchor, bbox_pred)

        loss_bbox = smooth_l1_loss(
            bbox_pred,
            bbox_targets,
            bbox_weights,
            beta=self.train_cfg['smoothl1_beta'],
            avg_factor=avg_factor)
        # ``[None]`` turns the scalar cls loss into a 1-element tensor.
        return loss_cls[None], loss_bbox

    def loss_by_feat(
        self,
        cls_scores: List[Tensor],
        bbox_preds: List[Tensor],
        batch_gt_instances: InstanceList,
        batch_img_metas: List[dict],
        batch_gt_instances_ignore: OptInstanceList = None
    ) -> Dict[str, List[Tensor]]:
        """Compute losses of the head.

        Args:
            cls_scores (list[Tensor]): Box scores for each scale level.
                Has shape (N, num_anchors * (num_classes + 1), H, W)
            bbox_preds (list[Tensor]): Box energies / deltas for each scale
                level with shape (N, num_anchors * 4, H, W)
            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
                gt_instance. It usually includes ``bboxes`` and ``labels``
                attributes.
            batch_img_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
                data that is ignored during training and testing.
                Defaults to None.

        Returns:
            dict[str, list[Tensor]]: A dictionary of loss components. the dict
            has components below:

            - loss_cls (list[Tensor]): Classification losses as returned by
              ``loss_by_feat_single``, which is applied per image.
            - loss_bbox (list[Tensor]): Regression losses as returned by
              ``loss_by_feat_single``, which is applied per image.
        """
        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
        assert len(featmap_sizes) == self.prior_generator.num_levels

        device = cls_scores[0].device

        anchor_list, valid_flag_list = self.get_anchors(
            featmap_sizes, batch_img_metas, device=device)
        cls_reg_targets = self.get_targets(
            anchor_list,
            valid_flag_list,
            batch_gt_instances,
            batch_img_metas,
            batch_gt_instances_ignore=batch_gt_instances_ignore,
            unmap_outputs=True)
        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
         avg_factor) = cls_reg_targets

        num_images = len(batch_img_metas)
        # Flatten every level from (N, A*C, H, W) to (N, H*W*A, C) and join
        # all levels along the anchor dimension.
        all_cls_scores = torch.cat([
            s.permute(0, 2, 3, 1).reshape(
                num_images, -1, self.cls_out_channels) for s in cls_scores
        ], 1)
        all_labels = torch.cat(labels_list, -1).view(num_images, -1)
        all_label_weights = torch.cat(label_weights_list,
                                      -1).view(num_images, -1)
        all_bbox_preds = torch.cat([
            b.permute(0, 2, 3, 1).reshape(num_images, -1, 4)
            for b in bbox_preds
        ], -2)
        all_bbox_targets = torch.cat(bbox_targets_list,
                                     -2).view(num_images, -1, 4)
        all_bbox_weights = torch.cat(bbox_weights_list,
                                     -2).view(num_images, -1, 4)

        # concat all level anchors to a single tensor
        all_anchors = [
            torch.cat(anchor_list[i]) for i in range(num_images)
        ]

        # Every positional argument has ``num_images`` entries along its
        # first dimension; ``avg_factor`` is shared by all calls.
        losses_cls, losses_bbox = multi_apply(
            self.loss_by_feat_single,
            all_cls_scores,
            all_bbox_preds,
            all_anchors,
            all_labels,
            all_label_weights,
            all_bbox_targets,
            all_bbox_weights,
            avg_factor=avg_factor)
        return dict(loss_cls=losses_cls, loss_bbox=losses_bbox)