# Copyright (c) Facebook, Inc. and its affiliates.
import itertools
import logging
import numpy as np
from collections import OrderedDict
from collections.abc import Mapping
from typing import Dict, List, Optional, Tuple, Union
import torch
from omegaconf import DictConfig, OmegaConf
from torch import Tensor, nn

from annotator.oneformer.detectron2.layers import ShapeSpec
from annotator.oneformer.detectron2.structures import BitMasks, Boxes, ImageList, Instances
from annotator.oneformer.detectron2.utils.events import get_event_storage

from .backbone import Backbone

logger = logging.getLogger(__name__)


def _to_container(cfg):
    """
    mmdet will assert the type of dict/list.
    So convert omegaconf objects to dict/list.
    """
    if isinstance(cfg, DictConfig):
        cfg = OmegaConf.to_container(cfg, resolve=True)
    from mmcv.utils import ConfigDict

    return ConfigDict(cfg)


class MMDetBackbone(Backbone):
    """
    Wrapper of mmdetection backbones to use in detectron2.

    mmdet backbones produce list/tuple of tensors, while detectron2 backbones
    produce a dict of tensors. This class wraps the given backbone to produce
    output in detectron2's convention, so it can be used in place of detectron2
    backbones.
    """

    def __init__(
        self,
        backbone: Union[nn.Module, Mapping],
        neck: Union[nn.Module, Mapping, None] = None,
        *,
        output_shapes: List[ShapeSpec],
        output_names: Optional[List[str]] = None,
    ):
        """
        Args:
            backbone: either a backbone module or an mmdet config dict that defines a
                backbone. The backbone takes a 4D image tensor and returns a
                sequence of tensors.
            neck: either a neck module or an mmdet config dict that defines a
                neck. The neck takes outputs of the backbone and returns a
                sequence of tensors. If None, no neck is used.
            output_shapes: shape for every output of the backbone (or neck, if given).
                stride and channels are often needed.
            output_names: names for every output of the backbone (or neck, if given).
                By default, will use "out0", "out1", ...
        """
        super().__init__()
        if isinstance(backbone, Mapping):
            from mmdet.models import build_backbone

            backbone = build_backbone(_to_container(backbone))
        self.backbone = backbone

        if isinstance(neck, Mapping):
            from mmdet.models import build_neck

            neck = build_neck(_to_container(neck))
        self.neck = neck

        # "Neck" weights, if any, are part of neck itself. This is the interface
        # of mmdet so we follow it. Reference:
        # https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/two_stage.py
        logger.info("Initializing mmdet backbone weights...")
        self.backbone.init_weights()
        # train() in mmdet modules is non-trivial, and has to be explicitly
        # called. Reference:
        # https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/backbones/resnet.py
        self.backbone.train()
        if self.neck is not None:
            logger.info("Initializing mmdet neck weights ...")
            if isinstance(self.neck, nn.Sequential):
                for m in self.neck:
                    m.init_weights()
            else:
                self.neck.init_weights()
            self.neck.train()

        self._output_shapes = output_shapes
        if not output_names:
            output_names = [f"out{i}" for i in range(len(output_shapes))]
        self._output_names = output_names

    def forward(self, x) -> Dict[str, Tensor]:
        outs = self.backbone(x)
        if self.neck is not None:
            outs = self.neck(outs)
        assert isinstance(
            outs, (list, tuple)
        ), "mmdet backbone should return a list/tuple of tensors!"
        if len(outs) != len(self._output_shapes):
            raise ValueError(
                "Length of output_shapes does not match outputs from the mmdet backbone: "
                f"{len(outs)} != {len(self._output_shapes)}"
            )
        return {k: v for k, v in zip(self._output_names, outs)}

    def output_shape(self) -> Dict[str, ShapeSpec]:
        return {k: v for k, v in zip(self._output_names, self._output_shapes)}
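
# Example usage (a minimal sketch, not executed on import; the ResNet-50 + FPN
# config values below are illustrative assumptions based on common mmdet configs,
# not values required by this wrapper):
#
#   backbone = MMDetBackbone(
#       backbone=dict(
#           type="ResNet",
#           depth=50,
#           num_stages=4,
#           out_indices=(0, 1, 2, 3),
#           norm_cfg=dict(type="BN", requires_grad=True),
#       ),
#       neck=dict(
#           type="FPN",
#           in_channels=[256, 512, 1024, 2048],
#           out_channels=256,
#           num_outs=5,
#       ),
#       output_shapes=[ShapeSpec(channels=256, stride=s) for s in (4, 8, 16, 32, 64)],
#       output_names=["p2", "p3", "p4", "p5", "p6"],
#   )
#
# Calling backbone(images) on a 4D NCHW tensor then returns a dict
# {"p2": Tensor, ..., "p6": Tensor}, the convention detectron2 components expect.
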
class MMDetDetector(nn.Module):
    """
    Wrapper of an mmdetection detector model, for detection and instance segmentation.
    Input/output formats of this class follow detectron2's convention, so an
    mmdetection model can be trained and evaluated in detectron2.
    """

    def __init__(
        self,
        detector: Union[nn.Module, Mapping],
        *,
        # Default is 32 regardless of model:
        # https://github.com/open-mmlab/mmdetection/tree/master/configs/_base_/datasets
        size_divisibility=32,
        pixel_mean: Tuple[float],
        pixel_std: Tuple[float],
    ):
        """
        Args:
            detector: an mmdet detector, or an mmdet config dict that defines a detector.
            size_divisibility: pad input images to a multiple of this number
            pixel_mean: per-channel mean to normalize input image
            pixel_std: per-channel stddev to normalize input image
        """
        super().__init__()
        if isinstance(detector, Mapping):
            from mmdet.models import build_detector

            detector = build_detector(_to_container(detector))
        self.detector = detector
        self.detector.init_weights()
        self.size_divisibility = size_divisibility

        self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
        self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)
        assert (
            self.pixel_mean.shape == self.pixel_std.shape
        ), f"{self.pixel_mean} and {self.pixel_std} have different shapes!"

    def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]):
        images = [x["image"].to(self.device) for x in batched_inputs]
        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
        images = ImageList.from_tensors(images, size_divisibility=self.size_divisibility).tensor
        metas = []
        # Either every input carries the original "height"/"width" (so outputs are
        # rescaled to the original resolution), or none does.
        rescale = {"height" in x for x in batched_inputs}
        if len(rescale) != 1:
            raise ValueError("Some inputs have original height/width, but some don't!")
        rescale = list(rescale)[0]
        output_shapes = []
        # Build the per-image "img_metas" dicts that mmdet models expect.
        for input in batched_inputs:
            meta = {}
            c, h, w = input["image"].shape
            meta["img_shape"] = meta["ori_shape"] = (h, w, c)
            if rescale:
                scale_factor = np.array(
                    [w / input["width"], h / input["height"]] * 2, dtype="float32"
                )
                ori_shape = (input["height"], input["width"])
                output_shapes.append(ori_shape)
                meta["ori_shape"] = ori_shape + (c,)
            else:
                scale_factor = 1.0
                output_shapes.append((h, w))
            meta["scale_factor"] = scale_factor
            meta["flip"] = False
            padh, padw = images.shape[-2:]  # NOTE: use padded shape
            meta["pad_shape"] = (padh, padw, c)
            metas.append(meta)

        if self.training:
            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
            if gt_instances[0].has("gt_masks"):
                from mmdet.core import PolygonMasks as mm_PolygonMasks, BitmapMasks as mm_BitMasks

                def convert_mask(m, shape):
                    # mmdet mask format
                    if isinstance(m, BitMasks):
                        return mm_BitMasks(m.tensor.cpu().numpy(), shape[0], shape[1])
                    else:
                        return mm_PolygonMasks(m.polygons, shape[0], shape[1])

                gt_masks = [convert_mask(x.gt_masks, x.image_size) for x in gt_instances]
                losses_and_metrics = self.detector.forward_train(
                    images,
                    metas,
                    [x.gt_boxes.tensor for x in gt_instances],
                    [x.gt_classes for x in gt_instances],
                    gt_masks=gt_masks,
                )
            else:
                losses_and_metrics = self.detector.forward_train(
                    images,
                    metas,
                    [x.gt_boxes.tensor for x in gt_instances],
                    [x.gt_classes for x in gt_instances],
                )
            return _parse_losses(losses_and_metrics)
        else:
            results = self.detector.simple_test(images, metas, rescale=rescale)
            results = [
                {"instances": _convert_mmdet_result(r, shape)}
                for r, shape in zip(results, output_shapes)
            ]
            return results

    @property
    def device(self):
        return self.pixel_mean.device
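
# Inference sketch (illustrative only; `detector_cfg` is a placeholder for a real
# mmdet detector config dict, and the BGR mean/std values are the usual assumptions
# for Caffe-style pretrained weights, not requirements of this wrapper):
#
#   model = MMDetDetector(
#       detector=detector_cfg,
#       pixel_mean=(103.530, 116.280, 123.675),
#       pixel_std=(1.0, 1.0, 1.0),
#   )
#   model.eval()
#   with torch.no_grad():
#       outputs = model([{"image": image_chw, "height": 480, "width": 640}])
#
# outputs[0]["instances"] is an Instances object with pred_boxes, scores,
# pred_classes, and (for mask models) pred_masks, rescaled to the original
# 480x640 resolution because "height"/"width" were provided.
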
# Reference: show_result() in
# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/base.py
def _convert_mmdet_result(result, shape: Tuple[int, int]) -> Instances:
    if isinstance(result, tuple):
        bbox_result, segm_result = result
        if isinstance(segm_result, tuple):
            segm_result = segm_result[0]
    else:
        bbox_result, segm_result = result, None

    bboxes = torch.from_numpy(np.vstack(bbox_result))  # Nx5
    bboxes, scores = bboxes[:, :4], bboxes[:, -1]
    labels = [
        torch.full((bbox.shape[0],), i, dtype=torch.int32) for i, bbox in enumerate(bbox_result)
    ]
    labels = torch.cat(labels)
    inst = Instances(shape)
    inst.pred_boxes = Boxes(bboxes)
    inst.scores = scores
    inst.pred_classes = labels

    if segm_result is not None and len(labels) > 0:
        segm_result = list(itertools.chain(*segm_result))
        segm_result = [torch.from_numpy(x) if isinstance(x, np.ndarray) else x for x in segm_result]
        segm_result = torch.stack(segm_result, dim=0)
        inst.pred_masks = segm_result
    return inst


# reference: https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/base.py
def _parse_losses(losses: Dict[str, Tensor]) -> Dict[str, Tensor]:
    log_vars = OrderedDict()
    for loss_name, loss_value in losses.items():
        if isinstance(loss_value, torch.Tensor):
            log_vars[loss_name] = loss_value.mean()
        elif isinstance(loss_value, list):
            log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value)
        else:
            raise TypeError(f"{loss_name} is not a tensor or list of tensors")

        if "loss" not in loss_name:
            # put metrics to storage; don't return them
            storage = get_event_storage()
            value = log_vars.pop(loss_name).cpu().item()
            storage.put_scalar(loss_name, value)

    return log_vars
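
if __name__ == "__main__":
    # A minimal, self-contained sanity check of _parse_losses (the loss names and
    # values below are hypothetical, purely for illustration). Keys containing
    # "loss" are averaged (or, for lists, summed over per-entry means) and
    # returned; any other key is logged to the EventStorage and dropped.
    from annotator.oneformer.detectron2.utils.events import EventStorage

    fake_losses = {
        "loss_cls": torch.tensor([0.5, 1.5]),  # mean -> 1.0
        "loss_bbox": [torch.tensor(0.2), torch.tensor(0.4)],  # sum of means -> 0.6
        "acc": torch.tensor(99.0),  # no "loss" in name -> logged, not returned
    }
    with EventStorage():
        parsed = _parse_losses(fake_losses)
    assert set(parsed) == {"loss_cls", "loss_bbox"}
    print({k: round(v.item(), 4) for k, v in parsed.items()})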