# Copyright (c) OpenMMLab. All rights reserved.
import copy
import inspect
import math
from typing import List, Optional, Sequence, Tuple, Union

import cv2
import mmcv
import numpy as np
from mmcv.image.geometric import _scale_size
from mmcv.transforms import BaseTransform
from mmcv.transforms import Pad as MMCV_Pad
from mmcv.transforms import RandomFlip as MMCV_RandomFlip
from mmcv.transforms import Resize as MMCV_Resize
from mmcv.transforms.utils import avoid_cache_randomness, cache_randomness
from mmengine.dataset import BaseDataset
from mmengine.utils import is_str
from numpy import random

from mmdet.registry import TRANSFORMS
from mmdet.structures.bbox import HorizontalBoxes, autocast_box_type
from mmdet.structures.mask import BitmapMasks, PolygonMasks
from mmdet.utils import log_img_scale

try:
    from imagecorruptions import corrupt
except ImportError:
    corrupt = None

try:
    import albumentations
    from albumentations import Compose
except ImportError:
    albumentations = None
    Compose = None

Number = Union[int, float]


@TRANSFORMS.register_module()
class Resize(MMCV_Resize):
    """Resize images & bbox & seg.

    This transform resizes the input image according to ``scale`` or
    ``scale_factor``. Bboxes, masks, and seg map are then resized with the
    same scale factor. If ``scale`` and ``scale_factor`` are both set, it
    will use ``scale`` to resize.

    Required Keys:

    - img
    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
    - gt_masks (BitmapMasks | PolygonMasks) (optional)
    - gt_seg_map (np.uint8) (optional)

    Modified Keys:

    - img
    - img_shape
    - gt_bboxes
    - gt_masks
    - gt_seg_map

    Added Keys:

    - scale
    - scale_factor
    - keep_ratio
    - homography_matrix

    Args:
        scale (int or tuple): Image scales for resizing. Defaults to None.
        scale_factor (float or tuple[float]): Scale factors for resizing.
            Defaults to None.
        keep_ratio (bool): Whether to keep the aspect ratio when resizing the
            image. Defaults to False.
        clip_object_border (bool): Whether to clip the objects outside the
            border of the image. In some datasets like MOT17, the gt bboxes
            are allowed to cross the border of images. Therefore, we don't
            need to clip the gt bboxes in these cases. Defaults to True.
        backend (str): Image resize backend, choices are 'cv2' and 'pillow'.
            These two backends generate slightly different results.
            Defaults to 'cv2'.
        interpolation (str): Interpolation method, accepted values are
            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
            backend, "nearest", "bilinear" for 'pillow' backend.
            Defaults to 'bilinear'.
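
    Examples:
        A minimal usage sketch; the input array shape below is an
        illustrative assumption, not output of any particular dataset:

        >>> import numpy as np
        >>> from mmdet.datasets.transforms import Resize
        >>> results = dict(img=np.zeros((100, 200, 3), dtype=np.uint8))
        >>> transform = Resize(scale=(400, 200), keep_ratio=True)
        >>> results = transform(results)
        >>> results['img_shape']
        (200, 400)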
""" def _resize_masks(self, results: dict) -> None: """Resize masks with ``results['scale']``""" if results.get('gt_masks', None) is not None: if self.keep_ratio: results['gt_masks'] = results['gt_masks'].rescale( results['scale']) else: results['gt_masks'] = results['gt_masks'].resize( results['img_shape']) def _resize_bboxes(self, results: dict) -> None: """Resize bounding boxes with ``results['scale_factor']``.""" if results.get('gt_bboxes', None) is not None: results['gt_bboxes'].rescale_(results['scale_factor']) if self.clip_object_border: results['gt_bboxes'].clip_(results['img_shape']) def _resize_seg(self, results: dict) -> None: """Resize semantic segmentation map with ``results['scale']``.""" if results.get('gt_seg_map', None) is not None: if self.keep_ratio: gt_seg = mmcv.imrescale( results['gt_seg_map'], results['scale'], interpolation='nearest', backend=self.backend) else: gt_seg = mmcv.imresize( results['gt_seg_map'], results['scale'], interpolation='nearest', backend=self.backend) results['gt_seg_map'] = gt_seg def _record_homography_matrix(self, results: dict) -> None: """Record the homography matrix for the Resize.""" w_scale, h_scale = results['scale_factor'] homography_matrix = np.array( [[w_scale, 0, 0], [0, h_scale, 0], [0, 0, 1]], dtype=np.float32) if results.get('homography_matrix', None) is None: results['homography_matrix'] = homography_matrix else: results['homography_matrix'] = homography_matrix @ results[ 'homography_matrix'] @autocast_box_type() def transform(self, results: dict) -> dict: """Transform function to resize images, bounding boxes and semantic segmentation map. Args: results (dict): Result dict from loading pipeline. Returns: dict: Resized results, 'img', 'gt_bboxes', 'gt_seg_map', 'scale', 'scale_factor', 'height', 'width', and 'keep_ratio' keys are updated in result dict. """ if self.scale: results['scale'] = self.scale else: img_shape = results['img'].shape[:2] results['scale'] = _scale_size(img_shape[::-1], self.scale_factor) self._resize_img(results) self._resize_bboxes(results) self._resize_masks(results) self._resize_seg(results) self._record_homography_matrix(results) return results def __repr__(self) -> str: repr_str = self.__class__.__name__ repr_str += f'(scale={self.scale}, ' repr_str += f'scale_factor={self.scale_factor}, ' repr_str += f'keep_ratio={self.keep_ratio}, ' repr_str += f'clip_object_border={self.clip_object_border}), ' repr_str += f'backend={self.backend}), ' repr_str += f'interpolation={self.interpolation})' return repr_str @TRANSFORMS.register_module() class FixShapeResize(Resize): """Resize images & bbox & seg to the specified size. This transform resizes the input image according to ``width`` and ``height``. Bboxes, masks, and seg map are then resized with the same parameters. Required Keys: - img - gt_bboxes (BaseBoxes[torch.float32]) (optional) - gt_masks (BitmapMasks | PolygonMasks) (optional) - gt_seg_map (np.uint8) (optional) Modified Keys: - img - img_shape - gt_bboxes - gt_masks - gt_seg_map Added Keys: - scale - scale_factor - keep_ratio - homography_matrix Args: width (int): width for resizing. height (int): height for resizing. Defaults to None. pad_val (Number | dict[str, Number], optional): Padding value for if the pad_mode is "constant". If it is a single number, the value to pad the image is the number and to pad the semantic segmentation map is 255. If it is a dict, it should have the following keys: - img: The value to pad the image. - seg: The value to pad the semantic segmentation map. 
            Defaults to dict(img=0, seg=255).
        keep_ratio (bool): Whether to keep the aspect ratio when resizing the
            image. Defaults to False.
        clip_object_border (bool): Whether to clip the objects outside the
            border of the image. In some datasets like MOT17, the gt bboxes
            are allowed to cross the border of images. Therefore, we don't
            need to clip the gt bboxes in these cases. Defaults to True.
        backend (str): Image resize backend, choices are 'cv2' and 'pillow'.
            These two backends generate slightly different results.
            Defaults to 'cv2'.
        interpolation (str): Interpolation method, accepted values are
            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
            backend, "nearest", "bilinear" for 'pillow' backend.
            Defaults to 'bilinear'.
    """

    def __init__(self,
                 width: int,
                 height: int,
                 pad_val: Union[Number, dict] = dict(img=0, seg=255),
                 keep_ratio: bool = False,
                 clip_object_border: bool = True,
                 backend: str = 'cv2',
                 interpolation: str = 'bilinear') -> None:
        assert width is not None and height is not None, (
            '`width` and `height` can not be `None`')
        self.width = width
        self.height = height
        self.scale = (width, height)

        self.backend = backend
        self.interpolation = interpolation
        self.keep_ratio = keep_ratio
        self.clip_object_border = clip_object_border

        if keep_ratio is True:
            # pad to the fixed size when keep_ratio=True
            self.pad_transform = Pad(size=self.scale, pad_val=pad_val)

    @autocast_box_type()
    def transform(self, results: dict) -> dict:
        """Transform function to resize images, bounding boxes and semantic
        segmentation map.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Resized results, 'img', 'gt_bboxes', 'gt_seg_map',
            'scale', 'scale_factor', 'height', 'width', and 'keep_ratio'
            keys are updated in result dict.
        """
        img = results['img']
        h, w = img.shape[:2]
        if self.keep_ratio:
            scale_factor = min(self.width / w, self.height / h)
            results['scale_factor'] = (scale_factor, scale_factor)
            real_w, real_h = int(w * float(scale_factor) +
                                 0.5), int(h * float(scale_factor) + 0.5)
            img, scale_factor = mmcv.imrescale(
                results['img'], (real_w, real_h),
                interpolation=self.interpolation,
                return_scale=True,
                backend=self.backend)
            # the w_scale and h_scale has minor difference
            # a real fix should be done in the mmcv.imrescale in the future
            results['img'] = img
            results['img_shape'] = img.shape[:2]
            results['keep_ratio'] = self.keep_ratio
            results['scale'] = (real_w, real_h)
        else:
            results['scale'] = (self.width, self.height)
            results['scale_factor'] = (self.width / w, self.height / h)
            super()._resize_img(results)

        self._resize_bboxes(results)
        self._resize_masks(results)
        self._resize_seg(results)
        self._record_homography_matrix(results)
        if self.keep_ratio:
            self.pad_transform(results)
        return results

    def __repr__(self) -> str:
        repr_str = self.__class__.__name__
        repr_str += f'(width={self.width}, height={self.height}, '
        repr_str += f'keep_ratio={self.keep_ratio}, '
        repr_str += f'clip_object_border={self.clip_object_border}, '
        repr_str += f'backend={self.backend}, '
        repr_str += f'interpolation={self.interpolation})'
        return repr_str


@TRANSFORMS.register_module()
class RandomFlip(MMCV_RandomFlip):
    """Flip the image & bbox & mask & segmentation map. Added or Updated
    keys: flip, flip_direction, img, gt_bboxes, and gt_seg_map. There are 3
    flip modes:

    - ``prob`` is float, ``direction`` is string: the image will be
      ``direction``ly flipped with probability of ``prob``.
      E.g., ``prob=0.5``, ``direction='horizontal'``, then image will be
      horizontally flipped with probability of 0.5.
    - ``prob`` is float, ``direction`` is list of string: the image will be
      ``direction[i]``ly flipped with probability of
      ``prob/len(direction)``. E.g., ``prob=0.5``,
      ``direction=['horizontal', 'vertical']``, then image will be
      horizontally flipped with probability of 0.25, vertically with
      probability of 0.25.
    - ``prob`` is list of float, ``direction`` is list of string: given
      ``len(prob) == len(direction)``, the image will be ``direction[i]``ly
      flipped with probability of ``prob[i]``. E.g., ``prob=[0.3, 0.5]``,
      ``direction=['horizontal', 'vertical']``, then image will be
      horizontally flipped with probability of 0.3, vertically with
      probability of 0.5.

    Required Keys:

    - img
    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
    - gt_masks (BitmapMasks | PolygonMasks) (optional)
    - gt_seg_map (np.uint8) (optional)

    Modified Keys:

    - img
    - gt_bboxes
    - gt_masks
    - gt_seg_map

    Added Keys:

    - flip
    - flip_direction
    - homography_matrix

    Args:
        prob (float | list[float], optional): The flipping probability.
            Defaults to None.
        direction(str | list[str]): The flipping direction. Options are
            'horizontal', 'vertical' and 'diagonal'. If input is a list,
            the length must equal ``prob``. Each element in ``prob``
            indicates the flip probability of the corresponding direction.
            Defaults to 'horizontal'.
    """

    def _record_homography_matrix(self, results: dict) -> None:
        """Record the homography matrix for the RandomFlip."""
        cur_dir = results['flip_direction']
        h, w = results['img'].shape[:2]

        if cur_dir == 'horizontal':
            homography_matrix = np.array([[-1, 0, w], [0, 1, 0], [0, 0, 1]],
                                         dtype=np.float32)
        elif cur_dir == 'vertical':
            homography_matrix = np.array([[1, 0, 0], [0, -1, h], [0, 0, 1]],
                                         dtype=np.float32)
        elif cur_dir == 'diagonal':
            homography_matrix = np.array([[-1, 0, w], [0, -1, h], [0, 0, 1]],
                                         dtype=np.float32)
        else:
            homography_matrix = np.eye(3, dtype=np.float32)

        if results.get('homography_matrix', None) is None:
            results['homography_matrix'] = homography_matrix
        else:
            results['homography_matrix'] = homography_matrix @ results[
                'homography_matrix']

    @autocast_box_type()
    def _flip(self, results: dict) -> None:
        """Flip images, bounding boxes, and semantic segmentation map."""
        # flip image
        results['img'] = mmcv.imflip(
            results['img'], direction=results['flip_direction'])

        img_shape = results['img'].shape[:2]

        # flip bboxes
        if results.get('gt_bboxes', None) is not None:
            results['gt_bboxes'].flip_(img_shape, results['flip_direction'])

        # flip masks
        if results.get('gt_masks', None) is not None:
            results['gt_masks'] = results['gt_masks'].flip(
                results['flip_direction'])

        # flip segs
        if results.get('gt_seg_map', None) is not None:
            results['gt_seg_map'] = mmcv.imflip(
                results['gt_seg_map'], direction=results['flip_direction'])

        # record homography matrix for flip
        self._record_homography_matrix(results)


@TRANSFORMS.register_module()
class RandomShift(BaseTransform):
    """Shift the image and box given shift pixels and probability.

    Required Keys:

    - img
    - gt_bboxes (BaseBoxes[torch.float32])
    - gt_bboxes_labels (np.int64)
    - gt_ignore_flags (bool) (optional)

    Modified Keys:

    - img
    - gt_bboxes
    - gt_bboxes_labels
    - gt_ignore_flags (bool) (optional)

    Args:
        prob (float): Probability of shifts. Defaults to 0.5.
        max_shift_px (int): The max pixels for shifting. Defaults to 32.
        filter_thr_px (int): The width and height threshold for filtering.
            The bbox and the rest of the targets below the width and height
            threshold will be filtered. Defaults to 1.
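
    Examples:
        A minimal sketch; the array shape and box coordinates are
        illustrative assumptions:

        >>> import numpy as np
        >>> from mmdet.datasets.transforms import RandomShift
        >>> from mmdet.structures.bbox import HorizontalBoxes
        >>> results = dict(
        ...     img=np.zeros((100, 100, 3), dtype=np.uint8),
        ...     gt_bboxes=HorizontalBoxes([[10., 10., 50., 50.]]),
        ...     gt_bboxes_labels=np.array([0], dtype=np.int64))
        >>> results = RandomShift(prob=1.0, max_shift_px=8)(results)
        >>> results['img'].shape  # the image shape is preserved
        (100, 100, 3)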
""" def __init__(self, prob: float = 0.5, max_shift_px: int = 32, filter_thr_px: int = 1) -> None: assert 0 <= prob <= 1 assert max_shift_px >= 0 self.prob = prob self.max_shift_px = max_shift_px self.filter_thr_px = int(filter_thr_px) @cache_randomness def _random_prob(self) -> float: return random.uniform(0, 1) @autocast_box_type() def transform(self, results: dict) -> dict: """Transform function to random shift images, bounding boxes. Args: results (dict): Result dict from loading pipeline. Returns: dict: Shift results. """ if self._random_prob() < self.prob: img_shape = results['img'].shape[:2] random_shift_x = random.randint(-self.max_shift_px, self.max_shift_px) random_shift_y = random.randint(-self.max_shift_px, self.max_shift_px) new_x = max(0, random_shift_x) ori_x = max(0, -random_shift_x) new_y = max(0, random_shift_y) ori_y = max(0, -random_shift_y) # TODO: support mask and semantic segmentation maps. bboxes = results['gt_bboxes'].clone() bboxes.translate_([random_shift_x, random_shift_y]) # clip border bboxes.clip_(img_shape) # remove invalid bboxes valid_inds = (bboxes.widths > self.filter_thr_px).numpy() & ( bboxes.heights > self.filter_thr_px).numpy() # If the shift does not contain any gt-bbox area, skip this # image. if not valid_inds.any(): return results bboxes = bboxes[valid_inds] results['gt_bboxes'] = bboxes results['gt_bboxes_labels'] = results['gt_bboxes_labels'][ valid_inds] if results.get('gt_ignore_flags', None) is not None: results['gt_ignore_flags'] = \ results['gt_ignore_flags'][valid_inds] # shift img img = results['img'] new_img = np.zeros_like(img) img_h, img_w = img.shape[:2] new_h = img_h - np.abs(random_shift_y) new_w = img_w - np.abs(random_shift_x) new_img[new_y:new_y + new_h, new_x:new_x + new_w] \ = img[ori_y:ori_y + new_h, ori_x:ori_x + new_w] results['img'] = new_img return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(prob={self.prob}, ' repr_str += f'max_shift_px={self.max_shift_px}, ' repr_str += f'filter_thr_px={self.filter_thr_px})' return repr_str @TRANSFORMS.register_module() class Pad(MMCV_Pad): """Pad the image & segmentation map. There are three padding modes: (1) pad to a fixed size and (2) pad to the minimum size that is divisible by some number. and (3)pad to square. Also, pad to square and pad to the minimum size can be used as the same time. Required Keys: - img - gt_bboxes (BaseBoxes[torch.float32]) (optional) - gt_masks (BitmapMasks | PolygonMasks) (optional) - gt_seg_map (np.uint8) (optional) Modified Keys: - img - img_shape - gt_masks - gt_seg_map Added Keys: - pad_shape - pad_fixed_size - pad_size_divisor Args: size (tuple, optional): Fixed padding size. Expected padding shape (width, height). Defaults to None. size_divisor (int, optional): The divisor of padded size. Defaults to None. pad_to_square (bool): Whether to pad the image into a square. Currently only used for YOLOX. Defaults to False. pad_val (Number | dict[str, Number], optional) - Padding value for if the pad_mode is "constant". If it is a single number, the value to pad the image is the number and to pad the semantic segmentation map is 255. If it is a dict, it should have the following keys: - img: The value to pad the image. - seg: The value to pad the semantic segmentation map. Defaults to dict(img=0, seg=255). padding_mode (str): Type of padding. Should be: constant, edge, reflect or symmetric. Defaults to 'constant'. - constant: pads with a constant value, this value is specified with pad_val. 
            - edge: pads with the last value at the edge of the image.
            - reflect: pads with reflection of image without repeating the
              last value on the edge. For example, padding [1, 2, 3, 4]
              with 2 elements on both sides in reflect mode will result in
              [3, 2, 1, 2, 3, 4, 3, 2].
            - symmetric: pads with reflection of image repeating the last
              value on the edge. For example, padding [1, 2, 3, 4] with 2
              elements on both sides in symmetric mode will result in
              [2, 1, 1, 2, 3, 4, 4, 3].
    """

    def _pad_masks(self, results: dict) -> None:
        """Pad masks according to ``results['pad_shape']``."""
        if results.get('gt_masks', None) is not None:
            pad_val = self.pad_val.get('masks', 0)
            pad_shape = results['pad_shape'][:2]
            results['gt_masks'] = results['gt_masks'].pad(
                pad_shape, pad_val=pad_val)

    def transform(self, results: dict) -> dict:
        """Call function to pad images, masks, semantic segmentation maps.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Updated result dict.
        """
        self._pad_img(results)
        self._pad_seg(results)
        self._pad_masks(results)
        return results


@TRANSFORMS.register_module()
class RandomCrop(BaseTransform):
    """Random crop the image & bboxes & masks.

    The absolute ``crop_size`` is sampled based on ``crop_type`` and
    ``image_size``, then the cropped results are generated.

    Required Keys:

    - img
    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
    - gt_bboxes_labels (np.int64) (optional)
    - gt_masks (BitmapMasks | PolygonMasks) (optional)
    - gt_ignore_flags (bool) (optional)
    - gt_seg_map (np.uint8) (optional)

    Modified Keys:

    - img
    - img_shape
    - gt_bboxes (optional)
    - gt_bboxes_labels (optional)
    - gt_masks (optional)
    - gt_ignore_flags (optional)
    - gt_seg_map (optional)

    Added Keys:

    - homography_matrix

    Args:
        crop_size (tuple): The relative ratio or absolute pixels of
            (width, height).
        crop_type (str, optional): One of "relative_range", "relative",
            "absolute", "absolute_range". "relative" randomly crops
            (h * crop_size[0], w * crop_size[1]) part from an input of size
            (h, w). "relative_range" uniformly samples relative crop size
            from range [crop_size[0], 1] and [crop_size[1], 1] for height
            and width respectively. "absolute" crops from an input with
            absolute size (crop_size[0], crop_size[1]). "absolute_range"
            uniformly samples crop_h in range
            [crop_size[0], min(h, crop_size[1])] and crop_w in range
            [crop_size[0], min(w, crop_size[1])]. Defaults to "absolute".
        allow_negative_crop (bool, optional): Whether to allow a crop that
            does not contain any bbox area. Defaults to False.
        recompute_bbox (bool, optional): Whether to re-compute the boxes
            based on cropped instance masks. Defaults to False.
        bbox_clip_border (bool, optional): Whether clip the objects outside
            the border of the image. Defaults to True.

    Note:
        - If the image is smaller than the absolute crop size, return the
          original image.
        - The keys for bboxes, labels and masks must be aligned. That is,
          ``gt_bboxes`` corresponds to ``gt_labels`` and ``gt_masks``, and
          ``gt_bboxes_ignore`` corresponds to ``gt_labels_ignore`` and
          ``gt_masks_ignore``.
        - If the crop does not contain any gt-bbox region and
          ``allow_negative_crop`` is set to False, skip this image.
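
    Examples:
        A minimal sketch with an absolute crop; the input shape is an
        illustrative assumption:

        >>> import numpy as np
        >>> from mmdet.datasets.transforms import RandomCrop
        >>> results = dict(img=np.zeros((100, 100, 3), dtype=np.uint8))
        >>> transform = RandomCrop(crop_size=(32, 32),
        ...                        allow_negative_crop=True)
        >>> results = transform(results)
        >>> results['img'].shape
        (32, 32, 3)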
""" def __init__(self, crop_size: tuple, crop_type: str = 'absolute', allow_negative_crop: bool = False, recompute_bbox: bool = False, bbox_clip_border: bool = True) -> None: if crop_type not in [ 'relative_range', 'relative', 'absolute', 'absolute_range' ]: raise ValueError(f'Invalid crop_type {crop_type}.') if crop_type in ['absolute', 'absolute_range']: assert crop_size[0] > 0 and crop_size[1] > 0 assert isinstance(crop_size[0], int) and isinstance( crop_size[1], int) if crop_type == 'absolute_range': assert crop_size[0] <= crop_size[1] else: assert 0 < crop_size[0] <= 1 and 0 < crop_size[1] <= 1 self.crop_size = crop_size self.crop_type = crop_type self.allow_negative_crop = allow_negative_crop self.bbox_clip_border = bbox_clip_border self.recompute_bbox = recompute_bbox def _crop_data(self, results: dict, crop_size: Tuple[int, int], allow_negative_crop: bool) -> Union[dict, None]: """Function to randomly crop images, bounding boxes, masks, semantic segmentation maps. Args: results (dict): Result dict from loading pipeline. crop_size (Tuple[int, int]): Expected absolute size after cropping, (h, w). allow_negative_crop (bool): Whether to allow a crop that does not contain any bbox area. Returns: results (Union[dict, None]): Randomly cropped results, 'img_shape' key in result dict is updated according to crop size. None will be returned when there is no valid bbox after cropping. """ assert crop_size[0] > 0 and crop_size[1] > 0 img = results['img'] margin_h = max(img.shape[0] - crop_size[0], 0) margin_w = max(img.shape[1] - crop_size[1], 0) offset_h, offset_w = self._rand_offset((margin_h, margin_w)) crop_y1, crop_y2 = offset_h, offset_h + crop_size[0] crop_x1, crop_x2 = offset_w, offset_w + crop_size[1] # Record the homography matrix for the RandomCrop homography_matrix = np.array( [[1, 0, -offset_w], [0, 1, -offset_h], [0, 0, 1]], dtype=np.float32) if results.get('homography_matrix', None) is None: results['homography_matrix'] = homography_matrix else: results['homography_matrix'] = homography_matrix @ results[ 'homography_matrix'] # crop the image img = img[crop_y1:crop_y2, crop_x1:crop_x2, ...] img_shape = img.shape results['img'] = img results['img_shape'] = img_shape # crop bboxes accordingly and clip to the image boundary if results.get('gt_bboxes', None) is not None: bboxes = results['gt_bboxes'] bboxes.translate_([-offset_w, -offset_h]) if self.bbox_clip_border: bboxes.clip_(img_shape[:2]) valid_inds = bboxes.is_inside(img_shape[:2]).numpy() # If the crop does not contain any gt-bbox area and # allow_negative_crop is False, skip this image. if (not valid_inds.any() and not allow_negative_crop): return None results['gt_bboxes'] = bboxes[valid_inds] if results.get('gt_ignore_flags', None) is not None: results['gt_ignore_flags'] = \ results['gt_ignore_flags'][valid_inds] if results.get('gt_bboxes_labels', None) is not None: results['gt_bboxes_labels'] = \ results['gt_bboxes_labels'][valid_inds] if results.get('gt_masks', None) is not None: results['gt_masks'] = results['gt_masks'][ valid_inds.nonzero()[0]].crop( np.asarray([crop_x1, crop_y1, crop_x2, crop_y2])) if self.recompute_bbox: results['gt_bboxes'] = results['gt_masks'].get_bboxes( type(results['gt_bboxes'])) # crop semantic seg if results.get('gt_seg_map', None) is not None: results['gt_seg_map'] = results['gt_seg_map'][crop_y1:crop_y2, crop_x1:crop_x2] return results @cache_randomness def _rand_offset(self, margin: Tuple[int, int]) -> Tuple[int, int]: """Randomly generate crop offset. 
        Args:
            margin (Tuple[int, int]): The upper bound for the offset
                generated randomly.

        Returns:
            Tuple[int, int]: The random offset for the crop.
        """
        margin_h, margin_w = margin
        offset_h = np.random.randint(0, margin_h + 1)
        offset_w = np.random.randint(0, margin_w + 1)

        return offset_h, offset_w

    @cache_randomness
    def _get_crop_size(self, image_size: Tuple[int, int]) -> Tuple[int, int]:
        """Randomly generates the absolute crop size based on `crop_type`
        and `image_size`.

        Args:
            image_size (Tuple[int, int]): (h, w).

        Returns:
            crop_size (Tuple[int, int]): (crop_h, crop_w) in absolute
                pixels.
        """
        h, w = image_size
        if self.crop_type == 'absolute':
            return min(self.crop_size[1], h), min(self.crop_size[0], w)
        elif self.crop_type == 'absolute_range':
            crop_h = np.random.randint(
                min(h, self.crop_size[0]),
                min(h, self.crop_size[1]) + 1)
            crop_w = np.random.randint(
                min(w, self.crop_size[0]),
                min(w, self.crop_size[1]) + 1)
            return crop_h, crop_w
        elif self.crop_type == 'relative':
            crop_w, crop_h = self.crop_size
            return int(h * crop_h + 0.5), int(w * crop_w + 0.5)
        else:
            # 'relative_range'
            crop_size = np.asarray(self.crop_size, dtype=np.float32)
            crop_h, crop_w = crop_size + np.random.rand(2) * (1 - crop_size)
            return int(h * crop_h + 0.5), int(w * crop_w + 0.5)

    @autocast_box_type()
    def transform(self, results: dict) -> Union[dict, None]:
        """Transform function to randomly crop images, bounding boxes,
        masks, semantic segmentation maps.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            results (Union[dict, None]): Randomly cropped results,
                'img_shape' key in result dict is updated according to crop
                size. None will be returned when there is no valid bbox
                after cropping.
        """
        image_size = results['img'].shape[:2]
        crop_size = self._get_crop_size(image_size)
        results = self._crop_data(results, crop_size,
                                  self.allow_negative_crop)
        return results

    def __repr__(self) -> str:
        repr_str = self.__class__.__name__
        repr_str += f'(crop_size={self.crop_size}, '
        repr_str += f'crop_type={self.crop_type}, '
        repr_str += f'allow_negative_crop={self.allow_negative_crop}, '
        repr_str += f'recompute_bbox={self.recompute_bbox}, '
        repr_str += f'bbox_clip_border={self.bbox_clip_border})'
        return repr_str


@TRANSFORMS.register_module()
class SegRescale(BaseTransform):
    """Rescale semantic segmentation maps.

    This transform rescales the ``gt_seg_map`` according to
    ``scale_factor``.

    Required Keys:

    - gt_seg_map

    Modified Keys:

    - gt_seg_map

    Args:
        scale_factor (float): The scale factor of the final output.
            Defaults to 1.
        backend (str): Image rescale backend, choices are 'cv2' and
            'pillow'. These two backends generate slightly different
            results. Defaults to 'cv2'.
    """

    def __init__(self, scale_factor: float = 1, backend: str = 'cv2') -> None:
        self.scale_factor = scale_factor
        self.backend = backend

    def transform(self, results: dict) -> dict:
        """Transform function to scale the semantic segmentation map.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Result dict with semantic segmentation map scaled.
        """
        if self.scale_factor != 1:
            results['gt_seg_map'] = mmcv.imrescale(
                results['gt_seg_map'],
                self.scale_factor,
                interpolation='nearest',
                backend=self.backend)

        return results

    def __repr__(self) -> str:
        repr_str = self.__class__.__name__
        repr_str += f'(scale_factor={self.scale_factor}, '
        repr_str += f'backend={self.backend})'
        return repr_str


@TRANSFORMS.register_module()
class PhotoMetricDistortion(BaseTransform):
    """Apply photometric distortion to image sequentially, every
    transformation is applied with a probability of 0.5.
    Random contrast is applied either second (mode 0) or second to last
    (mode 1):

    1. random brightness
    2. random contrast (mode 0)
    3. convert color from BGR to HSV
    4. random saturation
    5. random hue
    6. convert color from HSV to BGR
    7. random contrast (mode 1)
    8. randomly swap channels

    Required Keys:

    - img (np.uint8)

    Modified Keys:

    - img (np.float32)

    Args:
        brightness_delta (int): delta of brightness.
        contrast_range (sequence): range of contrast.
        saturation_range (sequence): range of saturation.
        hue_delta (int): delta of hue.
    """

    def __init__(self,
                 brightness_delta: int = 32,
                 contrast_range: Sequence[Number] = (0.5, 1.5),
                 saturation_range: Sequence[Number] = (0.5, 1.5),
                 hue_delta: int = 18) -> None:
        self.brightness_delta = brightness_delta
        self.contrast_lower, self.contrast_upper = contrast_range
        self.saturation_lower, self.saturation_upper = saturation_range
        self.hue_delta = hue_delta

    @cache_randomness
    def _random_flags(self) -> Sequence[Number]:
        mode = random.randint(2)
        brightness_flag = random.randint(2)
        contrast_flag = random.randint(2)
        saturation_flag = random.randint(2)
        hue_flag = random.randint(2)
        swap_flag = random.randint(2)
        delta_value = random.uniform(-self.brightness_delta,
                                     self.brightness_delta)
        alpha_value = random.uniform(self.contrast_lower,
                                     self.contrast_upper)
        saturation_value = random.uniform(self.saturation_lower,
                                          self.saturation_upper)
        hue_value = random.uniform(-self.hue_delta, self.hue_delta)
        swap_value = random.permutation(3)

        return (mode, brightness_flag, contrast_flag, saturation_flag,
                hue_flag, swap_flag, delta_value, alpha_value,
                saturation_value, hue_value, swap_value)

    def transform(self, results: dict) -> dict:
        """Transform function to perform photometric distortion on images.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Result dict with images distorted.
        """
        assert 'img' in results, '`img` is not found in results'
        img = results['img']
        img = img.astype(np.float32)

        (mode, brightness_flag, contrast_flag, saturation_flag, hue_flag,
         swap_flag, delta_value, alpha_value, saturation_value, hue_value,
         swap_value) = self._random_flags()

        # random brightness
        if brightness_flag:
            img += delta_value

        # mode == 0 --> do random contrast first
        # mode == 1 --> do random contrast last
        if mode == 1:
            if contrast_flag:
                img *= alpha_value

        # convert color from BGR to HSV
        img = mmcv.bgr2hsv(img)

        # random saturation
        if saturation_flag:
            img[..., 1] *= saturation_value
            # For image(type=float32), after convert bgr to hsv by opencv,
            # valid saturation value range is [0, 1]
            if saturation_value > 1:
                img[..., 1] = img[..., 1].clip(0, 1)

        # random hue
        if hue_flag:
            img[..., 0] += hue_value
            img[..., 0][img[..., 0] > 360] -= 360
            img[..., 0][img[..., 0] < 0] += 360

        # convert color from HSV to BGR
        img = mmcv.hsv2bgr(img)

        # random contrast
        if mode == 0:
            if contrast_flag:
                img *= alpha_value

        # randomly swap channels
        if swap_flag:
            img = img[..., swap_value]

        results['img'] = img
        return results

    def __repr__(self) -> str:
        repr_str = self.__class__.__name__
        repr_str += f'(brightness_delta={self.brightness_delta}, '
        repr_str += 'contrast_range='
        repr_str += f'{(self.contrast_lower, self.contrast_upper)}, '
        repr_str += 'saturation_range='
        repr_str += f'{(self.saturation_lower, self.saturation_upper)}, '
        repr_str += f'hue_delta={self.hue_delta})'
        return repr_str


@TRANSFORMS.register_module()
class Expand(BaseTransform):
    """Random expand the image & bboxes & masks & segmentation map.

    Randomly place the original image on a canvas of ``ratio`` x original
    image size filled with mean values.
    The ratio is in the range of ratio_range.

    Required Keys:

    - img
    - img_shape
    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
    - gt_masks (BitmapMasks | PolygonMasks) (optional)
    - gt_seg_map (np.uint8) (optional)

    Modified Keys:

    - img
    - img_shape
    - gt_bboxes
    - gt_masks
    - gt_seg_map

    Args:
        mean (sequence): mean value of dataset.
        to_rgb (bool): if need to convert the order of mean to align with
            RGB.
        ratio_range (sequence): range of expand ratio.
        seg_ignore_label (int): label of the ignore region in the
            segmentation map.
        prob (float): probability of applying this transformation.
    """

    def __init__(self,
                 mean: Sequence[Number] = (0, 0, 0),
                 to_rgb: bool = True,
                 ratio_range: Sequence[Number] = (1, 4),
                 seg_ignore_label: Optional[int] = None,
                 prob: float = 0.5) -> None:
        self.to_rgb = to_rgb
        self.ratio_range = ratio_range
        if to_rgb:
            self.mean = mean[::-1]
        else:
            self.mean = mean
        self.min_ratio, self.max_ratio = ratio_range
        self.seg_ignore_label = seg_ignore_label
        self.prob = prob

    @cache_randomness
    def _random_prob(self) -> float:
        return random.uniform(0, 1)

    @cache_randomness
    def _random_ratio(self) -> float:
        return random.uniform(self.min_ratio, self.max_ratio)

    @cache_randomness
    def _random_left_top(self, ratio: float, h: int,
                         w: int) -> Tuple[int, int]:
        left = int(random.uniform(0, w * ratio - w))
        top = int(random.uniform(0, h * ratio - h))
        return left, top

    @autocast_box_type()
    def transform(self, results: dict) -> dict:
        """Transform function to expand images, bounding boxes, masks,
        segmentation map.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Result dict with images, bounding boxes, masks,
            segmentation map expanded.
        """
        if self._random_prob() > self.prob:
            return results
        assert 'img' in results, '`img` is not found in results'
        img = results['img']
        h, w, c = img.shape
        ratio = self._random_ratio()
        # speedup expand when meets large image
        if np.all(self.mean == self.mean[0]):
            expand_img = np.empty((int(h * ratio), int(w * ratio), c),
                                  img.dtype)
            expand_img.fill(self.mean[0])
        else:
            expand_img = np.full((int(h * ratio), int(w * ratio), c),
                                 self.mean,
                                 dtype=img.dtype)
        left, top = self._random_left_top(ratio, h, w)
        expand_img[top:top + h, left:left + w] = img
        results['img'] = expand_img
        results['img_shape'] = expand_img.shape[:2]

        # expand bboxes
        if results.get('gt_bboxes', None) is not None:
            results['gt_bboxes'].translate_([left, top])

        # expand masks
        if results.get('gt_masks', None) is not None:
            results['gt_masks'] = results['gt_masks'].expand(
                int(h * ratio), int(w * ratio), top, left)

        # expand segmentation map
        if results.get('gt_seg_map', None) is not None:
            gt_seg = results['gt_seg_map']
            expand_gt_seg = np.full((int(h * ratio), int(w * ratio)),
                                    self.seg_ignore_label,
                                    dtype=gt_seg.dtype)
            expand_gt_seg[top:top + h, left:left + w] = gt_seg
            results['gt_seg_map'] = expand_gt_seg

        return results

    def __repr__(self) -> str:
        repr_str = self.__class__.__name__
        repr_str += f'(mean={self.mean}, to_rgb={self.to_rgb}, '
        repr_str += f'ratio_range={self.ratio_range}, '
        repr_str += f'seg_ignore_label={self.seg_ignore_label}, '
        repr_str += f'prob={self.prob})'
        return repr_str


@TRANSFORMS.register_module()
class MinIoURandomCrop(BaseTransform):
    """Random crop the image & bboxes & masks & segmentation map. The
    cropped patches must satisfy a minimum IoU requirement with the original
    bboxes; the IoU threshold is randomly selected from ``min_ious``.
    Required Keys:

    - img
    - img_shape
    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
    - gt_bboxes_labels (np.int64) (optional)
    - gt_masks (BitmapMasks | PolygonMasks) (optional)
    - gt_ignore_flags (bool) (optional)
    - gt_seg_map (np.uint8) (optional)

    Modified Keys:

    - img
    - img_shape
    - gt_bboxes
    - gt_bboxes_labels
    - gt_masks
    - gt_ignore_flags
    - gt_seg_map

    Args:
        min_ious (Sequence[float]): minimum IoU threshold for all
            intersections with bounding boxes.
        min_crop_size (float): minimum crop's size (i.e. h,w := a*h, a*w,
            where a >= min_crop_size).
        bbox_clip_border (bool, optional): Whether clip the objects outside
            the border of the image. Defaults to True.
    """

    def __init__(self,
                 min_ious: Sequence[float] = (0.1, 0.3, 0.5, 0.7, 0.9),
                 min_crop_size: float = 0.3,
                 bbox_clip_border: bool = True) -> None:
        self.min_ious = min_ious
        self.sample_mode = (1, *min_ious, 0)
        self.min_crop_size = min_crop_size
        self.bbox_clip_border = bbox_clip_border

    @cache_randomness
    def _random_mode(self) -> Number:
        return random.choice(self.sample_mode)

    @autocast_box_type()
    def transform(self, results: dict) -> dict:
        """Transform function to crop images and bounding boxes with minimum
        IoU constraint.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Result dict with images and bounding boxes cropped, \
            'img_shape' key is updated.
        """
        assert 'img' in results, '`img` is not found in results'
        assert 'gt_bboxes' in results, '`gt_bboxes` is not found in results'
        img = results['img']
        boxes = results['gt_bboxes']
        h, w, c = img.shape
        while True:
            mode = self._random_mode()
            self.mode = mode
            if mode == 1:
                return results

            min_iou = self.mode
            for i in range(50):
                new_w = random.uniform(self.min_crop_size * w, w)
                new_h = random.uniform(self.min_crop_size * h, h)

                # h / w in [0.5, 2]
                if new_h / new_w < 0.5 or new_h / new_w > 2:
                    continue

                left = random.uniform(w - new_w)
                top = random.uniform(h - new_h)

                patch = np.array(
                    (int(left), int(top), int(left + new_w),
                     int(top + new_h)))
                # Line or point crop is not allowed
                if patch[2] == patch[0] or patch[3] == patch[1]:
                    continue
                overlaps = boxes.overlaps(
                    HorizontalBoxes(patch.reshape(-1, 4).astype(np.float32)),
                    boxes).numpy().reshape(-1)
                if len(overlaps) > 0 and overlaps.min() < min_iou:
                    continue

                # center of boxes should be inside the cropped img
                # only adjust boxes and instance masks when the gt is not
                # empty
                if len(overlaps) > 0:
                    # adjust boxes
                    def is_center_of_bboxes_in_patch(boxes, patch):
                        centers = boxes.centers.numpy()
                        mask = ((centers[:, 0] > patch[0]) *
                                (centers[:, 1] > patch[1]) *
                                (centers[:, 0] < patch[2]) *
                                (centers[:, 1] < patch[3]))
                        return mask

                    mask = is_center_of_bboxes_in_patch(boxes, patch)
                    if not mask.any():
                        continue
                    if results.get('gt_bboxes', None) is not None:
                        boxes = results['gt_bboxes']
                        mask = is_center_of_bboxes_in_patch(boxes, patch)
                        boxes = boxes[mask]
                        boxes.translate_([-patch[0], -patch[1]])
                        if self.bbox_clip_border:
                            boxes.clip_(
                                [patch[3] - patch[1], patch[2] - patch[0]])
                        results['gt_bboxes'] = boxes

                        # ignore_flags
                        if results.get('gt_ignore_flags', None) is not None:
                            results['gt_ignore_flags'] = \
                                results['gt_ignore_flags'][mask]

                        # labels
                        if results.get('gt_bboxes_labels', None) is not None:
                            results['gt_bboxes_labels'] = results[
                                'gt_bboxes_labels'][mask]

                        # mask fields
                        if results.get('gt_masks', None) is not None:
                            results['gt_masks'] = results['gt_masks'][
                                mask.nonzero()[0]].crop(patch)

                # adjust the img no matter whether the gt is empty before
                # crop
                img = img[patch[1]:patch[3], patch[0]:patch[2]]
                results['img'] = img
                results['img_shape'] = img.shape[:2]

                # seg fields
                if results.get('gt_seg_map', None) is not None:
                    results['gt_seg_map'] = results['gt_seg_map'][
                        patch[1]:patch[3], patch[0]:patch[2]]
                return results

    def __repr__(self) -> str:
        repr_str = self.__class__.__name__
        repr_str += f'(min_ious={self.min_ious}, '
        repr_str += f'min_crop_size={self.min_crop_size}, '
        repr_str += f'bbox_clip_border={self.bbox_clip_border})'
        return repr_str


@TRANSFORMS.register_module()
class Corrupt(BaseTransform):
    """Corruption augmentation.

    Corruption transforms implemented based on
    `imagecorruptions <https://github.com/bethgelab/imagecorruptions>`_.

    Required Keys:

    - img (np.uint8)

    Modified Keys:

    - img (np.uint8)

    Args:
        corruption (str): Corruption name.
        severity (int): The severity of corruption. Defaults to 1.
    """

    def __init__(self, corruption: str, severity: int = 1) -> None:
        self.corruption = corruption
        self.severity = severity

    def transform(self, results: dict) -> dict:
        """Call function to corrupt image.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Result dict with images corrupted.
        """
        if corrupt is None:
            raise RuntimeError('imagecorruptions is not installed')
        results['img'] = corrupt(
            results['img'].astype(np.uint8),
            corruption_name=self.corruption,
            severity=self.severity)
        return results

    def __repr__(self) -> str:
        repr_str = self.__class__.__name__
        repr_str += f'(corruption={self.corruption}, '
        repr_str += f'severity={self.severity})'
        return repr_str


@TRANSFORMS.register_module()
@avoid_cache_randomness
class Albu(BaseTransform):
    """Albumentation augmentation.

    Adds custom transformations from the Albumentations library.
    Please visit `https://albumentations.readthedocs.io` for more
    information.

    Required Keys:

    - img (np.uint8)
    - gt_bboxes (HorizontalBoxes[torch.float32]) (optional)
    - gt_masks (BitmapMasks | PolygonMasks) (optional)

    Modified Keys:

    - img (np.uint8)
    - gt_bboxes (HorizontalBoxes[torch.float32]) (optional)
    - gt_masks (BitmapMasks | PolygonMasks) (optional)
    - img_shape (tuple)

    An example of ``transforms`` is as follows:

    .. code-block::

        [
            dict(
                type='ShiftScaleRotate',
                shift_limit=0.0625,
                scale_limit=0.0,
                rotate_limit=0,
                interpolation=1,
                p=0.5),
            dict(
                type='RandomBrightnessContrast',
                brightness_limit=[0.1, 0.3],
                contrast_limit=[0.1, 0.3],
                p=0.2),
            dict(type='ChannelShuffle', p=0.1),
            dict(
                type='OneOf',
                transforms=[
                    dict(type='Blur', blur_limit=3, p=1.0),
                    dict(type='MedianBlur', blur_limit=3, p=1.0)
                ],
                p=0.1),
        ]

    Args:
        transforms (list[dict]): A list of albu transformations.
        bbox_params (dict, optional): Bbox_params for albumentations
            ``Compose``.
        keymap (dict, optional): Contains
            {'input key': 'albumentation-style key'}.
        skip_img_without_anno (bool): Whether to skip the image if no ann
            left after aug. Defaults to False.
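
    A hedged end-to-end sketch of a full ``Albu`` config; the pipeline
    below is illustrative and assumes albumentations is installed and
    that boxes are converted through the default keymap:

    .. code-block::

        dict(
            type='Albu',
            transforms=[dict(type='ChannelShuffle', p=0.5)],
            bbox_params=dict(
                type='BboxParams',
                format='pascal_voc',
                label_fields=['gt_bboxes_labels', 'gt_ignore_flags'],
                filter_lost_elements=True),
            keymap={'img': 'image', 'gt_bboxes': 'bboxes'},
            skip_img_without_anno=True)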
""" def __init__(self, transforms: List[dict], bbox_params: Optional[dict] = None, keymap: Optional[dict] = None, skip_img_without_anno: bool = False) -> None: if Compose is None: raise RuntimeError('albumentations is not installed') # Args will be modified later, copying it will be safer transforms = copy.deepcopy(transforms) if bbox_params is not None: bbox_params = copy.deepcopy(bbox_params) if keymap is not None: keymap = copy.deepcopy(keymap) self.transforms = transforms self.filter_lost_elements = False self.skip_img_without_anno = skip_img_without_anno # A simple workaround to remove masks without boxes if (isinstance(bbox_params, dict) and 'label_fields' in bbox_params and 'filter_lost_elements' in bbox_params): self.filter_lost_elements = True self.origin_label_fields = bbox_params['label_fields'] bbox_params['label_fields'] = ['idx_mapper'] del bbox_params['filter_lost_elements'] self.bbox_params = ( self.albu_builder(bbox_params) if bbox_params else None) self.aug = Compose([self.albu_builder(t) for t in self.transforms], bbox_params=self.bbox_params) if not keymap: self.keymap_to_albu = { 'img': 'image', 'gt_masks': 'masks', 'gt_bboxes': 'bboxes' } else: self.keymap_to_albu = keymap self.keymap_back = {v: k for k, v in self.keymap_to_albu.items()} def albu_builder(self, cfg: dict) -> albumentations: """Import a module from albumentations. It inherits some of :func:`build_from_cfg` logic. Args: cfg (dict): Config dict. It should at least contain the key "type". Returns: obj: The constructed object. """ assert isinstance(cfg, dict) and 'type' in cfg args = cfg.copy() obj_type = args.pop('type') if is_str(obj_type): if albumentations is None: raise RuntimeError('albumentations is not installed') obj_cls = getattr(albumentations, obj_type) elif inspect.isclass(obj_type): obj_cls = obj_type else: raise TypeError( f'type must be a str or valid type, but got {type(obj_type)}') if 'transforms' in args: args['transforms'] = [ self.albu_builder(transform) for transform in args['transforms'] ] return obj_cls(**args) @staticmethod def mapper(d: dict, keymap: dict) -> dict: """Dictionary mapper. Renames keys according to keymap provided. Args: d (dict): old dict keymap (dict): {'old_key':'new_key'} Returns: dict: new dict. 
""" updated_dict = {} for k, v in zip(d.keys(), d.values()): new_k = keymap.get(k, k) updated_dict[new_k] = d[k] return updated_dict @autocast_box_type() def transform(self, results: dict) -> Union[dict, None]: """Transform function of Albu.""" # TODO: gt_seg_map is not currently supported # dict to albumentations format results = self.mapper(results, self.keymap_to_albu) results, ori_masks = self._preprocess_results(results) results = self.aug(**results) results = self._postprocess_results(results, ori_masks) if results is None: return None # back to the original format results = self.mapper(results, self.keymap_back) results['img_shape'] = results['img'].shape return results def _preprocess_results(self, results: dict) -> tuple: """Pre-processing results to facilitate the use of Albu.""" if 'bboxes' in results: # to list of boxes if not isinstance(results['bboxes'], HorizontalBoxes): raise NotImplementedError( 'Albu only supports horizontal boxes now') bboxes = results['bboxes'].numpy() results['bboxes'] = [x for x in bboxes] # add pseudo-field for filtration if self.filter_lost_elements: results['idx_mapper'] = np.arange(len(results['bboxes'])) # TODO: Support mask structure in albu ori_masks = None if 'masks' in results: if isinstance(results['masks'], PolygonMasks): raise NotImplementedError( 'Albu only supports BitMap masks now') ori_masks = results['masks'] if albumentations.__version__ < '0.5': results['masks'] = results['masks'].masks else: results['masks'] = [mask for mask in results['masks'].masks] return results, ori_masks def _postprocess_results( self, results: dict, ori_masks: Optional[Union[BitmapMasks, PolygonMasks]] = None) -> dict: """Post-processing Albu output.""" # albumentations may return np.array or list on different versions if 'gt_bboxes_labels' in results and isinstance( results['gt_bboxes_labels'], list): results['gt_bboxes_labels'] = np.array( results['gt_bboxes_labels'], dtype=np.int64) if 'gt_ignore_flags' in results and isinstance( results['gt_ignore_flags'], list): results['gt_ignore_flags'] = np.array( results['gt_ignore_flags'], dtype=bool) if 'bboxes' in results: if isinstance(results['bboxes'], list): results['bboxes'] = np.array( results['bboxes'], dtype=np.float32) results['bboxes'] = results['bboxes'].reshape(-1, 4) results['bboxes'] = HorizontalBoxes(results['bboxes']) # filter label_fields if self.filter_lost_elements: for label in self.origin_label_fields: results[label] = np.array( [results[label][i] for i in results['idx_mapper']]) if 'masks' in results: assert ori_masks is not None results['masks'] = np.array( [results['masks'][i] for i in results['idx_mapper']]) results['masks'] = ori_masks.__class__( results['masks'], results['image'].shape[0], results['image'].shape[1]) if (not len(results['idx_mapper']) and self.skip_img_without_anno): return None elif 'masks' in results: results['masks'] = ori_masks.__class__( results['masks'], results['image'].shape[0], results['image'].shape[1]) return results def __repr__(self) -> str: repr_str = self.__class__.__name__ + f'(transforms={self.transforms})' return repr_str @TRANSFORMS.register_module() @avoid_cache_randomness class RandomCenterCropPad(BaseTransform): """Random center crop and random around padding for CornerNet. This operation generates randomly cropped image from the original image and pads it simultaneously. Different from :class:`RandomCrop`, the output shape may not equal to ``crop_size`` strictly. 
    We choose a random value from ``ratios`` and the output shape could be
    larger or smaller than ``crop_size``. The padding operation is also
    different from :class:`Pad`, here we use around padding instead of
    right-bottom padding.

    The relation between output image (padding image) and original image:

    .. code:: text

                           output image

                  +----------------------------+
                  |          padded area       |
           +------|----------------------------|----------+
           |      |         cropped area       |          |
           |      |      +---------------+     |          |
           |      |      |    .  center  |     | original |
           |      |      |       range   |     |  image   |
           |      |      +---------------+     |          |
           +------|----------------------------|----------+
                  |          padded area       |
                  +----------------------------+

    There are 5 main areas in the figure:

    - output image: output image of this operation, also called padding
      image in following instruction.
    - original image: input image of this operation.
    - padded area: non-intersect area of output image and original image.
    - cropped area: the overlap of output image and original image.
    - center range: a smaller area where the random center is chosen from.
      center range is computed by ``border`` and original image's shape
      to avoid our random center being too close to the original image's
      border.

    Also this operation acts differently in train and test mode; the
    summary pipeline is listed below.

    Train pipeline:

    1. Choose a ``random_ratio`` from ``ratios``, the shape of padding
       image will be ``random_ratio * crop_size``.
    2. Choose a ``random_center`` in center range.
    3. Generate padding image with center matches the ``random_center``.
    4. Initialize the padding image with pixel value equals to ``mean``.
    5. Copy the cropped area to padding image.
    6. Refine annotations.

    Test pipeline:

    1. Compute output shape according to ``test_pad_mode``.
    2. Generate padding image with center matches the original image
       center.
    3. Initialize the padding image with pixel value equals to ``mean``.
    4. Copy the ``cropped area`` to padding image.

    Required Keys:

    - img (np.float32)
    - img_shape (tuple)
    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
    - gt_bboxes_labels (np.int64) (optional)
    - gt_ignore_flags (bool) (optional)

    Modified Keys:

    - img (np.float32)
    - img_shape (tuple)
    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
    - gt_bboxes_labels (np.int64) (optional)
    - gt_ignore_flags (bool) (optional)

    Args:
        crop_size (tuple, optional): Expected size after crop; the final
            size will be computed according to the ratio. Requires
            (width, height) in train mode, and None in test mode.
        ratios (tuple, optional): random select a ratio from tuple and crop
            image to (crop_size[0] * ratio) * (crop_size[1] * ratio). Only
            available in train mode. Defaults to (0.9, 1.0, 1.1).
        border (int, optional): max distance from center select area to
            image border. Only available in train mode. Defaults to 128.
        mean (sequence, optional): Mean values of 3 channels.
        std (sequence, optional): Std values of 3 channels.
        to_rgb (bool, optional): Whether to convert the image from BGR to
            RGB.
        test_mode (bool): whether involve random variables in transform.
            In train mode, crop_size is fixed, center coords and ratio is
            random selected from predefined lists. In test mode, crop_size
            is image's original shape, center coords and ratio is fixed.
            Defaults to False.
        test_pad_mode (tuple, optional): padding method and padding shape
            value, only available in test mode. Default is using
            'logical_or' with 127 as padding shape value.

            - 'logical_or': final_shape = input_shape | padding_shape_value
            - 'size_divisor': final_shape = int(
              ceil(input_shape / padding_shape_value) *
              padding_shape_value)

            Defaults to ('logical_or', 127).
        test_pad_add_pix (int): Extra padding pixel in test mode.
            Defaults to 0.
        bbox_clip_border (bool): Whether clip the objects outside the
            border of the image. Defaults to True.
    """

    def __init__(self,
                 crop_size: Optional[tuple] = None,
                 ratios: Optional[tuple] = (0.9, 1.0, 1.1),
                 border: Optional[int] = 128,
                 mean: Optional[Sequence] = None,
                 std: Optional[Sequence] = None,
                 to_rgb: Optional[bool] = None,
                 test_mode: bool = False,
                 test_pad_mode: Optional[tuple] = ('logical_or', 127),
                 test_pad_add_pix: int = 0,
                 bbox_clip_border: bool = True) -> None:
        if test_mode:
            assert crop_size is None, 'crop_size must be None in test mode'
            assert ratios is None, 'ratios must be None in test mode'
            assert border is None, 'border must be None in test mode'
            assert isinstance(test_pad_mode, (list, tuple))
            assert test_pad_mode[0] in ['logical_or', 'size_divisor']
        else:
            assert isinstance(crop_size, (list, tuple))
            assert crop_size[0] > 0 and crop_size[1] > 0, (
                'crop_size must > 0 in train mode')
            assert isinstance(ratios, (list, tuple))
            assert test_pad_mode is None, (
                'test_pad_mode must be None in train mode')

        self.crop_size = crop_size
        self.ratios = ratios
        self.border = border
        # We do not set default value to mean, std and to_rgb because these
        # hyper-parameters are easy to forget but could affect the
        # performance. Please use the same setting as Normalize for
        # performance assurance.
        assert mean is not None and std is not None and to_rgb is not None
        self.to_rgb = to_rgb
        self.input_mean = mean
        self.input_std = std
        if to_rgb:
            self.mean = mean[::-1]
            self.std = std[::-1]
        else:
            self.mean = mean
            self.std = std
        self.test_mode = test_mode
        self.test_pad_mode = test_pad_mode
        self.test_pad_add_pix = test_pad_add_pix
        self.bbox_clip_border = bbox_clip_border

    def _get_border(self, border, size):
        """Get the final border for the target size.

        This function generates a ``final_border`` according to image's
        shape. The area between ``final_border`` and
        ``size - final_border`` is the ``center range``. We randomly
        choose center from the ``center range`` to avoid our random center
        being too close to the original image's border.

        Also ``center range`` should be larger than 0.

        Args:
            border (int): The initial border, default is 128.
            size (int): The width or height of original image.

        Returns:
            int: The final border.
        """
        k = 2 * border / size
        i = pow(2, np.ceil(np.log2(np.ceil(k))) + (k == int(k)))
        return border // i

    def _filter_boxes(self, patch, boxes):
        """Check whether the center of each box is in the patch.

        Args:
            patch (list[int]): The cropped area, [left, top, right,
                bottom].
            boxes (numpy array, (N x 4)): Ground truth boxes.

        Returns:
            mask (numpy array, (N,)): Each box is inside or outside the
                patch.
        """
        center = boxes.centers.numpy()
        mask = (center[:, 0] > patch[0]) * (center[:, 1] > patch[1]) * (
            center[:, 0] < patch[2]) * (
                center[:, 1] < patch[3])
        return mask

    def _crop_image_and_paste(self, image, center, size):
        """Crop image with a given center and size, then paste the cropped
        image to a blank image with two centers align.

        This function is equivalent to generating a blank image with
        ``size`` as its shape. Then cover it on the original image with two
        centers (the center of blank image and the random center of
        original image) aligned. The overlap area is pasted from the
        original image and the outside area is filled with ``mean pixel``.

        Args:
            image (np array, H x W x C): Original image.
            center (list[int]): Target crop center coord.
            size (list[int]): Target crop size.
                [target_h, target_w]

        Returns:
            cropped_img (np array, target_h x target_w x C): Cropped image.
            border (np array, 4): The distance of four border of
                ``cropped_img`` to the original image area,
                [top, bottom, left, right]
            patch (list[int]): The cropped area,
                [left, top, right, bottom].
        """
        center_y, center_x = center
        target_h, target_w = size
        img_h, img_w, img_c = image.shape

        x0 = max(0, center_x - target_w // 2)
        x1 = min(center_x + target_w // 2, img_w)
        y0 = max(0, center_y - target_h // 2)
        y1 = min(center_y + target_h // 2, img_h)
        patch = np.array((int(x0), int(y0), int(x1), int(y1)))

        left, right = center_x - x0, x1 - center_x
        top, bottom = center_y - y0, y1 - center_y

        cropped_center_y, cropped_center_x = target_h // 2, target_w // 2
        cropped_img = np.zeros((target_h, target_w, img_c),
                               dtype=image.dtype)
        for i in range(img_c):
            cropped_img[:, :, i] += self.mean[i]
        y_slice = slice(cropped_center_y - top, cropped_center_y + bottom)
        x_slice = slice(cropped_center_x - left, cropped_center_x + right)
        cropped_img[y_slice, x_slice, :] = image[y0:y1, x0:x1, :]

        border = np.array([
            cropped_center_y - top, cropped_center_y + bottom,
            cropped_center_x - left, cropped_center_x + right
        ],
                          dtype=np.float32)

        return cropped_img, border, patch

    def _train_aug(self, results):
        """Random crop and around padding the original image.

        Args:
            results (dict): Image information in the augment pipeline.

        Returns:
            results (dict): The updated dict.
        """
        img = results['img']
        h, w, c = img.shape
        gt_bboxes = results['gt_bboxes']
        while True:
            scale = random.choice(self.ratios)
            new_h = int(self.crop_size[1] * scale)
            new_w = int(self.crop_size[0] * scale)
            h_border = self._get_border(self.border, h)
            w_border = self._get_border(self.border, w)

            for i in range(50):
                center_x = random.randint(low=w_border, high=w - w_border)
                center_y = random.randint(low=h_border, high=h - h_border)

                cropped_img, border, patch = self._crop_image_and_paste(
                    img, [center_y, center_x], [new_h, new_w])

                # if the image does not have a valid bbox, any crop patch
                # is valid.
                if len(gt_bboxes) == 0:
                    results['img'] = cropped_img
                    results['img_shape'] = cropped_img.shape
                    return results

                mask = self._filter_boxes(patch, gt_bboxes)
                if not mask.any():
                    continue

                results['img'] = cropped_img
                results['img_shape'] = cropped_img.shape

                x0, y0, x1, y1 = patch

                left_w, top_h = center_x - x0, center_y - y0
                cropped_center_x, cropped_center_y = new_w // 2, new_h // 2

                # crop bboxes accordingly and clip to the image boundary
                gt_bboxes = gt_bboxes[mask]
                gt_bboxes.translate_([
                    cropped_center_x - left_w - x0,
                    cropped_center_y - top_h - y0
                ])
                if self.bbox_clip_border:
                    gt_bboxes.clip_([new_h, new_w])
                keep = gt_bboxes.is_inside([new_h, new_w]).numpy()
                gt_bboxes = gt_bboxes[keep]

                results['gt_bboxes'] = gt_bboxes

                # ignore_flags
                if results.get('gt_ignore_flags', None) is not None:
                    gt_ignore_flags = results['gt_ignore_flags'][mask]
                    results['gt_ignore_flags'] = \
                        gt_ignore_flags[keep]

                # labels
                if results.get('gt_bboxes_labels', None) is not None:
                    gt_labels = results['gt_bboxes_labels'][mask]
                    results['gt_bboxes_labels'] = gt_labels[keep]

                if 'gt_masks' in results or 'gt_seg_map' in results:
                    raise NotImplementedError(
                        'RandomCenterCropPad only supports bbox.')

                return results

    def _test_aug(self, results):
        """Around padding the original image without cropping.

        The padding mode and value are from ``test_pad_mode``.

        Args:
            results (dict): Image information in the augment pipeline.

        Returns:
            results (dict): The updated dict.
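
        Examples:
            A worked illustration of the default ``('logical_or', 127)``
            mode (the input size is an assumed example): an input height
            of 480 is padded to ``480 | 127 = 511``, plus
            ``test_pad_add_pix``.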
""" img = results['img'] h, w, c = img.shape if self.test_pad_mode[0] in ['logical_or']: # self.test_pad_add_pix is only used for centernet target_h = (h | self.test_pad_mode[1]) + self.test_pad_add_pix target_w = (w | self.test_pad_mode[1]) + self.test_pad_add_pix elif self.test_pad_mode[0] in ['size_divisor']: divisor = self.test_pad_mode[1] target_h = int(np.ceil(h / divisor)) * divisor target_w = int(np.ceil(w / divisor)) * divisor else: raise NotImplementedError( 'RandomCenterCropPad only support two testing pad mode:' 'logical-or and size_divisor.') cropped_img, border, _ = self._crop_image_and_paste( img, [h // 2, w // 2], [target_h, target_w]) results['img'] = cropped_img results['img_shape'] = cropped_img.shape results['border'] = border return results @autocast_box_type() def transform(self, results: dict) -> dict: img = results['img'] assert img.dtype == np.float32, ( 'RandomCenterCropPad needs the input image of dtype np.float32,' ' please set "to_float32=True" in "LoadImageFromFile" pipeline') h, w, c = img.shape assert c == len(self.mean) if self.test_mode: return self._test_aug(results) else: return self._train_aug(results) def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(crop_size={self.crop_size}, ' repr_str += f'ratios={self.ratios}, ' repr_str += f'border={self.border}, ' repr_str += f'mean={self.input_mean}, ' repr_str += f'std={self.input_std}, ' repr_str += f'to_rgb={self.to_rgb}, ' repr_str += f'test_mode={self.test_mode}, ' repr_str += f'test_pad_mode={self.test_pad_mode}, ' repr_str += f'bbox_clip_border={self.bbox_clip_border})' return repr_str @TRANSFORMS.register_module() class CutOut(BaseTransform): """CutOut operation. Randomly drop some regions of image used in `Cutout `_. Required Keys: - img Modified Keys: - img Args: n_holes (int or tuple[int, int]): Number of regions to be dropped. If it is given as a list, number of holes will be randomly selected from the closed interval [``n_holes[0]``, ``n_holes[1]``]. cutout_shape (tuple[int, int] or list[tuple[int, int]], optional): The candidate shape of dropped regions. It can be ``tuple[int, int]`` to use a fixed cutout shape, or ``list[tuple[int, int]]`` to randomly choose shape from the list. Defaults to None. cutout_ratio (tuple[float, float] or list[tuple[float, float]], optional): The candidate ratio of dropped regions. It can be ``tuple[float, float]`` to use a fixed ratio or ``list[tuple[float, float]]`` to randomly choose ratio from the list. Please note that ``cutout_shape`` and ``cutout_ratio`` cannot be both given at the same time. Defaults to None. fill_in (tuple[float, float, float] or tuple[int, int, int]): The value of pixel to fill in the dropped regions. Defaults to (0, 0, 0). """ def __init__( self, n_holes: Union[int, Tuple[int, int]], cutout_shape: Optional[Union[Tuple[int, int], List[Tuple[int, int]]]] = None, cutout_ratio: Optional[Union[Tuple[float, float], List[Tuple[float, float]]]] = None, fill_in: Union[Tuple[float, float, float], Tuple[int, int, int]] = (0, 0, 0) ) -> None: assert (cutout_shape is None) ^ (cutout_ratio is None), \ 'Either cutout_shape or cutout_ratio should be specified.' 
@TRANSFORMS.register_module()
class CutOut(BaseTransform):
    """CutOut operation.

    Randomly drop some regions of the image, as used in
    `Cutout <https://arxiv.org/abs/1708.04552>`_.

    Required Keys:

    - img

    Modified Keys:

    - img

    Args:
        n_holes (int or tuple[int, int]): Number of regions to be dropped.
            If it is given as a list, number of holes will be randomly
            selected from the closed interval [``n_holes[0]``,
            ``n_holes[1]``].
        cutout_shape (tuple[int, int] or list[tuple[int, int]], optional):
            The candidate shape of dropped regions. It can be
            ``tuple[int, int]`` to use a fixed cutout shape, or
            ``list[tuple[int, int]]`` to randomly choose shape from the list.
            Defaults to None.
        cutout_ratio (tuple[float, float] or list[tuple[float, float]],
            optional): The candidate ratio of dropped regions. It can be
            ``tuple[float, float]`` to use a fixed ratio or
            ``list[tuple[float, float]]`` to randomly choose ratio from the
            list. Please note that ``cutout_shape`` and ``cutout_ratio``
            cannot be both given at the same time. Defaults to None.
        fill_in (tuple[float, float, float] or tuple[int, int, int]): The
            value of pixel to fill in the dropped regions. Defaults to
            (0, 0, 0).
    """

    def __init__(
        self,
        n_holes: Union[int, Tuple[int, int]],
        cutout_shape: Optional[Union[Tuple[int, int],
                                     List[Tuple[int, int]]]] = None,
        cutout_ratio: Optional[Union[Tuple[float, float],
                                     List[Tuple[float, float]]]] = None,
        fill_in: Union[Tuple[float, float, float], Tuple[int, int,
                                                         int]] = (0, 0, 0)
    ) -> None:
        assert (cutout_shape is None) ^ (cutout_ratio is None), \
            'Either cutout_shape or cutout_ratio should be specified.'
        assert (isinstance(cutout_shape, (list, tuple))
                or isinstance(cutout_ratio, (list, tuple)))
        if isinstance(n_holes, tuple):
            assert len(n_holes) == 2 and 0 <= n_holes[0] < n_holes[1]
        else:
            n_holes = (n_holes, n_holes)
        self.n_holes = n_holes
        self.fill_in = fill_in
        self.with_ratio = cutout_ratio is not None
        self.candidates = cutout_ratio if self.with_ratio else cutout_shape
        if not isinstance(self.candidates, list):
            self.candidates = [self.candidates]

    @autocast_box_type()
    def transform(self, results: dict) -> dict:
        """Call function to drop some regions of image."""
        h, w, c = results['img'].shape
        n_holes = np.random.randint(self.n_holes[0], self.n_holes[1] + 1)
        for _ in range(n_holes):
            x1 = np.random.randint(0, w)
            y1 = np.random.randint(0, h)
            index = np.random.randint(0, len(self.candidates))
            if not self.with_ratio:
                cutout_w, cutout_h = self.candidates[index]
            else:
                cutout_w = int(self.candidates[index][0] * w)
                cutout_h = int(self.candidates[index][1] * h)

            x2 = np.clip(x1 + cutout_w, 0, w)
            y2 = np.clip(y1 + cutout_h, 0, h)
            results['img'][y1:y2, x1:x2, :] = self.fill_in

        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f'(n_holes={self.n_holes}, '
        repr_str += (f'cutout_ratio={self.candidates}, ' if self.with_ratio
                     else f'cutout_shape={self.candidates}, ')
        repr_str += f'fill_in={self.fill_in})'
        return repr_str
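# --- Illustrative sketch (not part of the original module) ------------------
# A NumPy-only mirror of the CutOut loop above, with a fixed hole shape and a
# seeded generator so the result is reproducible; all values are demo choices.
def _demo_cutout(seed: int = 0):
    rng = np.random.default_rng(seed)
    img = np.full((64, 64, 3), 255, dtype=np.uint8)
    n_holes = rng.integers(2, 4 + 1)  # like n_holes=(2, 4), a closed interval
    for _ in range(n_holes):
        x1 = rng.integers(0, 64)  # the top-left corner may touch the image
        y1 = rng.integers(0, 64)  # border, in which case the hole is clipped
        x2 = min(x1 + 8, 64)      # fixed cutout_shape=(8, 8)
        y2 = min(y1 + 8, 64)
        img[y1:y2, x1:x2, :] = (0, 0, 0)  # fill_in
    return img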
@TRANSFORMS.register_module()
class Mosaic(BaseTransform):
    """Mosaic augmentation.

    Given 4 images, mosaic transform combines them into one output image.
    The output image is composed of the parts from each sub-image.

    .. code:: text

                        mosaic transform
                           center_x
                +------------------------------+
                |       pad        |  pad      |
                |      +-----------+           |
                |      |           |           |
                |      |  image1   |--------+  |
                |      |           |        |  |
                |      |           | image2 |  |
     center_y   |----+-------------+-----------|
                |    |   cropped   |           |
                |pad |   image3    |  image4   |
                |    |             |           |
                +----|-------------+-----------+
                     |             |
                     +-------------+

    The mosaic transform steps are as follows:

    1. Choose the mosaic center as the intersection of the 4 images.
    2. Get the left top image according to the index, and randomly
       sample another 3 images from the custom dataset.
    3. The sub-image will be cropped if it is larger than the mosaic patch.

    Required Keys:

    - img
    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
    - gt_bboxes_labels (np.int64) (optional)
    - gt_ignore_flags (bool) (optional)
    - mix_results (List[dict])

    Modified Keys:

    - img
    - img_shape
    - gt_bboxes (optional)
    - gt_bboxes_labels (optional)
    - gt_ignore_flags (optional)

    Args:
        img_scale (Sequence[int]): Image size after mosaic pipeline of single
            image. The shape order should be (width, height).
            Defaults to (640, 640).
        center_ratio_range (Sequence[float]): Center ratio range of mosaic
            output. Defaults to (0.5, 1.5).
        bbox_clip_border (bool, optional): Whether to clip the objects outside
            the border of the image. In some dataset like MOT17, the gt bboxes
            are allowed to cross the border of images. Therefore, we don't
            need to clip the gt bboxes in these cases. Defaults to True.
        pad_val (int): Pad value. Defaults to 114.
        prob (float): Probability of applying this transformation.
            Defaults to 1.0.
    """

    def __init__(self,
                 img_scale: Tuple[int, int] = (640, 640),
                 center_ratio_range: Tuple[float, float] = (0.5, 1.5),
                 bbox_clip_border: bool = True,
                 pad_val: float = 114.0,
                 prob: float = 1.0) -> None:
        assert isinstance(img_scale, tuple)
        assert 0 <= prob <= 1.0, 'The probability should be in range [0,1]. ' \
                                 f'got {prob}.'
        log_img_scale(img_scale, skip_square=True, shape_order='wh')
        self.img_scale = img_scale
        self.center_ratio_range = center_ratio_range
        self.bbox_clip_border = bbox_clip_border
        self.pad_val = pad_val
        self.prob = prob

    @cache_randomness
    def get_indexes(self, dataset: BaseDataset) -> List[int]:
        """Call function to collect indexes.

        Args:
            dataset (:obj:`MultiImageMixDataset`): The dataset.

        Returns:
            list: indexes.
        """
        indexes = [random.randint(0, len(dataset)) for _ in range(3)]
        return indexes

    @autocast_box_type()
    def transform(self, results: dict) -> dict:
        """Mosaic transform function.

        Args:
            results (dict): Result dict.

        Returns:
            dict: Updated result dict.
        """
        if random.uniform(0, 1) > self.prob:
            return results

        assert 'mix_results' in results
        mosaic_bboxes = []
        mosaic_bboxes_labels = []
        mosaic_ignore_flags = []
        if len(results['img'].shape) == 3:
            mosaic_img = np.full(
                (int(self.img_scale[1] * 2), int(self.img_scale[0] * 2), 3),
                self.pad_val,
                dtype=results['img'].dtype)
        else:
            mosaic_img = np.full(
                (int(self.img_scale[1] * 2), int(self.img_scale[0] * 2)),
                self.pad_val,
                dtype=results['img'].dtype)

        # mosaic center x, y
        center_x = int(
            random.uniform(*self.center_ratio_range) * self.img_scale[0])
        center_y = int(
            random.uniform(*self.center_ratio_range) * self.img_scale[1])
        center_position = (center_x, center_y)

        loc_strs = ('top_left', 'top_right', 'bottom_left', 'bottom_right')
        for i, loc in enumerate(loc_strs):
            if loc == 'top_left':
                results_patch = copy.deepcopy(results)
            else:
                results_patch = copy.deepcopy(results['mix_results'][i - 1])

            img_i = results_patch['img']
            h_i, w_i = img_i.shape[:2]
            # keep_ratio resize
            scale_ratio_i = min(self.img_scale[1] / h_i,
                                self.img_scale[0] / w_i)
            img_i = mmcv.imresize(
                img_i, (int(w_i * scale_ratio_i), int(h_i * scale_ratio_i)))

            # compute the combine parameters
            paste_coord, crop_coord = self._mosaic_combine(
                loc, center_position, img_i.shape[:2][::-1])
            x1_p, y1_p, x2_p, y2_p = paste_coord
            x1_c, y1_c, x2_c, y2_c = crop_coord

            # crop and paste image
            mosaic_img[y1_p:y2_p, x1_p:x2_p] = img_i[y1_c:y2_c, x1_c:x2_c]

            # adjust coordinate
            gt_bboxes_i = results_patch['gt_bboxes']
            gt_bboxes_labels_i = results_patch['gt_bboxes_labels']
            gt_ignore_flags_i = results_patch['gt_ignore_flags']

            padw = x1_p - x1_c
            padh = y1_p - y1_c
            gt_bboxes_i.rescale_([scale_ratio_i, scale_ratio_i])
            gt_bboxes_i.translate_([padw, padh])
            mosaic_bboxes.append(gt_bboxes_i)
            mosaic_bboxes_labels.append(gt_bboxes_labels_i)
            mosaic_ignore_flags.append(gt_ignore_flags_i)

        mosaic_bboxes = mosaic_bboxes[0].cat(mosaic_bboxes, 0)
        mosaic_bboxes_labels = np.concatenate(mosaic_bboxes_labels, 0)
        mosaic_ignore_flags = np.concatenate(mosaic_ignore_flags, 0)

        if self.bbox_clip_border:
            mosaic_bboxes.clip_([2 * self.img_scale[1],
                                 2 * self.img_scale[0]])
        # remove outside bboxes
        inside_inds = mosaic_bboxes.is_inside(
            [2 * self.img_scale[1], 2 * self.img_scale[0]]).numpy()
        mosaic_bboxes = mosaic_bboxes[inside_inds]
        mosaic_bboxes_labels = mosaic_bboxes_labels[inside_inds]
        mosaic_ignore_flags = mosaic_ignore_flags[inside_inds]

        results['img'] = mosaic_img
        results['img_shape'] = mosaic_img.shape
        results['gt_bboxes'] = mosaic_bboxes
        results['gt_bboxes_labels'] = mosaic_bboxes_labels
        results['gt_ignore_flags'] = mosaic_ignore_flags
        return results

    def _mosaic_combine(
            self, loc: str, center_position_xy: Sequence[float],
            img_shape_wh: Sequence[int]) -> Tuple[Tuple[int], Tuple[int]]:
        """Calculate global coordinate of mosaic image and local coordinate of
        cropped sub-image.
Args: loc (str): Index for the sub-image, loc in ('top_left', 'top_right', 'bottom_left', 'bottom_right'). center_position_xy (Sequence[float]): Mixing center for 4 images, (x, y). img_shape_wh (Sequence[int]): Width and height of sub-image Returns: tuple[tuple[float]]: Corresponding coordinate of pasting and cropping - paste_coord (tuple): paste corner coordinate in mosaic image. - crop_coord (tuple): crop corner coordinate in mosaic image. """ assert loc in ('top_left', 'top_right', 'bottom_left', 'bottom_right') if loc == 'top_left': # index0 to top left part of image x1, y1, x2, y2 = max(center_position_xy[0] - img_shape_wh[0], 0), \ max(center_position_xy[1] - img_shape_wh[1], 0), \ center_position_xy[0], \ center_position_xy[1] crop_coord = img_shape_wh[0] - (x2 - x1), img_shape_wh[1] - ( y2 - y1), img_shape_wh[0], img_shape_wh[1] elif loc == 'top_right': # index1 to top right part of image x1, y1, x2, y2 = center_position_xy[0], \ max(center_position_xy[1] - img_shape_wh[1], 0), \ min(center_position_xy[0] + img_shape_wh[0], self.img_scale[0] * 2), \ center_position_xy[1] crop_coord = 0, img_shape_wh[1] - (y2 - y1), min( img_shape_wh[0], x2 - x1), img_shape_wh[1] elif loc == 'bottom_left': # index2 to bottom left part of image x1, y1, x2, y2 = max(center_position_xy[0] - img_shape_wh[0], 0), \ center_position_xy[1], \ center_position_xy[0], \ min(self.img_scale[1] * 2, center_position_xy[1] + img_shape_wh[1]) crop_coord = img_shape_wh[0] - (x2 - x1), 0, img_shape_wh[0], min( y2 - y1, img_shape_wh[1]) else: # index3 to bottom right part of image x1, y1, x2, y2 = center_position_xy[0], \ center_position_xy[1], \ min(center_position_xy[0] + img_shape_wh[0], self.img_scale[0] * 2), \ min(self.img_scale[1] * 2, center_position_xy[1] + img_shape_wh[1]) crop_coord = 0, 0, min(img_shape_wh[0], x2 - x1), min(y2 - y1, img_shape_wh[1]) paste_coord = x1, y1, x2, y2 return paste_coord, crop_coord def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(img_scale={self.img_scale}, ' repr_str += f'center_ratio_range={self.center_ratio_range}, ' repr_str += f'pad_val={self.pad_val}, ' repr_str += f'prob={self.prob})' return repr_str @TRANSFORMS.register_module() class MixUp(BaseTransform): """MixUp data augmentation. .. code:: text mixup transform +------------------------------+ | mixup image | | | +--------|--------+ | | | | | | |---------------+ | | | | | | | | image | | | | | | | | | | | |-----------------+ | | pad | +------------------------------+ The mixup transform steps are as follows: 1. Another random image is picked by dataset and embedded in the top left patch(after padding and resizing) 2. The target of mixup transform is the weighted average of mixup image and origin image. Required Keys: - img - gt_bboxes (BaseBoxes[torch.float32]) (optional) - gt_bboxes_labels (np.int64) (optional) - gt_ignore_flags (bool) (optional) - mix_results (List[dict]) Modified Keys: - img - img_shape - gt_bboxes (optional) - gt_bboxes_labels (optional) - gt_ignore_flags (optional) Args: img_scale (Sequence[int]): Image output size after mixup pipeline. The shape order should be (width, height). Defaults to (640, 640). ratio_range (Sequence[float]): Scale ratio of mixup image. Defaults to (0.5, 1.5). flip_ratio (float): Horizontal flip ratio of mixup image. Defaults to 0.5. pad_val (int): Pad value. Defaults to 114. max_iters (int): The maximum number of iterations. If the number of iterations is greater than `max_iters`, but gt_bbox is still empty, then the iteration is terminated. 
            Defaults to 15.
        bbox_clip_border (bool, optional): Whether to clip the objects outside
            the border of the image. In some dataset like MOT17, the gt bboxes
            are allowed to cross the border of images. Therefore, we don't
            need to clip the gt bboxes in these cases. Defaults to True.
    """

    def __init__(self,
                 img_scale: Tuple[int, int] = (640, 640),
                 ratio_range: Tuple[float, float] = (0.5, 1.5),
                 flip_ratio: float = 0.5,
                 pad_val: float = 114.0,
                 max_iters: int = 15,
                 bbox_clip_border: bool = True) -> None:
        assert isinstance(img_scale, tuple)
        log_img_scale(img_scale, skip_square=True, shape_order='wh')
        self.dynamic_scale = img_scale
        self.ratio_range = ratio_range
        self.flip_ratio = flip_ratio
        self.pad_val = pad_val
        self.max_iters = max_iters
        self.bbox_clip_border = bbox_clip_border

    @cache_randomness
    def get_indexes(self, dataset: BaseDataset) -> int:
        """Call function to collect indexes.

        Args:
            dataset (:obj:`MultiImageMixDataset`): The dataset.

        Returns:
            int: index.
        """
        for i in range(self.max_iters):
            index = random.randint(0, len(dataset))
            gt_bboxes_i = dataset[index]['gt_bboxes']
            if len(gt_bboxes_i) != 0:
                break
        return index

    @autocast_box_type()
    def transform(self, results: dict) -> dict:
        """MixUp transform function.

        Args:
            results (dict): Result dict.

        Returns:
            dict: Updated result dict.
        """
        assert 'mix_results' in results
        assert len(
            results['mix_results']) == 1, 'MixUp only supports 2 images now!'

        if results['mix_results'][0]['gt_bboxes'].shape[0] == 0:
            # empty bbox
            return results

        retrieve_results = results['mix_results'][0]
        retrieve_img = retrieve_results['img']

        jit_factor = random.uniform(*self.ratio_range)
        is_flip = random.uniform(0, 1) > self.flip_ratio

        if len(retrieve_img.shape) == 3:
            out_img = np.ones(
                (self.dynamic_scale[1], self.dynamic_scale[0], 3),
                dtype=retrieve_img.dtype) * self.pad_val
        else:
            out_img = np.ones(
                self.dynamic_scale[::-1],
                dtype=retrieve_img.dtype) * self.pad_val

        # 1. keep_ratio resize
        scale_ratio = min(self.dynamic_scale[1] / retrieve_img.shape[0],
                          self.dynamic_scale[0] / retrieve_img.shape[1])
        retrieve_img = mmcv.imresize(
            retrieve_img, (int(retrieve_img.shape[1] * scale_ratio),
                           int(retrieve_img.shape[0] * scale_ratio)))

        # 2. paste
        out_img[:retrieve_img.shape[0], :retrieve_img.shape[1]] = retrieve_img

        # 3. scale jit
        scale_ratio *= jit_factor
        out_img = mmcv.imresize(out_img, (int(out_img.shape[1] * jit_factor),
                                          int(out_img.shape[0] * jit_factor)))

        # 4. flip
        if is_flip:
            out_img = out_img[:, ::-1, :]

        # 5. random crop
        ori_img = results['img']
        origin_h, origin_w = out_img.shape[:2]
        target_h, target_w = ori_img.shape[:2]
        padded_img = np.ones((max(origin_h, target_h), max(
            origin_w, target_w), 3)) * self.pad_val
        padded_img = padded_img.astype(np.uint8)
        padded_img[:origin_h, :origin_w] = out_img

        x_offset, y_offset = 0, 0
        if padded_img.shape[0] > target_h:
            y_offset = random.randint(0, padded_img.shape[0] - target_h)
        if padded_img.shape[1] > target_w:
            x_offset = random.randint(0, padded_img.shape[1] - target_w)
        padded_cropped_img = padded_img[y_offset:y_offset + target_h,
                                        x_offset:x_offset + target_w]

        # 6. adjust bbox
        retrieve_gt_bboxes = retrieve_results['gt_bboxes']
        retrieve_gt_bboxes.rescale_([scale_ratio, scale_ratio])
        if self.bbox_clip_border:
            retrieve_gt_bboxes.clip_([origin_h, origin_w])

        if is_flip:
            retrieve_gt_bboxes.flip_([origin_h, origin_w],
                                     direction='horizontal')

        # 7. filter
        cp_retrieve_gt_bboxes = retrieve_gt_bboxes.clone()
        cp_retrieve_gt_bboxes.translate_([-x_offset, -y_offset])
        if self.bbox_clip_border:
            cp_retrieve_gt_bboxes.clip_([target_h, target_w])

        # 8. mix up
        ori_img = ori_img.astype(np.float32)
        mixup_img = 0.5 * ori_img + 0.5 * padded_cropped_img.astype(np.float32)

        retrieve_gt_bboxes_labels = retrieve_results['gt_bboxes_labels']
        retrieve_gt_ignore_flags = retrieve_results['gt_ignore_flags']

        mixup_gt_bboxes = cp_retrieve_gt_bboxes.cat(
            (results['gt_bboxes'], cp_retrieve_gt_bboxes), dim=0)
        mixup_gt_bboxes_labels = np.concatenate(
            (results['gt_bboxes_labels'], retrieve_gt_bboxes_labels), axis=0)
        mixup_gt_ignore_flags = np.concatenate(
            (results['gt_ignore_flags'], retrieve_gt_ignore_flags), axis=0)

        # remove outside bbox
        inside_inds = mixup_gt_bboxes.is_inside([target_h, target_w]).numpy()
        mixup_gt_bboxes = mixup_gt_bboxes[inside_inds]
        mixup_gt_bboxes_labels = mixup_gt_bboxes_labels[inside_inds]
        mixup_gt_ignore_flags = mixup_gt_ignore_flags[inside_inds]

        results['img'] = mixup_img.astype(np.uint8)
        results['img_shape'] = mixup_img.shape
        results['gt_bboxes'] = mixup_gt_bboxes
        results['gt_bboxes_labels'] = mixup_gt_bboxes_labels
        results['gt_ignore_flags'] = mixup_gt_ignore_flags
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f'(dynamic_scale={self.dynamic_scale}, '
        repr_str += f'ratio_range={self.ratio_range}, '
        repr_str += f'flip_ratio={self.flip_ratio}, '
        repr_str += f'pad_val={self.pad_val}, '
        repr_str += f'max_iters={self.max_iters}, '
        repr_str += f'bbox_clip_border={self.bbox_clip_border})'
        return repr_str
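# --- Illustrative sketch (not part of the original module) ------------------
# The fixed 0.5/0.5 blend of step 8 and the bbox shift of step 7 above, on toy
# data: the retrieved boxes follow the random crop by translating with the
# negative crop offset. All array values below are made up.
def _demo_mixup_blend_and_shift():
    ori = np.full((4, 4, 3), 200, dtype=np.uint8)
    retrieved = np.zeros((4, 4, 3), dtype=np.uint8)
    mixed = 0.5 * ori.astype(np.float32) + 0.5 * retrieved.astype(np.float32)
    assert float(mixed.max()) == 100.0  # equal-weight average of both images
    bbox = np.array([10., 12., 30., 40.])  # xyxy box in the retrieved image
    x_offset, y_offset = 5, 8              # random crop offsets
    bbox_shifted = bbox - np.array([x_offset, y_offset, x_offset, y_offset])
    return mixed.astype(np.uint8), bbox_shifted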
@TRANSFORMS.register_module()
class RandomAffine(BaseTransform):
    """Random affine transform data augmentation.

    This operation randomly generates an affine transform matrix including
    rotation, translation, shear and scaling transforms.

    Required Keys:

    - img
    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
    - gt_bboxes_labels (np.int64) (optional)
    - gt_ignore_flags (bool) (optional)

    Modified Keys:

    - img
    - img_shape
    - gt_bboxes (optional)
    - gt_bboxes_labels (optional)
    - gt_ignore_flags (optional)

    Args:
        max_rotate_degree (float): Maximum degrees of rotation transform.
            Defaults to 10.
        max_translate_ratio (float): Maximum ratio of translation.
            Defaults to 0.1.
        scaling_ratio_range (tuple[float]): Min and max ratio of
            scaling transform. Defaults to (0.5, 1.5).
        max_shear_degree (float): Maximum degrees of shear
            transform. Defaults to 2.
        border (tuple[int]): Distance from width and height sides of input
            image to adjust output shape. Only used in mosaic dataset.
            Defaults to (0, 0).
        border_val (tuple[int]): Border padding values of 3 channels.
            Defaults to (114, 114, 114).
        bbox_clip_border (bool, optional): Whether to clip the objects outside
            the border of the image. In some dataset like MOT17, the gt bboxes
            are allowed to cross the border of images. Therefore, we don't
            need to clip the gt bboxes in these cases. Defaults to True.
""" def __init__(self, max_rotate_degree: float = 10.0, max_translate_ratio: float = 0.1, scaling_ratio_range: Tuple[float, float] = (0.5, 1.5), max_shear_degree: float = 2.0, border: Tuple[int, int] = (0, 0), border_val: Tuple[int, int, int] = (114, 114, 114), bbox_clip_border: bool = True) -> None: assert 0 <= max_translate_ratio <= 1 assert scaling_ratio_range[0] <= scaling_ratio_range[1] assert scaling_ratio_range[0] > 0 self.max_rotate_degree = max_rotate_degree self.max_translate_ratio = max_translate_ratio self.scaling_ratio_range = scaling_ratio_range self.max_shear_degree = max_shear_degree self.border = border self.border_val = border_val self.bbox_clip_border = bbox_clip_border @cache_randomness def _get_random_homography_matrix(self, height, width): # Rotation rotation_degree = random.uniform(-self.max_rotate_degree, self.max_rotate_degree) rotation_matrix = self._get_rotation_matrix(rotation_degree) # Scaling scaling_ratio = random.uniform(self.scaling_ratio_range[0], self.scaling_ratio_range[1]) scaling_matrix = self._get_scaling_matrix(scaling_ratio) # Shear x_degree = random.uniform(-self.max_shear_degree, self.max_shear_degree) y_degree = random.uniform(-self.max_shear_degree, self.max_shear_degree) shear_matrix = self._get_shear_matrix(x_degree, y_degree) # Translation trans_x = random.uniform(-self.max_translate_ratio, self.max_translate_ratio) * width trans_y = random.uniform(-self.max_translate_ratio, self.max_translate_ratio) * height translate_matrix = self._get_translation_matrix(trans_x, trans_y) warp_matrix = ( translate_matrix @ shear_matrix @ rotation_matrix @ scaling_matrix) return warp_matrix @autocast_box_type() def transform(self, results: dict) -> dict: img = results['img'] height = img.shape[0] + self.border[1] * 2 width = img.shape[1] + self.border[0] * 2 warp_matrix = self._get_random_homography_matrix(height, width) img = cv2.warpPerspective( img, warp_matrix, dsize=(width, height), borderValue=self.border_val) results['img'] = img results['img_shape'] = img.shape bboxes = results['gt_bboxes'] num_bboxes = len(bboxes) if num_bboxes: bboxes.project_(warp_matrix) if self.bbox_clip_border: bboxes.clip_([height, width]) # remove outside bbox valid_index = bboxes.is_inside([height, width]).numpy() results['gt_bboxes'] = bboxes[valid_index] results['gt_bboxes_labels'] = results['gt_bboxes_labels'][ valid_index] results['gt_ignore_flags'] = results['gt_ignore_flags'][ valid_index] if 'gt_masks' in results: raise NotImplementedError('RandomAffine only supports bbox.') return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(max_rotate_degree={self.max_rotate_degree}, ' repr_str += f'max_translate_ratio={self.max_translate_ratio}, ' repr_str += f'scaling_ratio_range={self.scaling_ratio_range}, ' repr_str += f'max_shear_degree={self.max_shear_degree}, ' repr_str += f'border={self.border}, ' repr_str += f'border_val={self.border_val}, ' repr_str += f'bbox_clip_border={self.bbox_clip_border})' return repr_str @staticmethod def _get_rotation_matrix(rotate_degrees: float) -> np.ndarray: radian = math.radians(rotate_degrees) rotation_matrix = np.array( [[np.cos(radian), -np.sin(radian), 0.], [np.sin(radian), np.cos(radian), 0.], [0., 0., 1.]], dtype=np.float32) return rotation_matrix @staticmethod def _get_scaling_matrix(scale_ratio: float) -> np.ndarray: scaling_matrix = np.array( [[scale_ratio, 0., 0.], [0., scale_ratio, 0.], [0., 0., 1.]], dtype=np.float32) return scaling_matrix @staticmethod def _get_shear_matrix(x_shear_degrees: float, 
                          y_shear_degrees: float) -> np.ndarray:
        x_radian = math.radians(x_shear_degrees)
        y_radian = math.radians(y_shear_degrees)
        shear_matrix = np.array([[1, np.tan(x_radian), 0.],
                                 [np.tan(y_radian), 1, 0.], [0., 0., 1.]],
                                dtype=np.float32)
        return shear_matrix

    @staticmethod
    def _get_translation_matrix(x: float, y: float) -> np.ndarray:
        translation_matrix = np.array([[1, 0., x], [0., 1, y], [0., 0., 1.]],
                                      dtype=np.float32)
        return translation_matrix
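# --- Illustrative sketch (not part of the original module) ------------------
# How RandomAffine composes its final warp: the matrices multiply right to
# left, so scaling is applied first, then rotation, shear and translation.
# The degree/ratio/offset values below are arbitrary demo numbers.
def _demo_affine_composition():
    rotation = RandomAffine._get_rotation_matrix(10.0)
    scaling = RandomAffine._get_scaling_matrix(1.2)
    shear = RandomAffine._get_shear_matrix(2.0, -1.0)
    translation = RandomAffine._get_translation_matrix(16.0, -8.0)
    warp = translation @ shear @ rotation @ scaling
    point = warp @ np.array([100.0, 50.0, 1.0], dtype=np.float32)
    return warp, point[:2] / point[2]  # back from homogeneous coordinates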
@TRANSFORMS.register_module()
class YOLOXHSVRandomAug(BaseTransform):
    """Apply HSV augmentation to image sequentially. It is referenced from
    https://github.com/Megvii-BaseDetection/YOLOX/blob/main/yolox/data/data_augment.py#L21.

    Required Keys:

    - img

    Modified Keys:

    - img

    Args:
        hue_delta (int): delta of hue. Defaults to 5.
        saturation_delta (int): delta of saturation. Defaults to 30.
        value_delta (int): delta of value. Defaults to 30.
    """

    def __init__(self,
                 hue_delta: int = 5,
                 saturation_delta: int = 30,
                 value_delta: int = 30) -> None:
        self.hue_delta = hue_delta
        self.saturation_delta = saturation_delta
        self.value_delta = value_delta

    @cache_randomness
    def _get_hsv_gains(self):
        hsv_gains = np.random.uniform(-1, 1, 3) * [
            self.hue_delta, self.saturation_delta, self.value_delta
        ]
        # random selection of h, s, v
        hsv_gains *= np.random.randint(0, 2, 3)
        # prevent overflow
        hsv_gains = hsv_gains.astype(np.int16)
        return hsv_gains

    def transform(self, results: dict) -> dict:
        img = results['img']
        hsv_gains = self._get_hsv_gains()
        img_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV).astype(np.int16)

        img_hsv[..., 0] = (img_hsv[..., 0] + hsv_gains[0]) % 180
        img_hsv[..., 1] = np.clip(img_hsv[..., 1] + hsv_gains[1], 0, 255)
        img_hsv[..., 2] = np.clip(img_hsv[..., 2] + hsv_gains[2], 0, 255)
        cv2.cvtColor(img_hsv.astype(img.dtype), cv2.COLOR_HSV2BGR, dst=img)

        results['img'] = img
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f'(hue_delta={self.hue_delta}, '
        repr_str += f'saturation_delta={self.saturation_delta}, '
        repr_str += f'value_delta={self.value_delta})'
        return repr_str
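# --- Illustrative sketch (not part of the original module) ------------------
# The gain sampling and channel arithmetic of YOLOXHSVRandomAug on a single
# fake HSV pixel, using numpy's Generator API instead of the module-level
# np.random calls above; the deltas follow the class defaults.
def _demo_hsv_jitter(seed: int = 0):
    rng = np.random.default_rng(seed)
    deltas = np.array([5, 30, 30])  # hue, saturation, value deltas
    gains = rng.uniform(-1, 1, 3) * deltas
    gains *= rng.integers(0, 2, 3)  # randomly enable each channel
    gains = gains.astype(np.int16)  # int16 prevents uint8 overflow
    hsv = np.array([[[90, 128, 128]]], dtype=np.int16)
    hsv[..., 0] = (hsv[..., 0] + gains[0]) % 180  # OpenCV hue wraps at 180
    hsv[..., 1:] = np.clip(hsv[..., 1:] + gains[1:], 0, 255)
    return hsv.astype(np.uint8)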
""" dst_img = dst_results['img'] dst_bboxes = dst_results['gt_bboxes'] dst_labels = dst_results['gt_bboxes_labels'] dst_masks = dst_results['gt_masks'] dst_ignore_flags = dst_results['gt_ignore_flags'] src_img = src_results['img'] src_bboxes = src_results['gt_bboxes'] src_labels = src_results['gt_bboxes_labels'] src_masks = src_results['gt_masks'] src_ignore_flags = src_results['gt_ignore_flags'] if len(src_bboxes) == 0: return dst_results # update masks and generate bboxes from updated masks composed_mask = np.where(np.any(src_masks.masks, axis=0), 1, 0) updated_dst_masks = self._get_updated_masks(dst_masks, composed_mask) updated_dst_bboxes = updated_dst_masks.get_bboxes(type(dst_bboxes)) assert len(updated_dst_bboxes) == len(updated_dst_masks) # filter totally occluded objects l1_distance = (updated_dst_bboxes.tensor - dst_bboxes.tensor).abs() bboxes_inds = (l1_distance <= self.bbox_occluded_thr).all( dim=-1).numpy() masks_inds = updated_dst_masks.masks.sum( axis=(1, 2)) > self.mask_occluded_thr valid_inds = bboxes_inds | masks_inds # Paste source objects to destination image directly img = dst_img * (1 - composed_mask[..., np.newaxis] ) + src_img * composed_mask[..., np.newaxis] bboxes = src_bboxes.cat([updated_dst_bboxes[valid_inds], src_bboxes]) labels = np.concatenate([dst_labels[valid_inds], src_labels]) masks = np.concatenate( [updated_dst_masks.masks[valid_inds], src_masks.masks]) ignore_flags = np.concatenate( [dst_ignore_flags[valid_inds], src_ignore_flags]) dst_results['img'] = img dst_results['gt_bboxes'] = bboxes dst_results['gt_bboxes_labels'] = labels dst_results['gt_masks'] = BitmapMasks(masks, masks.shape[1], masks.shape[2]) dst_results['gt_ignore_flags'] = ignore_flags return dst_results def _get_updated_masks(self, masks: BitmapMasks, composed_mask: np.ndarray) -> BitmapMasks: """Update masks with composed mask.""" assert masks.masks.shape[-2:] == composed_mask.shape[-2:], \ 'Cannot compare two arrays of different size' masks.masks = np.where(composed_mask, 0, masks.masks) return masks def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(max_num_pasted={self.max_num_pasted}, ' repr_str += f'bbox_occluded_thr={self.bbox_occluded_thr}, ' repr_str += f'mask_occluded_thr={self.mask_occluded_thr}, ' repr_str += f'selected={self.selected})' return repr_str @TRANSFORMS.register_module() class RandomErasing(BaseTransform): """RandomErasing operation. Random Erasing randomly selects a rectangle region in an image and erases its pixels with random values. `RandomErasing `_. Required Keys: - img - gt_bboxes (HorizontalBoxes[torch.float32]) (optional) - gt_bboxes_labels (np.int64) (optional) - gt_ignore_flags (bool) (optional) - gt_masks (BitmapMasks) (optional) Modified Keys: - img - gt_bboxes (optional) - gt_bboxes_labels (optional) - gt_ignore_flags (optional) - gt_masks (optional) Args: n_patches (int or tuple[int, int]): Number of regions to be dropped. If it is given as a tuple, number of patches will be randomly selected from the closed interval [``n_patches[0]``, ``n_patches[1]``]. ratio (float or tuple[float, float]): The ratio of erased regions. It can be ``float`` to use a fixed ratio or ``tuple[float, float]`` to randomly choose ratio from the interval. squared (bool): Whether to erase square region. Defaults to True. bbox_erased_thr (float): The threshold for the maximum area proportion of the bbox to be erased. When the proportion of the area where the bbox is erased is greater than the threshold, the bbox will be removed. Defaults to 0.9. 
img_border_value (int or float or tuple): The filled values for image border. If float, the same fill value will be used for all the three channels of image. If tuple, it should be 3 elements. Defaults to 128. mask_border_value (int): The fill value used for masks. Defaults to 0. seg_ignore_label (int): The fill value used for segmentation map. Note this value must equals ``ignore_label`` in ``semantic_head`` of the corresponding config. Defaults to 255. """ def __init__( self, n_patches: Union[int, Tuple[int, int]], ratio: Union[float, Tuple[float, float]], squared: bool = True, bbox_erased_thr: float = 0.9, img_border_value: Union[int, float, tuple] = 128, mask_border_value: int = 0, seg_ignore_label: int = 255, ) -> None: if isinstance(n_patches, tuple): assert len(n_patches) == 2 and 0 <= n_patches[0] < n_patches[1] else: n_patches = (n_patches, n_patches) if isinstance(ratio, tuple): assert len(ratio) == 2 and 0 <= ratio[0] < ratio[1] <= 1 else: ratio = (ratio, ratio) self.n_patches = n_patches self.ratio = ratio self.squared = squared self.bbox_erased_thr = bbox_erased_thr self.img_border_value = img_border_value self.mask_border_value = mask_border_value self.seg_ignore_label = seg_ignore_label @cache_randomness def _get_patches(self, img_shape: Tuple[int, int]) -> List[list]: """Get patches for random erasing.""" patches = [] n_patches = np.random.randint(self.n_patches[0], self.n_patches[1] + 1) for _ in range(n_patches): if self.squared: ratio = np.random.random() * (self.ratio[1] - self.ratio[0]) + self.ratio[0] ratio = (ratio, ratio) else: ratio = (np.random.random() * (self.ratio[1] - self.ratio[0]) + self.ratio[0], np.random.random() * (self.ratio[1] - self.ratio[0]) + self.ratio[0]) ph, pw = int(img_shape[0] * ratio[0]), int(img_shape[1] * ratio[1]) px1, py1 = np.random.randint(0, img_shape[1] - pw), np.random.randint( 0, img_shape[0] - ph) px2, py2 = px1 + pw, py1 + ph patches.append([px1, py1, px2, py2]) return np.array(patches) def _transform_img(self, results: dict, patches: List[list]) -> None: """Random erasing the image.""" for patch in patches: px1, py1, px2, py2 = patch results['img'][py1:py2, px1:px2, :] = self.img_border_value def _transform_bboxes(self, results: dict, patches: List[list]) -> None: """Random erasing the bboxes.""" bboxes = results['gt_bboxes'] # TODO: unify the logic by using operators in BaseBoxes. 
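# --- Illustrative sketch (not part of the original method) ------------------
# The patch/bbox overlap arithmetic used below, on one toy box and two erased
# patches: per-patch intersections are summed (overlaps between patches would
# be double-counted, as in the transform itself) and compared against
# ``bbox_erased_thr``. All coordinate values are made up.
def _demo_bbox_erased_ratio():
    bboxes = np.array([[0., 0., 10., 10.]])  # one xyxy box, area 100
    patches = np.array([[5., 5., 20., 20.], [0., 0., 4., 4.]])
    left_top = np.maximum(bboxes[:, None, :2], patches[:, :2])
    right_bottom = np.minimum(bboxes[:, None, 2:], patches[:, 2:])
    wh = np.maximum(right_bottom - left_top, 0)
    inter_areas = wh[:, :, 0] * wh[:, :, 1]  # [[25., 16.]]
    bbox_areas = (bboxes[:, 2] - bboxes[:, 0]) * (bboxes[:, 3] - bboxes[:, 1])
    ratio = inter_areas.sum(-1) / (bbox_areas + 1e-7)  # -> [0.41]
    return ratio < 0.9  # keep mask under the default bbox_erased_thr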
assert isinstance(bboxes, HorizontalBoxes) bboxes = bboxes.numpy() left_top = np.maximum(bboxes[:, None, :2], patches[:, :2]) right_bottom = np.minimum(bboxes[:, None, 2:], patches[:, 2:]) wh = np.maximum(right_bottom - left_top, 0) inter_areas = wh[:, :, 0] * wh[:, :, 1] bbox_areas = (bboxes[:, 2] - bboxes[:, 0]) * ( bboxes[:, 3] - bboxes[:, 1]) bboxes_erased_ratio = inter_areas.sum(-1) / (bbox_areas + 1e-7) valid_inds = bboxes_erased_ratio < self.bbox_erased_thr results['gt_bboxes'] = HorizontalBoxes(bboxes[valid_inds]) results['gt_bboxes_labels'] = results['gt_bboxes_labels'][valid_inds] results['gt_ignore_flags'] = results['gt_ignore_flags'][valid_inds] if results.get('gt_masks', None) is not None: results['gt_masks'] = results['gt_masks'][valid_inds] def _transform_masks(self, results: dict, patches: List[list]) -> None: """Random erasing the masks.""" for patch in patches: px1, py1, px2, py2 = patch results['gt_masks'].masks[:, py1:py2, px1:px2] = self.mask_border_value def _transform_seg(self, results: dict, patches: List[list]) -> None: """Random erasing the segmentation map.""" for patch in patches: px1, py1, px2, py2 = patch results['gt_seg_map'][py1:py2, px1:px2] = self.seg_ignore_label @autocast_box_type() def transform(self, results: dict) -> dict: """Transform function to erase some regions of image.""" patches = self._get_patches(results['img_shape']) self._transform_img(results, patches) if results.get('gt_bboxes', None) is not None: self._transform_bboxes(results, patches) if results.get('gt_masks', None) is not None: self._transform_masks(results, patches) if results.get('gt_seg_map', None) is not None: self._transform_seg(results, patches) return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(n_patches={self.n_patches}, ' repr_str += f'ratio={self.ratio}, ' repr_str += f'squared={self.squared}, ' repr_str += f'bbox_erased_thr={self.bbox_erased_thr}, ' repr_str += f'img_border_value={self.img_border_value}, ' repr_str += f'mask_border_value={self.mask_border_value}, ' repr_str += f'seg_ignore_label={self.seg_ignore_label})' return repr_str @TRANSFORMS.register_module() class CachedMosaic(Mosaic): """Cached mosaic augmentation. Cached mosaic transform will random select images from the cache and combine them into one output image. .. code:: text mosaic transform center_x +------------------------------+ | pad | pad | | +-----------+ | | | | | | | image1 |--------+ | | | | | | | | | image2 | | center_y |----+-------------+-----------| | | cropped | | |pad | image3 | image4 | | | | | +----|-------------+-----------+ | | +-------------+ The cached mosaic transform steps are as follows: 1. Append the results from the last transform into the cache. 2. Choose the mosaic center as the intersections of 4 images 3. Get the left top image according to the index, and randomly sample another 3 images from the result cache. 4. Sub image will be cropped if image is larger than mosaic patch Required Keys: - img - gt_bboxes (np.float32) (optional) - gt_bboxes_labels (np.int64) (optional) - gt_ignore_flags (bool) (optional) Modified Keys: - img - img_shape - gt_bboxes (optional) - gt_bboxes_labels (optional) - gt_ignore_flags (optional) Args: img_scale (Sequence[int]): Image size after mosaic pipeline of single image. The shape order should be (width, height). Defaults to (640, 640). center_ratio_range (Sequence[float]): Center ratio range of mosaic output. Defaults to (0.5, 1.5). 
bbox_clip_border (bool, optional): Whether to clip the objects outside the border of the image. In some dataset like MOT17, the gt bboxes are allowed to cross the border of images. Therefore, we don't need to clip the gt bboxes in these cases. Defaults to True. pad_val (int): Pad value. Defaults to 114. prob (float): Probability of applying this transformation. Defaults to 1.0. max_cached_images (int): The maximum length of the cache. The larger the cache, the stronger the randomness of this transform. As a rule of thumb, providing 10 caches for each image suffices for randomness. Defaults to 40. random_pop (bool): Whether to randomly pop a result from the cache when the cache is full. If set to False, use FIFO popping method. Defaults to True. """ def __init__(self, *args, max_cached_images: int = 40, random_pop: bool = True, **kwargs) -> None: super().__init__(*args, **kwargs) self.results_cache = [] self.random_pop = random_pop assert max_cached_images >= 4, 'The length of cache must >= 4, ' \ f'but got {max_cached_images}.' self.max_cached_images = max_cached_images @cache_randomness def get_indexes(self, cache: list) -> list: """Call function to collect indexes. Args: cache (list): The results cache. Returns: list: indexes. """ indexes = [random.randint(0, len(cache) - 1) for _ in range(3)] return indexes @autocast_box_type() def transform(self, results: dict) -> dict: """Mosaic transform function. Args: results (dict): Result dict. Returns: dict: Updated result dict. """ # cache and pop images self.results_cache.append(copy.deepcopy(results)) if len(self.results_cache) > self.max_cached_images: if self.random_pop: index = random.randint(0, len(self.results_cache) - 1) else: index = 0 self.results_cache.pop(index) if len(self.results_cache) <= 4: return results if random.uniform(0, 1) > self.prob: return results indices = self.get_indexes(self.results_cache) mix_results = [copy.deepcopy(self.results_cache[i]) for i in indices] # TODO: refactor mosaic to reuse these code. 
mosaic_bboxes = [] mosaic_bboxes_labels = [] mosaic_ignore_flags = [] mosaic_masks = [] with_mask = True if 'gt_masks' in results else False if len(results['img'].shape) == 3: mosaic_img = np.full( (int(self.img_scale[1] * 2), int(self.img_scale[0] * 2), 3), self.pad_val, dtype=results['img'].dtype) else: mosaic_img = np.full( (int(self.img_scale[1] * 2), int(self.img_scale[0] * 2)), self.pad_val, dtype=results['img'].dtype) # mosaic center x, y center_x = int( random.uniform(*self.center_ratio_range) * self.img_scale[0]) center_y = int( random.uniform(*self.center_ratio_range) * self.img_scale[1]) center_position = (center_x, center_y) loc_strs = ('top_left', 'top_right', 'bottom_left', 'bottom_right') for i, loc in enumerate(loc_strs): if loc == 'top_left': results_patch = copy.deepcopy(results) else: results_patch = copy.deepcopy(mix_results[i - 1]) img_i = results_patch['img'] h_i, w_i = img_i.shape[:2] # keep_ratio resize scale_ratio_i = min(self.img_scale[1] / h_i, self.img_scale[0] / w_i) img_i = mmcv.imresize( img_i, (int(w_i * scale_ratio_i), int(h_i * scale_ratio_i))) # compute the combine parameters paste_coord, crop_coord = self._mosaic_combine( loc, center_position, img_i.shape[:2][::-1]) x1_p, y1_p, x2_p, y2_p = paste_coord x1_c, y1_c, x2_c, y2_c = crop_coord # crop and paste image mosaic_img[y1_p:y2_p, x1_p:x2_p] = img_i[y1_c:y2_c, x1_c:x2_c] # adjust coordinate gt_bboxes_i = results_patch['gt_bboxes'] gt_bboxes_labels_i = results_patch['gt_bboxes_labels'] gt_ignore_flags_i = results_patch['gt_ignore_flags'] padw = x1_p - x1_c padh = y1_p - y1_c gt_bboxes_i.rescale_([scale_ratio_i, scale_ratio_i]) gt_bboxes_i.translate_([padw, padh]) mosaic_bboxes.append(gt_bboxes_i) mosaic_bboxes_labels.append(gt_bboxes_labels_i) mosaic_ignore_flags.append(gt_ignore_flags_i) if with_mask and results_patch.get('gt_masks', None) is not None: gt_masks_i = results_patch['gt_masks'] gt_masks_i = gt_masks_i.rescale(float(scale_ratio_i)) gt_masks_i = gt_masks_i.translate( out_shape=(int(self.img_scale[0] * 2), int(self.img_scale[1] * 2)), offset=padw, direction='horizontal') gt_masks_i = gt_masks_i.translate( out_shape=(int(self.img_scale[0] * 2), int(self.img_scale[1] * 2)), offset=padh, direction='vertical') mosaic_masks.append(gt_masks_i) mosaic_bboxes = mosaic_bboxes[0].cat(mosaic_bboxes, 0) mosaic_bboxes_labels = np.concatenate(mosaic_bboxes_labels, 0) mosaic_ignore_flags = np.concatenate(mosaic_ignore_flags, 0) if self.bbox_clip_border: mosaic_bboxes.clip_([2 * self.img_scale[1], 2 * self.img_scale[0]]) # remove outside bboxes inside_inds = mosaic_bboxes.is_inside( [2 * self.img_scale[1], 2 * self.img_scale[0]]).numpy() mosaic_bboxes = mosaic_bboxes[inside_inds] mosaic_bboxes_labels = mosaic_bboxes_labels[inside_inds] mosaic_ignore_flags = mosaic_ignore_flags[inside_inds] results['img'] = mosaic_img results['img_shape'] = mosaic_img.shape results['gt_bboxes'] = mosaic_bboxes results['gt_bboxes_labels'] = mosaic_bboxes_labels results['gt_ignore_flags'] = mosaic_ignore_flags if with_mask: mosaic_masks = mosaic_masks[0].cat(mosaic_masks) results['gt_masks'] = mosaic_masks[inside_inds] return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(img_scale={self.img_scale}, ' repr_str += f'center_ratio_range={self.center_ratio_range}, ' repr_str += f'pad_val={self.pad_val}, ' repr_str += f'prob={self.prob}, ' repr_str += f'max_cached_images={self.max_cached_images}, ' repr_str += f'random_pop={self.random_pop})' return repr_str @TRANSFORMS.register_module() class 
CachedMixUp(BaseTransform): """Cached mixup data augmentation. .. code:: text mixup transform +------------------------------+ | mixup image | | | +--------|--------+ | | | | | | |---------------+ | | | | | | | | image | | | | | | | | | | | |-----------------+ | | pad | +------------------------------+ The cached mixup transform steps are as follows: 1. Append the results from the last transform into the cache. 2. Another random image is picked from the cache and embedded in the top left patch(after padding and resizing) 3. The target of mixup transform is the weighted average of mixup image and origin image. Required Keys: - img - gt_bboxes (np.float32) (optional) - gt_bboxes_labels (np.int64) (optional) - gt_ignore_flags (bool) (optional) - mix_results (List[dict]) Modified Keys: - img - img_shape - gt_bboxes (optional) - gt_bboxes_labels (optional) - gt_ignore_flags (optional) Args: img_scale (Sequence[int]): Image output size after mixup pipeline. The shape order should be (width, height). Defaults to (640, 640). ratio_range (Sequence[float]): Scale ratio of mixup image. Defaults to (0.5, 1.5). flip_ratio (float): Horizontal flip ratio of mixup image. Defaults to 0.5. pad_val (int): Pad value. Defaults to 114. max_iters (int): The maximum number of iterations. If the number of iterations is greater than `max_iters`, but gt_bbox is still empty, then the iteration is terminated. Defaults to 15. bbox_clip_border (bool, optional): Whether to clip the objects outside the border of the image. In some dataset like MOT17, the gt bboxes are allowed to cross the border of images. Therefore, we don't need to clip the gt bboxes in these cases. Defaults to True. max_cached_images (int): The maximum length of the cache. The larger the cache, the stronger the randomness of this transform. As a rule of thumb, providing 10 caches for each image suffices for randomness. Defaults to 20. random_pop (bool): Whether to randomly pop a result from the cache when the cache is full. If set to False, use FIFO popping method. Defaults to True. prob (float): Probability of applying this transformation. Defaults to 1.0. """ def __init__(self, img_scale: Tuple[int, int] = (640, 640), ratio_range: Tuple[float, float] = (0.5, 1.5), flip_ratio: float = 0.5, pad_val: float = 114.0, max_iters: int = 15, bbox_clip_border: bool = True, max_cached_images: int = 20, random_pop: bool = True, prob: float = 1.0) -> None: assert isinstance(img_scale, tuple) assert max_cached_images >= 2, 'The length of cache must >= 2, ' \ f'but got {max_cached_images}.' assert 0 <= prob <= 1.0, 'The probability should be in range [0,1]. ' \ f'got {prob}.' self.dynamic_scale = img_scale self.ratio_range = ratio_range self.flip_ratio = flip_ratio self.pad_val = pad_val self.max_iters = max_iters self.bbox_clip_border = bbox_clip_border self.results_cache = [] self.max_cached_images = max_cached_images self.random_pop = random_pop self.prob = prob @cache_randomness def get_indexes(self, cache: list) -> int: """Call function to collect indexes. Args: cache (list): The result cache. Returns: int: index. """ for i in range(self.max_iters): index = random.randint(0, len(cache) - 1) gt_bboxes_i = cache[index]['gt_bboxes'] if len(gt_bboxes_i) != 0: break return index @autocast_box_type() def transform(self, results: dict) -> dict: """MixUp transform function. Args: results (dict): Result dict. Returns: dict: Updated result dict. 
""" # cache and pop images self.results_cache.append(copy.deepcopy(results)) if len(self.results_cache) > self.max_cached_images: if self.random_pop: index = random.randint(0, len(self.results_cache) - 1) else: index = 0 self.results_cache.pop(index) if len(self.results_cache) <= 1: return results if random.uniform(0, 1) > self.prob: return results index = self.get_indexes(self.results_cache) retrieve_results = copy.deepcopy(self.results_cache[index]) # TODO: refactor mixup to reuse these code. if retrieve_results['gt_bboxes'].shape[0] == 0: # empty bbox return results retrieve_img = retrieve_results['img'] with_mask = True if 'gt_masks' in results else False jit_factor = random.uniform(*self.ratio_range) is_filp = random.uniform(0, 1) > self.flip_ratio if len(retrieve_img.shape) == 3: out_img = np.ones( (self.dynamic_scale[1], self.dynamic_scale[0], 3), dtype=retrieve_img.dtype) * self.pad_val else: out_img = np.ones( self.dynamic_scale[::-1], dtype=retrieve_img.dtype) * self.pad_val # 1. keep_ratio resize scale_ratio = min(self.dynamic_scale[1] / retrieve_img.shape[0], self.dynamic_scale[0] / retrieve_img.shape[1]) retrieve_img = mmcv.imresize( retrieve_img, (int(retrieve_img.shape[1] * scale_ratio), int(retrieve_img.shape[0] * scale_ratio))) # 2. paste out_img[:retrieve_img.shape[0], :retrieve_img.shape[1]] = retrieve_img # 3. scale jit scale_ratio *= jit_factor out_img = mmcv.imresize(out_img, (int(out_img.shape[1] * jit_factor), int(out_img.shape[0] * jit_factor))) # 4. flip if is_filp: out_img = out_img[:, ::-1, :] # 5. random crop ori_img = results['img'] origin_h, origin_w = out_img.shape[:2] target_h, target_w = ori_img.shape[:2] padded_img = np.ones((max(origin_h, target_h), max( origin_w, target_w), 3)) * self.pad_val padded_img = padded_img.astype(np.uint8) padded_img[:origin_h, :origin_w] = out_img x_offset, y_offset = 0, 0 if padded_img.shape[0] > target_h: y_offset = random.randint(0, padded_img.shape[0] - target_h) if padded_img.shape[1] > target_w: x_offset = random.randint(0, padded_img.shape[1] - target_w) padded_cropped_img = padded_img[y_offset:y_offset + target_h, x_offset:x_offset + target_w] # 6. adjust bbox retrieve_gt_bboxes = retrieve_results['gt_bboxes'] retrieve_gt_bboxes.rescale_([scale_ratio, scale_ratio]) if with_mask: retrieve_gt_masks = retrieve_results['gt_masks'].rescale( scale_ratio) if self.bbox_clip_border: retrieve_gt_bboxes.clip_([origin_h, origin_w]) if is_filp: retrieve_gt_bboxes.flip_([origin_h, origin_w], direction='horizontal') if with_mask: retrieve_gt_masks = retrieve_gt_masks.flip() # 7. filter cp_retrieve_gt_bboxes = retrieve_gt_bboxes.clone() cp_retrieve_gt_bboxes.translate_([-x_offset, -y_offset]) if with_mask: retrieve_gt_masks = retrieve_gt_masks.translate( out_shape=(target_h, target_w), offset=-x_offset, direction='horizontal') retrieve_gt_masks = retrieve_gt_masks.translate( out_shape=(target_h, target_w), offset=-y_offset, direction='vertical') if self.bbox_clip_border: cp_retrieve_gt_bboxes.clip_([target_h, target_w]) # 8. 
    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f'(dynamic_scale={self.dynamic_scale}, '
        repr_str += f'ratio_range={self.ratio_range}, '
        repr_str += f'flip_ratio={self.flip_ratio}, '
        repr_str += f'pad_val={self.pad_val}, '
        repr_str += f'max_iters={self.max_iters}, '
        repr_str += f'bbox_clip_border={self.bbox_clip_border}, '
        repr_str += f'max_cached_images={self.max_cached_images}, '
        repr_str += f'random_pop={self.random_pop}, '
        repr_str += f'prob={self.prob})'
        return repr_str
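# --- Illustrative sketch (not part of the original module) ------------------
# How several of the transforms above are typically chained in a training
# pipeline config. The exact types and values resemble YOLOX/RTMDet-style
# configs but are illustrative, not taken from a shipped config file.
def _demo_train_pipeline_cfg():
    return [
        dict(type='LoadImageFromFile'),
        dict(type='LoadAnnotations', with_bbox=True),
        dict(type='CachedMosaic', img_scale=(640, 640), pad_val=114.0),
        dict(type='RandomAffine', scaling_ratio_range=(0.5, 1.5)),
        dict(type='YOLOXHSVRandomAug'),
        dict(type='RandomFlip', prob=0.5),
        dict(type='CachedMixUp', img_scale=(640, 640), ratio_range=(1.0, 1.0)),
        dict(type='PackDetInputs'),
    ]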