#!/usr/bin/env python3 import math import numpy as np import random import torch import torchvision.transforms.functional as F from PIL import Image from torchvision import transforms from .rand_augment import rand_augment_transform from .random_erasing import RandomErasing import numbers import PIL import torchvision import vbench.third_party.umt.functional as FF _pil_interpolation_to_str = { Image.NEAREST: "PIL.Image.NEAREST", Image.BILINEAR: "PIL.Image.BILINEAR", Image.BICUBIC: "PIL.Image.BICUBIC", Image.LANCZOS: "PIL.Image.LANCZOS", Image.HAMMING: "PIL.Image.HAMMING", Image.BOX: "PIL.Image.BOX", } _RANDOM_INTERPOLATION = (Image.BILINEAR, Image.BICUBIC) def _pil_interp(method): if method == "bicubic": return Image.BICUBIC elif method == "lanczos": return Image.LANCZOS elif method == "hamming": return Image.HAMMING else: return Image.BILINEAR def random_short_side_scale_jitter( images, min_size, max_size, boxes=None, inverse_uniform_sampling=False ): """ Perform a spatial short scale jittering on the given images and corresponding boxes. Args: images (tensor): images to perform scale jitter. Dimension is `num frames` x `channel` x `height` x `width`. min_size (int): the minimal size to scale the frames. max_size (int): the maximal size to scale the frames. boxes (ndarray): optional. Corresponding boxes to images. Dimension is `num boxes` x 4. inverse_uniform_sampling (bool): if True, sample uniformly in [1 / max_scale, 1 / min_scale] and take a reciprocal to get the scale. If False, take a uniform sample from [min_scale, max_scale]. Returns: (tensor): the scaled images with dimension of `num frames` x `channel` x `new height` x `new width`. (ndarray or None): the scaled boxes with dimension of `num boxes` x 4. """ if inverse_uniform_sampling: size = int( round(1.0 / np.random.uniform(1.0 / max_size, 1.0 / min_size)) ) else: size = int(round(np.random.uniform(min_size, max_size))) height = images.shape[2] width = images.shape[3] if (width <= height and width == size) or ( height <= width and height == size ): return images, boxes new_width = size new_height = size if width < height: new_height = int(math.floor((float(height) / width) * size)) if boxes is not None: boxes = boxes * float(new_height) / height else: new_width = int(math.floor((float(width) / height) * size)) if boxes is not None: boxes = boxes * float(new_width) / width return ( torch.nn.functional.interpolate( images, size=(new_height, new_width), mode="bilinear", align_corners=False, ), boxes, ) def crop_boxes(boxes, x_offset, y_offset): """ Peform crop on the bounding boxes given the offsets. Args: boxes (ndarray or None): bounding boxes to peform crop. The dimension is `num boxes` x 4. x_offset (int): cropping offset in the x axis. y_offset (int): cropping offset in the y axis. Returns: cropped_boxes (ndarray or None): the cropped boxes with dimension of `num boxes` x 4. """ cropped_boxes = boxes.copy() cropped_boxes[:, [0, 2]] = boxes[:, [0, 2]] - x_offset cropped_boxes[:, [1, 3]] = boxes[:, [1, 3]] - y_offset return cropped_boxes def random_crop(images, size, boxes=None): """ Perform random spatial crop on the given images and corresponding boxes. Args: images (tensor): images to perform random crop. The dimension is `num frames` x `channel` x `height` x `width`. size (int): the size of height and width to crop on the image. boxes (ndarray or None): optional. Corresponding boxes to images. Dimension is `num boxes` x 4. Returns: cropped (tensor): cropped images with dimension of `num frames` x `channel` x `size` x `size`. cropped_boxes (ndarray or None): the cropped boxes with dimension of `num boxes` x 4. """ if images.shape[2] == size and images.shape[3] == size: return images height = images.shape[2] width = images.shape[3] y_offset = 0 if height > size: y_offset = int(np.random.randint(0, height - size)) x_offset = 0 if width > size: x_offset = int(np.random.randint(0, width - size)) cropped = images[ :, :, y_offset : y_offset + size, x_offset : x_offset + size ] cropped_boxes = ( crop_boxes(boxes, x_offset, y_offset) if boxes is not None else None ) return cropped, cropped_boxes def horizontal_flip(prob, images, boxes=None): """ Perform horizontal flip on the given images and corresponding boxes. Args: prob (float): probility to flip the images. images (tensor): images to perform horizontal flip, the dimension is `num frames` x `channel` x `height` x `width`. boxes (ndarray or None): optional. Corresponding boxes to images. Dimension is `num boxes` x 4. Returns: images (tensor): images with dimension of `num frames` x `channel` x `height` x `width`. flipped_boxes (ndarray or None): the flipped boxes with dimension of `num boxes` x 4. """ if boxes is None: flipped_boxes = None else: flipped_boxes = boxes.copy() if np.random.uniform() < prob: images = images.flip((-1)) if len(images.shape) == 3: width = images.shape[2] elif len(images.shape) == 4: width = images.shape[3] else: raise NotImplementedError("Dimension does not supported") if boxes is not None: flipped_boxes[:, [0, 2]] = width - boxes[:, [2, 0]] - 1 return images, flipped_boxes def uniform_crop(images, size, spatial_idx, boxes=None, scale_size=None): """ Perform uniform spatial sampling on the images and corresponding boxes. Args: images (tensor): images to perform uniform crop. The dimension is `num frames` x `channel` x `height` x `width`. size (int): size of height and weight to crop the images. spatial_idx (int): 0, 1, or 2 for left, center, and right crop if width is larger than height. Or 0, 1, or 2 for top, center, and bottom crop if height is larger than width. boxes (ndarray or None): optional. Corresponding boxes to images. Dimension is `num boxes` x 4. scale_size (int): optinal. If not None, resize the images to scale_size before performing any crop. Returns: cropped (tensor): images with dimension of `num frames` x `channel` x `size` x `size`. cropped_boxes (ndarray or None): the cropped boxes with dimension of `num boxes` x 4. """ assert spatial_idx in [0, 1, 2] ndim = len(images.shape) if ndim == 3: images = images.unsqueeze(0) height = images.shape[2] width = images.shape[3] if scale_size is not None: if width <= height: width, height = scale_size, int(height / width * scale_size) else: width, height = int(width / height * scale_size), scale_size images = torch.nn.functional.interpolate( images, size=(height, width), mode="bilinear", align_corners=False, ) y_offset = int(math.ceil((height - size) / 2)) x_offset = int(math.ceil((width - size) / 2)) if height > width: if spatial_idx == 0: y_offset = 0 elif spatial_idx == 2: y_offset = height - size else: if spatial_idx == 0: x_offset = 0 elif spatial_idx == 2: x_offset = width - size cropped = images[ :, :, y_offset : y_offset + size, x_offset : x_offset + size ] cropped_boxes = ( crop_boxes(boxes, x_offset, y_offset) if boxes is not None else None ) if ndim == 3: cropped = cropped.squeeze(0) return cropped, cropped_boxes def clip_boxes_to_image(boxes, height, width): """ Clip an array of boxes to an image with the given height and width. Args: boxes (ndarray): bounding boxes to perform clipping. Dimension is `num boxes` x 4. height (int): given image height. width (int): given image width. Returns: clipped_boxes (ndarray): the clipped boxes with dimension of `num boxes` x 4. """ clipped_boxes = boxes.copy() clipped_boxes[:, [0, 2]] = np.minimum( width - 1.0, np.maximum(0.0, boxes[:, [0, 2]]) ) clipped_boxes[:, [1, 3]] = np.minimum( height - 1.0, np.maximum(0.0, boxes[:, [1, 3]]) ) return clipped_boxes def blend(images1, images2, alpha): """ Blend two images with a given weight alpha. Args: images1 (tensor): the first images to be blended, the dimension is `num frames` x `channel` x `height` x `width`. images2 (tensor): the second images to be blended, the dimension is `num frames` x `channel` x `height` x `width`. alpha (float): the blending weight. Returns: (tensor): blended images, the dimension is `num frames` x `channel` x `height` x `width`. """ return images1 * alpha + images2 * (1 - alpha) def grayscale(images): """ Get the grayscale for the input images. The channels of images should be in order BGR. Args: images (tensor): the input images for getting grayscale. Dimension is `num frames` x `channel` x `height` x `width`. Returns: img_gray (tensor): blended images, the dimension is `num frames` x `channel` x `height` x `width`. """ # R -> 0.299, G -> 0.587, B -> 0.114. img_gray = torch.tensor(images) gray_channel = ( 0.299 * images[:, 2] + 0.587 * images[:, 1] + 0.114 * images[:, 0] ) img_gray[:, 0] = gray_channel img_gray[:, 1] = gray_channel img_gray[:, 2] = gray_channel return img_gray def color_jitter(images, img_brightness=0, img_contrast=0, img_saturation=0): """ Perfrom a color jittering on the input images. The channels of images should be in order BGR. Args: images (tensor): images to perform color jitter. Dimension is `num frames` x `channel` x `height` x `width`. img_brightness (float): jitter ratio for brightness. img_contrast (float): jitter ratio for contrast. img_saturation (float): jitter ratio for saturation. Returns: images (tensor): the jittered images, the dimension is `num frames` x `channel` x `height` x `width`. """ jitter = [] if img_brightness != 0: jitter.append("brightness") if img_contrast != 0: jitter.append("contrast") if img_saturation != 0: jitter.append("saturation") if len(jitter) > 0: order = np.random.permutation(np.arange(len(jitter))) for idx in range(0, len(jitter)): if jitter[order[idx]] == "brightness": images = brightness_jitter(img_brightness, images) elif jitter[order[idx]] == "contrast": images = contrast_jitter(img_contrast, images) elif jitter[order[idx]] == "saturation": images = saturation_jitter(img_saturation, images) return images def brightness_jitter(var, images): """ Perfrom brightness jittering on the input images. The channels of images should be in order BGR. Args: var (float): jitter ratio for brightness. images (tensor): images to perform color jitter. Dimension is `num frames` x `channel` x `height` x `width`. Returns: images (tensor): the jittered images, the dimension is `num frames` x `channel` x `height` x `width`. """ alpha = 1.0 + np.random.uniform(-var, var) img_bright = torch.zeros(images.shape) images = blend(images, img_bright, alpha) return images def contrast_jitter(var, images): """ Perfrom contrast jittering on the input images. The channels of images should be in order BGR. Args: var (float): jitter ratio for contrast. images (tensor): images to perform color jitter. Dimension is `num frames` x `channel` x `height` x `width`. Returns: images (tensor): the jittered images, the dimension is `num frames` x `channel` x `height` x `width`. """ alpha = 1.0 + np.random.uniform(-var, var) img_gray = grayscale(images) img_gray[:] = torch.mean(img_gray, dim=(1, 2, 3), keepdim=True) images = blend(images, img_gray, alpha) return images def saturation_jitter(var, images): """ Perfrom saturation jittering on the input images. The channels of images should be in order BGR. Args: var (float): jitter ratio for saturation. images (tensor): images to perform color jitter. Dimension is `num frames` x `channel` x `height` x `width`. Returns: images (tensor): the jittered images, the dimension is `num frames` x `channel` x `height` x `width`. """ alpha = 1.0 + np.random.uniform(-var, var) img_gray = grayscale(images) images = blend(images, img_gray, alpha) return images def lighting_jitter(images, alphastd, eigval, eigvec): """ Perform AlexNet-style PCA jitter on the given images. Args: images (tensor): images to perform lighting jitter. Dimension is `num frames` x `channel` x `height` x `width`. alphastd (float): jitter ratio for PCA jitter. eigval (list): eigenvalues for PCA jitter. eigvec (list[list]): eigenvectors for PCA jitter. Returns: out_images (tensor): the jittered images, the dimension is `num frames` x `channel` x `height` x `width`. """ if alphastd == 0: return images # generate alpha1, alpha2, alpha3. alpha = np.random.normal(0, alphastd, size=(1, 3)) eig_vec = np.array(eigvec) eig_val = np.reshape(eigval, (1, 3)) rgb = np.sum( eig_vec * np.repeat(alpha, 3, axis=0) * np.repeat(eig_val, 3, axis=0), axis=1, ) out_images = torch.zeros_like(images) if len(images.shape) == 3: # C H W channel_dim = 0 elif len(images.shape) == 4: # T C H W channel_dim = 1 else: raise NotImplementedError(f"Unsupported dimension {len(images.shape)}") for idx in range(images.shape[channel_dim]): # C H W if len(images.shape) == 3: out_images[idx] = images[idx] + rgb[2 - idx] # T C H W elif len(images.shape) == 4: out_images[:, idx] = images[:, idx] + rgb[2 - idx] else: raise NotImplementedError( f"Unsupported dimension {len(images.shape)}" ) return out_images def color_normalization(images, mean, stddev): """ Perform color nomration on the given images. Args: images (tensor): images to perform color normalization. Dimension is `num frames` x `channel` x `height` x `width`. mean (list): mean values for normalization. stddev (list): standard deviations for normalization. Returns: out_images (tensor): the noramlized images, the dimension is `num frames` x `channel` x `height` x `width`. """ if len(images.shape) == 3: assert ( len(mean) == images.shape[0] ), "channel mean not computed properly" assert ( len(stddev) == images.shape[0] ), "channel stddev not computed properly" elif len(images.shape) == 4: assert ( len(mean) == images.shape[1] ), "channel mean not computed properly" assert ( len(stddev) == images.shape[1] ), "channel stddev not computed properly" else: raise NotImplementedError(f"Unsupported dimension {len(images.shape)}") out_images = torch.zeros_like(images) for idx in range(len(mean)): # C H W if len(images.shape) == 3: out_images[idx] = (images[idx] - mean[idx]) / stddev[idx] elif len(images.shape) == 4: out_images[:, idx] = (images[:, idx] - mean[idx]) / stddev[idx] else: raise NotImplementedError( f"Unsupported dimension {len(images.shape)}" ) return out_images def _get_param_spatial_crop( scale, ratio, height, width, num_repeat=10, log_scale=True, switch_hw=False ): """ Given scale, ratio, height and width, return sampled coordinates of the videos. """ for _ in range(num_repeat): area = height * width target_area = random.uniform(*scale) * area if log_scale: log_ratio = (math.log(ratio[0]), math.log(ratio[1])) aspect_ratio = math.exp(random.uniform(*log_ratio)) else: aspect_ratio = random.uniform(*ratio) w = int(round(math.sqrt(target_area * aspect_ratio))) h = int(round(math.sqrt(target_area / aspect_ratio))) if np.random.uniform() < 0.5 and switch_hw: w, h = h, w if 0 < w <= width and 0 < h <= height: i = random.randint(0, height - h) j = random.randint(0, width - w) return i, j, h, w # Fallback to central crop in_ratio = float(width) / float(height) if in_ratio < min(ratio): w = width h = int(round(w / min(ratio))) elif in_ratio > max(ratio): h = height w = int(round(h * max(ratio))) else: # whole image w = width h = height i = (height - h) // 2 j = (width - w) // 2 return i, j, h, w def random_resized_crop( images, target_height, target_width, scale=(0.8, 1.0), ratio=(3.0 / 4.0, 4.0 / 3.0), ): """ Crop the given images to random size and aspect ratio. A crop of random size (default: of 0.08 to 1.0) of the original size and a random aspect ratio (default: of 3/4 to 4/3) of the original aspect ratio is made. This crop is finally resized to given size. This is popularly used to train the Inception networks. Args: images: Images to perform resizing and cropping. target_height: Desired height after cropping. target_width: Desired width after cropping. scale: Scale range of Inception-style area based random resizing. ratio: Aspect ratio range of Inception-style area based random resizing. """ height = images.shape[2] width = images.shape[3] i, j, h, w = _get_param_spatial_crop(scale, ratio, height, width) cropped = images[:, :, i : i + h, j : j + w] return torch.nn.functional.interpolate( cropped, size=(target_height, target_width), mode="bilinear", align_corners=False, ) def random_resized_crop_with_shift( images, target_height, target_width, scale=(0.8, 1.0), ratio=(3.0 / 4.0, 4.0 / 3.0), ): """ This is similar to random_resized_crop. However, it samples two different boxes (for cropping) for the first and last frame. It then linearly interpolates the two boxes for other frames. Args: images: Images to perform resizing and cropping. target_height: Desired height after cropping. target_width: Desired width after cropping. scale: Scale range of Inception-style area based random resizing. ratio: Aspect ratio range of Inception-style area based random resizing. """ t = images.shape[1] height = images.shape[2] width = images.shape[3] i, j, h, w = _get_param_spatial_crop(scale, ratio, height, width) i_, j_, h_, w_ = _get_param_spatial_crop(scale, ratio, height, width) i_s = [int(i) for i in torch.linspace(i, i_, steps=t).tolist()] j_s = [int(i) for i in torch.linspace(j, j_, steps=t).tolist()] h_s = [int(i) for i in torch.linspace(h, h_, steps=t).tolist()] w_s = [int(i) for i in torch.linspace(w, w_, steps=t).tolist()] out = torch.zeros((3, t, target_height, target_width)) for ind in range(t): out[:, ind : ind + 1, :, :] = torch.nn.functional.interpolate( images[ :, ind : ind + 1, i_s[ind] : i_s[ind] + h_s[ind], j_s[ind] : j_s[ind] + w_s[ind], ], size=(target_height, target_width), mode="bilinear", align_corners=False, ) return out def create_random_augment( input_size, auto_augment=None, interpolation="bilinear", ): """ Get video randaug transform. Args: input_size: The size of the input video in tuple. auto_augment: Parameters for randaug. An example: "rand-m7-n4-mstd0.5-inc1" (m is the magnitude and n is the number of operations to apply). interpolation: Interpolation method. """ if isinstance(input_size, tuple): img_size = input_size[-2:] else: img_size = input_size if auto_augment: assert isinstance(auto_augment, str) if isinstance(img_size, tuple): img_size_min = min(img_size) else: img_size_min = img_size aa_params = {"translate_const": int(img_size_min * 0.45)} if interpolation and interpolation != "random": aa_params["interpolation"] = _pil_interp(interpolation) if auto_augment.startswith("rand"): return transforms.Compose( [rand_augment_transform(auto_augment, aa_params)] ) raise NotImplementedError def random_sized_crop_img( im, size, jitter_scale=(0.08, 1.0), jitter_aspect=(3.0 / 4.0, 4.0 / 3.0), max_iter=10, ): """ Performs Inception-style cropping (used for training). """ assert ( len(im.shape) == 3 ), "Currently only support image for random_sized_crop" h, w = im.shape[1:3] i, j, h, w = _get_param_spatial_crop( scale=jitter_scale, ratio=jitter_aspect, height=h, width=w, num_repeat=max_iter, log_scale=False, switch_hw=True, ) cropped = im[:, i : i + h, j : j + w] return torch.nn.functional.interpolate( cropped.unsqueeze(0), size=(size, size), mode="bilinear", align_corners=False, ).squeeze(0) # The following code are modified based on timm lib, we will replace the following # contents with dependency from PyTorchVideo. # https://github.com/facebookresearch/pytorchvideo class RandomResizedCropAndInterpolation: """Crop the given PIL Image to random size and aspect ratio with random interpolation. A crop of random size (default: of 0.08 to 1.0) of the original size and a random aspect ratio (default: of 3/4 to 4/3) of the original aspect ratio is made. This crop is finally resized to given size. This is popularly used to train the Inception networks. Args: size: expected output size of each edge scale: range of size of the origin size cropped ratio: range of aspect ratio of the origin aspect ratio cropped interpolation: Default: PIL.Image.BILINEAR """ def __init__( self, size, scale=(0.08, 1.0), ratio=(3.0 / 4.0, 4.0 / 3.0), interpolation="bilinear", ): if isinstance(size, tuple): self.size = size else: self.size = (size, size) if (scale[0] > scale[1]) or (ratio[0] > ratio[1]): print("range should be of kind (min, max)") if interpolation == "random": self.interpolation = _RANDOM_INTERPOLATION else: self.interpolation = _pil_interp(interpolation) self.scale = scale self.ratio = ratio @staticmethod def get_params(img, scale, ratio): """Get parameters for ``crop`` for a random sized crop. Args: img (PIL Image): Image to be cropped. scale (tuple): range of size of the origin size cropped ratio (tuple): range of aspect ratio of the origin aspect ratio cropped Returns: tuple: params (i, j, h, w) to be passed to ``crop`` for a random sized crop. """ area = img.size[0] * img.size[1] for _ in range(10): target_area = random.uniform(*scale) * area log_ratio = (math.log(ratio[0]), math.log(ratio[1])) aspect_ratio = math.exp(random.uniform(*log_ratio)) w = int(round(math.sqrt(target_area * aspect_ratio))) h = int(round(math.sqrt(target_area / aspect_ratio))) if w <= img.size[0] and h <= img.size[1]: i = random.randint(0, img.size[1] - h) j = random.randint(0, img.size[0] - w) return i, j, h, w # Fallback to central crop in_ratio = img.size[0] / img.size[1] if in_ratio < min(ratio): w = img.size[0] h = int(round(w / min(ratio))) elif in_ratio > max(ratio): h = img.size[1] w = int(round(h * max(ratio))) else: # whole image w = img.size[0] h = img.size[1] i = (img.size[1] - h) // 2 j = (img.size[0] - w) // 2 return i, j, h, w def __call__(self, img): """ Args: img (PIL Image): Image to be cropped and resized. Returns: PIL Image: Randomly cropped and resized image. """ i, j, h, w = self.get_params(img, self.scale, self.ratio) if isinstance(self.interpolation, (tuple, list)): interpolation = random.choice(self.interpolation) else: interpolation = self.interpolation return F.resized_crop(img, i, j, h, w, self.size, interpolation) def __repr__(self): if isinstance(self.interpolation, (tuple, list)): interpolate_str = " ".join( [_pil_interpolation_to_str[x] for x in self.interpolation] ) else: interpolate_str = _pil_interpolation_to_str[self.interpolation] format_string = self.__class__.__name__ + "(size={0}".format(self.size) format_string += ", scale={0}".format( tuple(round(s, 4) for s in self.scale) ) format_string += ", ratio={0}".format( tuple(round(r, 4) for r in self.ratio) ) format_string += ", interpolation={0})".format(interpolate_str) return format_string def transforms_imagenet_train( img_size=224, scale=None, ratio=None, hflip=0.5, vflip=0.0, color_jitter=0.4, auto_augment=None, interpolation="random", use_prefetcher=False, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), re_prob=0.0, re_mode="const", re_count=1, re_num_splits=0, separate=False, ): """ If separate==True, the transforms are returned as a tuple of 3 separate transforms for use in a mixing dataset that passes * all data through the first (primary) transform, called the 'clean' data * a portion of the data through the secondary transform * normalizes and converts the branches above with the third, final transform """ if isinstance(img_size, tuple): img_size = img_size[-2:] else: img_size = img_size scale = tuple(scale or (0.08, 1.0)) # default imagenet scale range ratio = tuple( ratio or (3.0 / 4.0, 4.0 / 3.0) ) # default imagenet ratio range primary_tfl = [ RandomResizedCropAndInterpolation( img_size, scale=scale, ratio=ratio, interpolation=interpolation ) ] if hflip > 0.0: primary_tfl += [transforms.RandomHorizontalFlip(p=hflip)] if vflip > 0.0: primary_tfl += [transforms.RandomVerticalFlip(p=vflip)] secondary_tfl = [] if auto_augment: assert isinstance(auto_augment, str) if isinstance(img_size, tuple): img_size_min = min(img_size) else: img_size_min = img_size aa_params = dict( translate_const=int(img_size_min * 0.45), img_mean=tuple([min(255, round(255 * x)) for x in mean]), ) if interpolation and interpolation != "random": aa_params["interpolation"] = _pil_interp(interpolation) if auto_augment.startswith("rand"): secondary_tfl += [rand_augment_transform(auto_augment, aa_params)] elif auto_augment.startswith("augmix"): raise NotImplementedError("Augmix not implemented") else: raise NotImplementedError("Auto aug not implemented") elif color_jitter is not None: # color jitter is enabled when not using AA if isinstance(color_jitter, (list, tuple)): # color jitter should be a 3-tuple/list if spec brightness/contrast/saturation # or 4 if also augmenting hue assert len(color_jitter) in (3, 4) else: # if it's a scalar, duplicate for brightness, contrast, and saturation, no hue color_jitter = (float(color_jitter),) * 3 secondary_tfl += [transforms.ColorJitter(*color_jitter)] final_tfl = [] final_tfl += [ transforms.ToTensor(), transforms.Normalize(mean=torch.tensor(mean), std=torch.tensor(std)), ] if re_prob > 0.0: final_tfl.append( RandomErasing( re_prob, mode=re_mode, max_count=re_count, num_splits=re_num_splits, device="cpu", cube=False, ) ) if separate: return ( transforms.Compose(primary_tfl), transforms.Compose(secondary_tfl), transforms.Compose(final_tfl), ) else: return transforms.Compose(primary_tfl + secondary_tfl + final_tfl) ############################################################################################################ ############################################################################################################ class Compose(object): """Composes several transforms Args: transforms (list of ``Transform`` objects): list of transforms to compose """ def __init__(self, transforms): self.transforms = transforms def __call__(self, clip): for t in self.transforms: clip = t(clip) return clip class RandomHorizontalFlip(object): """Horizontally flip the list of given images randomly with a probability 0.5 """ def __call__(self, clip): """ Args: img (PIL.Image or numpy.ndarray): List of images to be cropped in format (h, w, c) in numpy.ndarray Returns: PIL.Image or numpy.ndarray: Randomly flipped clip """ if random.random() < 0.5: if isinstance(clip[0], np.ndarray): return [np.fliplr(img) for img in clip] elif isinstance(clip[0], PIL.Image.Image): return [ img.transpose(PIL.Image.FLIP_LEFT_RIGHT) for img in clip ] else: raise TypeError('Expected numpy.ndarray or PIL.Image' + ' but got list of {0}'.format(type(clip[0]))) return clip class RandomResize(object): """Resizes a list of (H x W x C) numpy.ndarray to the final size The larger the original image is, the more times it takes to interpolate Args: interpolation (str): Can be one of 'nearest', 'bilinear' defaults to nearest size (tuple): (widht, height) """ def __init__(self, ratio=(3. / 4., 4. / 3.), interpolation='nearest'): self.ratio = ratio self.interpolation = interpolation def __call__(self, clip): scaling_factor = random.uniform(self.ratio[0], self.ratio[1]) if isinstance(clip[0], np.ndarray): im_h, im_w, im_c = clip[0].shape elif isinstance(clip[0], PIL.Image.Image): im_w, im_h = clip[0].size new_w = int(im_w * scaling_factor) new_h = int(im_h * scaling_factor) new_size = (new_w, new_h) resized = FF.resize_clip( clip, new_size, interpolation=self.interpolation) return resized class Resize(object): """Resizes a list of (H x W x C) numpy.ndarray to the final size The larger the original image is, the more times it takes to interpolate Args: interpolation (str): Can be one of 'nearest', 'bilinear' defaults to nearest size (tuple): (widht, height) """ def __init__(self, size, interpolation='nearest'): self.size = size self.interpolation = interpolation def __call__(self, clip): resized = FF.resize_clip( clip, self.size, interpolation=self.interpolation) return resized class RandomCrop(object): """Extract random crop at the same location for a list of images Args: size (sequence or int): Desired output size for the crop in format (h, w) """ def __init__(self, size): if isinstance(size, numbers.Number): size = (size, size) self.size = size def __call__(self, clip): """ Args: img (PIL.Image or numpy.ndarray): List of images to be cropped in format (h, w, c) in numpy.ndarray Returns: PIL.Image or numpy.ndarray: Cropped list of images """ h, w = self.size if isinstance(clip[0], np.ndarray): im_h, im_w, im_c = clip[0].shape elif isinstance(clip[0], PIL.Image.Image): im_w, im_h = clip[0].size else: raise TypeError('Expected numpy.ndarray or PIL.Image' + 'but got list of {0}'.format(type(clip[0]))) if w > im_w or h > im_h: error_msg = ( 'Initial image size should be larger then ' 'cropped size but got cropped sizes : ({w}, {h}) while ' 'initial image is ({im_w}, {im_h})'.format( im_w=im_w, im_h=im_h, w=w, h=h)) raise ValueError(error_msg) x1 = random.randint(0, im_w - w) y1 = random.randint(0, im_h - h) cropped = FF.crop_clip(clip, y1, x1, h, w) return cropped class ThreeCrop(object): """Extract random crop at the same location for a list of images Args: size (sequence or int): Desired output size for the crop in format (h, w) """ def __init__(self, size): if isinstance(size, numbers.Number): size = (size, size) self.size = size def __call__(self, clip): """ Args: img (PIL.Image or numpy.ndarray): List of images to be cropped in format (h, w, c) in numpy.ndarray Returns: PIL.Image or numpy.ndarray: Cropped list of images """ h, w = self.size if isinstance(clip[0], np.ndarray): im_h, im_w, im_c = clip[0].shape elif isinstance(clip[0], PIL.Image.Image): im_w, im_h = clip[0].size else: raise TypeError('Expected numpy.ndarray or PIL.Image' + 'but got list of {0}'.format(type(clip[0]))) if w != im_w and h != im_h: clip = FF.resize_clip(clip, self.size, interpolation="bilinear") im_h, im_w, im_c = clip[0].shape step = np.max((np.max((im_w, im_h)) - self.size[0]) // 2, 0) cropped = [] for i in range(3): if (im_h > self.size[0]): x1 = 0 y1 = i * step cropped.extend(FF.crop_clip(clip, y1, x1, h, w)) else: x1 = i * step y1 = 0 cropped.extend(FF.crop_clip(clip, y1, x1, h, w)) return cropped class RandomRotation(object): """Rotate entire clip randomly by a random angle within given bounds Args: degrees (sequence or int): Range of degrees to select from If degrees is a number instead of sequence like (min, max), the range of degrees, will be (-degrees, +degrees). """ def __init__(self, degrees): if isinstance(degrees, numbers.Number): if degrees < 0: raise ValueError('If degrees is a single number,' 'must be positive') degrees = (-degrees, degrees) else: if len(degrees) != 2: raise ValueError('If degrees is a sequence,' 'it must be of len 2.') self.degrees = degrees def __call__(self, clip): """ Args: img (PIL.Image or numpy.ndarray): List of images to be cropped in format (h, w, c) in numpy.ndarray Returns: PIL.Image or numpy.ndarray: Cropped list of images """ import skimage angle = random.uniform(self.degrees[0], self.degrees[1]) if isinstance(clip[0], np.ndarray): rotated = [skimage.transform.rotate(img, angle) for img in clip] elif isinstance(clip[0], PIL.Image.Image): rotated = [img.rotate(angle) for img in clip] else: raise TypeError('Expected numpy.ndarray or PIL.Image' + 'but got list of {0}'.format(type(clip[0]))) return rotated class CenterCrop(object): """Extract center crop at the same location for a list of images Args: size (sequence or int): Desired output size for the crop in format (h, w) """ def __init__(self, size): if isinstance(size, numbers.Number): size = (size, size) self.size = size def __call__(self, clip): """ Args: img (PIL.Image or numpy.ndarray): List of images to be cropped in format (h, w, c) in numpy.ndarray Returns: PIL.Image or numpy.ndarray: Cropped list of images """ h, w = self.size if isinstance(clip[0], np.ndarray): im_h, im_w, im_c = clip[0].shape elif isinstance(clip[0], PIL.Image.Image): im_w, im_h = clip[0].size else: raise TypeError('Expected numpy.ndarray or PIL.Image' + 'but got list of {0}'.format(type(clip[0]))) if w > im_w or h > im_h: error_msg = ( 'Initial image size should be larger then ' 'cropped size but got cropped sizes : ({w}, {h}) while ' 'initial image is ({im_w}, {im_h})'.format( im_w=im_w, im_h=im_h, w=w, h=h)) raise ValueError(error_msg) x1 = int(round((im_w - w) / 2.)) y1 = int(round((im_h - h) / 2.)) cropped = FF.crop_clip(clip, y1, x1, h, w) return cropped class ColorJitter(object): """Randomly change the brightness, contrast and saturation and hue of the clip Args: brightness (float): How much to jitter brightness. brightness_factor is chosen uniformly from [max(0, 1 - brightness), 1 + brightness]. contrast (float): How much to jitter contrast. contrast_factor is chosen uniformly from [max(0, 1 - contrast), 1 + contrast]. saturation (float): How much to jitter saturation. saturation_factor is chosen uniformly from [max(0, 1 - saturation), 1 + saturation]. hue(float): How much to jitter hue. hue_factor is chosen uniformly from [-hue, hue]. Should be >=0 and <= 0.5. """ def __init__(self, brightness=0, contrast=0, saturation=0, hue=0): self.brightness = brightness self.contrast = contrast self.saturation = saturation self.hue = hue def get_params(self, brightness, contrast, saturation, hue): if brightness > 0: brightness_factor = random.uniform( max(0, 1 - brightness), 1 + brightness) else: brightness_factor = None if contrast > 0: contrast_factor = random.uniform( max(0, 1 - contrast), 1 + contrast) else: contrast_factor = None if saturation > 0: saturation_factor = random.uniform( max(0, 1 - saturation), 1 + saturation) else: saturation_factor = None if hue > 0: hue_factor = random.uniform(-hue, hue) else: hue_factor = None return brightness_factor, contrast_factor, saturation_factor, hue_factor def __call__(self, clip): """ Args: clip (list): list of PIL.Image Returns: list PIL.Image : list of transformed PIL.Image """ if isinstance(clip[0], np.ndarray): raise TypeError( 'Color jitter not yet implemented for numpy arrays') elif isinstance(clip[0], PIL.Image.Image): brightness, contrast, saturation, hue = self.get_params( self.brightness, self.contrast, self.saturation, self.hue) # Create img transform function sequence img_transforms = [] if brightness is not None: img_transforms.append(lambda img: torchvision.transforms.functional.adjust_brightness(img, brightness)) if saturation is not None: img_transforms.append(lambda img: torchvision.transforms.functional.adjust_saturation(img, saturation)) if hue is not None: img_transforms.append(lambda img: torchvision.transforms.functional.adjust_hue(img, hue)) if contrast is not None: img_transforms.append(lambda img: torchvision.transforms.functional.adjust_contrast(img, contrast)) random.shuffle(img_transforms) # Apply to all images jittered_clip = [] for img in clip: for func in img_transforms: jittered_img = func(img) jittered_clip.append(jittered_img) else: raise TypeError('Expected numpy.ndarray or PIL.Image' + 'but got list of {0}'.format(type(clip[0]))) return jittered_clip class Normalize(object): """Normalize a clip with mean and standard deviation. Given mean: ``(M1,...,Mn)`` and std: ``(S1,..,Sn)`` for ``n`` channels, this transform will normalize each channel of the input ``torch.*Tensor`` i.e. ``input[channel] = (input[channel] - mean[channel]) / std[channel]`` .. note:: This transform acts out of place, i.e., it does not mutates the input tensor. Args: mean (sequence): Sequence of means for each channel. std (sequence): Sequence of standard deviations for each channel. """ def __init__(self, mean, std): self.mean = mean self.std = std def __call__(self, clip): """ Args: clip (Tensor): Tensor clip of size (T, C, H, W) to be normalized. Returns: Tensor: Normalized Tensor clip. """ return FF.normalize(clip, self.mean, self.std) def __repr__(self): return self.__class__.__name__ + '(mean={0}, std={1})'.format(self.mean, self.std)