# PatchFusion/zoedepth/models/depth_model.py
# Zhenyu Li
# update
# 78ab311
# MIT License
# Copyright (c) 2022 Intelligent Systems Lab Org
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# File author: Shariq Farooq Bhat
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
import PIL.Image
from PIL import Image
from typing import Union
class DepthModel(nn.Module):
def __init__(self):
super().__init__()
self.device = 'cpu'
def to(self, device) -> nn.Module:
self.device = device
return super().to(device)
def forward(self, x, *args, **kwargs):
raise NotImplementedError
def _infer(self, x: torch.Tensor):
"""
Inference interface for the model
Args:
x (torch.Tensor): input tensor of shape (b, c, h, w)
Returns:
torch.Tensor: output tensor of shape (b, 1, h, w)
"""
return self(x)['metric_depth']
def _infer_with_pad_aug(self, x: torch.Tensor, pad_input: bool=True, fh: float=3, fw: float=3, upsampling_mode: str='bicubic', padding_mode="reflect", **kwargs) -> torch.Tensor:
"""
Inference interface for the model with padding augmentation
Padding augmentation fixes the boundary artifacts in the output depth map.
Boundary artifacts are sometimes caused by the fact that the model is trained on NYU raw dataset which has a black or white border around the image.
This augmentation pads the input image and crops the prediction back to the original size / view.
Note: This augmentation is not required for the models trained with 'avoid_boundary'=True.
Args:
x (torch.Tensor): input tensor of shape (b, c, h, w)
pad_input (bool, optional): whether to pad the input or not. Defaults to True.
fh (float, optional): height padding factor. The padding is calculated as sqrt(h/2) * fh. Defaults to 3.
fw (float, optional): width padding factor. The padding is calculated as sqrt(w/2) * fw. Defaults to 3.
upsampling_mode (str, optional): upsampling mode. Defaults to 'bicubic'.
padding_mode (str, optional): padding mode. Defaults to "reflect".
Returns:
torch.Tensor: output tensor of shape (b, 1, h, w)
"""
# assert x is nchw and c = 3
assert x.dim() == 4, "x must be 4 dimensional, got {}".format(x.dim())
assert x.shape[1] == 3, "x must have 3 channels, got {}".format(x.shape[1])
if pad_input:
assert fh > 0 or fw > 0, "atlease one of fh and fw must be greater than 0"
pad_h = int(np.sqrt(x.shape[2]/2) * fh)
pad_w = int(np.sqrt(x.shape[3]/2) * fw)
padding = [pad_w, pad_w]
if pad_h > 0:
padding += [pad_h, pad_h]
x = F.pad(x, padding, mode=padding_mode, **kwargs)
out = self._infer(x)
if out.shape[-2:] != x.shape[-2:]:
out = F.interpolate(out, size=(x.shape[2], x.shape[3]), mode=upsampling_mode, align_corners=False)
if pad_input:
# crop to the original size, handling the case where pad_h and pad_w is 0
if pad_h > 0:
out = out[:, :, pad_h:-pad_h,:]
if pad_w > 0:
out = out[:, :, :, pad_w:-pad_w]
return out
def infer_with_flip_aug(self, x, pad_input: bool=True, **kwargs) -> torch.Tensor:
"""
Inference interface for the model with horizontal flip augmentation
Horizontal flip augmentation improves the accuracy of the model by averaging the output of the model with and without horizontal flip.
Args:
x (torch.Tensor): input tensor of shape (b, c, h, w)
pad_input (bool, optional): whether to use padding augmentation. Defaults to True.
Returns:
torch.Tensor: output tensor of shape (b, 1, h, w)
"""
# infer with horizontal flip and average
out = self._infer_with_pad_aug(x, pad_input=pad_input, **kwargs)
out_flip = self._infer_with_pad_aug(torch.flip(x, dims=[3]), pad_input=pad_input, **kwargs)
out = (out + torch.flip(out_flip, dims=[3])) / 2
return out
def infer(self, x, pad_input: bool=True, with_flip_aug: bool=True, **kwargs) -> torch.Tensor:
"""
Inference interface for the model
Args:
x (torch.Tensor): input tensor of shape (b, c, h, w)
pad_input (bool, optional): whether to use padding augmentation. Defaults to True.
with_flip_aug (bool, optional): whether to use horizontal flip augmentation. Defaults to True.
Returns:
torch.Tensor: output tensor of shape (b, 1, h, w)
"""
if with_flip_aug:
return self.infer_with_flip_aug(x, pad_input=pad_input, **kwargs)
else:
return self._infer_with_pad_aug(x, pad_input=pad_input, **kwargs)
@torch.no_grad()
def infer_pil(self, pil_img, pad_input: bool=True, with_flip_aug: bool=True, output_type: str="numpy", **kwargs) -> Union[np.ndarray, PIL.Image.Image, torch.Tensor]:
"""
Inference interface for the model for PIL image
Args:
pil_img (PIL.Image.Image): input PIL image
pad_input (bool, optional): whether to use padding augmentation. Defaults to True.
with_flip_aug (bool, optional): whether to use horizontal flip augmentation. Defaults to True.
output_type (str, optional): output type. Supported values are 'numpy', 'pil' and 'tensor'. Defaults to "numpy".
"""
x = transforms.ToTensor()(pil_img).unsqueeze(0).to(self.device)
out_tensor = self.infer(x, pad_input=pad_input, with_flip_aug=with_flip_aug, **kwargs)
if output_type == "numpy":
return out_tensor.squeeze().cpu().numpy()
elif output_type == "pil":
# uint16 is required for depth pil image
out_16bit_numpy = (out_tensor.squeeze().cpu().numpy()*256).astype(np.uint16)
return Image.fromarray(out_16bit_numpy)
elif output_type == "tensor":
return out_tensor.squeeze().cpu()
else:
raise ValueError(f"output_type {output_type} not supported. Supported values are 'numpy', 'pil' and 'tensor'")