import os

import numpy as np
import torch
import torch.nn as nn
from torchvision.transforms import Normalize
|
|
def denormalize(x):
    """Reverses the ImageNet normalization applied to the input.

    Args:
        x (torch.Tensor - shape(N,3,H,W)): Input tensor normalized with ImageNet statistics.

    Returns:
        torch.Tensor - shape(N,3,H,W): Denormalized input.
    """
    mean = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1).to(x.device)
    std = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1).to(x.device)
    return x * std + mean
|
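# A quick sanity check (hypothetical values): denormalize undoes torchvision's
# ImageNet Normalize up to floating-point error.
#
#     norm = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
#     img = torch.rand(1, 3, 8, 8)                    # values in [0, 1]
#     assert torch.allclose(denormalize(norm(img)), img, atol=1e-5)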
|
|
def get_activation(name, bank):
    """Returns a forward hook that stores the module's output in `bank[name]`."""
    def hook(model, input, output):
        bank[name] = output
    return hook
|
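# Sketch of the intended hook pattern (hypothetical `model.layer` attribute),
# mirroring how MidasCore.attach_hooks uses it below:
#
#     bank = {}
#     handle = model.layer.register_forward_hook(get_activation("feat", bank))
#     _ = model(torch.randn(1, 3, 224, 224))
#     features = bank["feat"]   # output captured during the forward pass
#     handle.remove()           # detach when done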
|
|
class Resize(object):
    """Resize a sample to the given size (width, height)."""

    def __init__(
        self,
        width,
        height,
        resize_target=True,
        keep_aspect_ratio=False,
        ensure_multiple_of=1,
        resize_method="lower_bound",
    ):
        """Init.

        Args:
            width (int): desired output width
            height (int): desired output height
            resize_target (bool, optional):
                True: Resize the full sample (image, mask, target).
                False: Resize image only.
                Defaults to True. (Unused in this tensor-based implementation,
                which resizes the image only; kept for interface compatibility.)
            keep_aspect_ratio (bool, optional):
                True: Keep the aspect ratio of the input sample. The output
                sample might not have the given width and height, and the
                resize behaviour depends on the parameter 'resize_method'.
                Defaults to False.
            ensure_multiple_of (int, optional):
                Output width and height are constrained to be a multiple of
                this parameter. Defaults to 1.
            resize_method (str, optional):
                "lower_bound": Output will be at least as large as the given size.
                "upper_bound": Output will be at most as large as the given size.
                    (Output size might be smaller than the given size.)
                "minimal": Scale as little as possible. (Output size might be
                    smaller than the given size.)
                Defaults to "lower_bound".
        """
        print("Params passed to Resize transform:")
        print("\twidth: ", width)
        print("\theight: ", height)
        print("\tresize_target: ", resize_target)
        print("\tkeep_aspect_ratio: ", keep_aspect_ratio)
        print("\tensure_multiple_of: ", ensure_multiple_of)
        print("\tresize_method: ", resize_method)

        self.__width = width
        self.__height = height

        self.__keep_aspect_ratio = keep_aspect_ratio
        self.__multiple_of = ensure_multiple_of
        self.__resize_method = resize_method
|
    def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
        # Round to the nearest multiple of `self.__multiple_of` ...
        y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)

        # ... but never exceed max_val ...
        if max_val is not None and y > max_val:
            y = (np.floor(x / self.__multiple_of)
                 * self.__multiple_of).astype(int)

        # ... and never fall below min_val.
        if y < min_val:
            y = (np.ceil(x / self.__multiple_of)
                 * self.__multiple_of).astype(int)

        return y
|
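    # Worked example (hypothetical values) with ensure_multiple_of=32:
    #   constrain_to_multiple_of(500)              -> 512 (round(15.625) = 16 -> 16*32)
    #   constrain_to_multiple_of(500, max_val=500) -> 480 (512 > 500, floor(15.625) = 15 -> 15*32)
    #   constrain_to_multiple_of(460, min_val=450) -> 480 (448 < 450, ceil(14.375) = 15 -> 15*32)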
|
    def get_size(self, width, height):
        # Scale factors needed to reach the target size in each dimension.
        scale_height = self.__height / height
        scale_width = self.__width / width

        if self.__keep_aspect_ratio:
            if self.__resize_method == "lower_bound":
                # Scale such that the output is at least as large as the given size.
                if scale_width > scale_height:
                    # Fit width.
                    scale_height = scale_width
                else:
                    # Fit height.
                    scale_width = scale_height
            elif self.__resize_method == "upper_bound":
                # Scale such that the output is at most as large as the given size.
                if scale_width < scale_height:
                    # Fit width.
                    scale_height = scale_width
                else:
                    # Fit height.
                    scale_width = scale_height
            elif self.__resize_method == "minimal":
                # Scale as little as possible.
                if abs(1 - scale_width) < abs(1 - scale_height):
                    # Use the width scale.
                    scale_height = scale_width
                else:
                    # Use the height scale.
                    scale_width = scale_height
            else:
                raise ValueError(
                    f"resize_method {self.__resize_method} not implemented"
                )

        if self.__resize_method == "lower_bound":
            new_height = self.constrain_to_multiple_of(
                scale_height * height, min_val=self.__height
            )
            new_width = self.constrain_to_multiple_of(
                scale_width * width, min_val=self.__width
            )
        elif self.__resize_method == "upper_bound":
            new_height = self.constrain_to_multiple_of(
                scale_height * height, max_val=self.__height
            )
            new_width = self.constrain_to_multiple_of(
                scale_width * width, max_val=self.__width
            )
        elif self.__resize_method == "minimal":
            new_height = self.constrain_to_multiple_of(scale_height * height)
            new_width = self.constrain_to_multiple_of(scale_width * width)
        else:
            raise ValueError(
                f"resize_method {self.__resize_method} not implemented")

        return (new_width, new_height)
|
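    # Worked example (hypothetical values): target 384x384, keep_aspect_ratio=True,
    # resize_method="lower_bound", ensure_multiple_of=32, input 640x480 (WxH):
    #   scale_width = 384/640 = 0.6, scale_height = 384/480 = 0.8
    #   "lower_bound" keeps the larger scale, so both become 0.8
    #   new_height = constrain(0.8*480 = 384, min_val=384) -> 384
    #   new_width  = constrain(0.8*640 = 512, min_val=384) -> 512
    #   get_size(640, 480) -> (512, 384)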
|
    def __call__(self, x):
        # x.shape[-2:] is (H, W); reversed it becomes (W, H) as get_size expects.
        width, height = self.get_size(*x.shape[-2:][::-1])
        return nn.functional.interpolate(x, (height, width), mode='bilinear', align_corners=True)
|
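# Usage sketch (hypothetical shapes), continuing the worked example in get_size:
#
#     resizer = Resize(384, 384, keep_aspect_ratio=True,
#                      ensure_multiple_of=32, resize_method="lower_bound")
#     y = resizer(torch.randn(1, 3, 480, 640))   # (N, C, H, W)
#     assert y.shape == (1, 3, 384, 512)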
|
|
class PrepForMidas(object):
    def __init__(self, resize_mode="minimal", keep_aspect_ratio=True, img_size=384, do_resize=True):
        if isinstance(img_size, int):
            img_size = (img_size, img_size)
        net_h, net_w = img_size
        # Maps [0, 1] inputs to [-1, 1], as expected by the MiDaS backbones.
        self.normalization = Normalize(
            mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
        self.resizer = Resize(net_w, net_h, keep_aspect_ratio=keep_aspect_ratio, ensure_multiple_of=32, resize_method=resize_mode) \
            if do_resize else nn.Identity()

    def __call__(self, x):
        return self.normalization(self.resizer(x))
|
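# Preprocessing sketch (hypothetical input): resize to a multiple of 32, then
# map [0, 1] values to [-1, 1].
#
#     prep = PrepForMidas(img_size=384, keep_aspect_ratio=True)
#     x = prep(torch.rand(1, 3, 480, 640))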
|
|
class MidasCore(nn.Module):
    def __init__(self, midas, trainable=False, fetch_features=True, layer_names=('out_conv', 'l4_rn', 'r4', 'r3', 'r2', 'r1'), freeze_bn=False, keep_aspect_ratio=True,
                 img_size=384, **kwargs):
        """MiDaS base model used for multi-scale feature extraction.

        Args:
            midas (torch.nn.Module): MiDaS model.
            trainable (bool, optional): Train the MiDaS model. Defaults to False.
            fetch_features (bool, optional): Extract multi-scale features. Defaults to True.
            layer_names (tuple, optional): Layers used for feature extraction. Order = (head output features, last layer features, ...decoder features). Defaults to ('out_conv', 'l4_rn', 'r4', 'r3', 'r2', 'r1').
            freeze_bn (bool, optional): Freeze BatchNorm. Generally results in better finetuning performance. Defaults to False.
            keep_aspect_ratio (bool, optional): Keep the aspect ratio of input images while resizing. Defaults to True.
            img_size (int, tuple, optional): Input resolution. Defaults to 384.
        """
        super().__init__()
        self.core = midas
        self.output_channels = None
        self.core_out = {}
        self.trainable = trainable
        self.fetch_features = fetch_features

        # Handles to registered forward hooks, so they can be removed later.
        self.handles = []

        self.layer_names = layer_names

        self.set_trainable(trainable)
        self.set_fetch_features(fetch_features)

        self.prep = PrepForMidas(keep_aspect_ratio=keep_aspect_ratio,
                                 img_size=img_size, do_resize=kwargs.get('do_resize', True))

        if freeze_bn:
            self.freeze_bn()
|
    def set_trainable(self, trainable):
        self.trainable = trainable
        if trainable:
            self.unfreeze()
        else:
            self.freeze()
        return self

    def set_fetch_features(self, fetch_features):
        self.fetch_features = fetch_features
        if fetch_features:
            if len(self.handles) == 0:
                self.attach_hooks(self.core)
        else:
            self.remove_hooks()
        return self
|
    def freeze(self):
        for p in self.parameters():
            p.requires_grad = False
        self.trainable = False
        return self

    def unfreeze(self):
        for p in self.parameters():
            p.requires_grad = True
        self.trainable = True
        return self

    def freeze_bn(self):
        # eval() stops BatchNorm from updating its running statistics;
        # the affine parameters still follow the freeze()/unfreeze() state.
        for m in self.modules():
            if isinstance(m, nn.BatchNorm2d):
                m.eval()
        return self
|
    def forward(self, x, denorm=False, return_rel_depth=False):
        # Preprocessing never needs gradients, even when the core is trainable.
        with torch.no_grad():
            if denorm:
                x = denormalize(x)
            x = self.prep(x)

        with torch.set_grad_enabled(self.trainable):
            rel_depth = self.core(x)

            if not self.fetch_features:
                return rel_depth
            # Collect the activations stashed by the forward hooks, in the
            # order given by layer_names.
            out = [self.core_out[k] for k in self.layer_names]

            if return_rel_depth:
                return rel_depth, out
            return out
|
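    # Forward-pass sketch (hypothetical `midas` module), with hooks attached:
    #
    #     core = MidasCore(midas, fetch_features=True)
    #     feats = core(torch.rand(1, 3, 384, 384))   # list of hooked feature maps
    #     rel_depth, feats = core(torch.rand(1, 3, 384, 384), return_rel_depth=True)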
|
    def get_rel_pos_params(self):
        # Relative-position (attention bias) parameters of the encoder.
        for name, p in self.core.pretrained.named_parameters():
            if "relative_position" in name:
                yield p

    def get_enc_params_except_rel_pos(self):
        for name, p in self.core.pretrained.named_parameters():
            if "relative_position" not in name:
                yield p

    def freeze_encoder(self, freeze_rel_pos=False):
        if freeze_rel_pos:
            for p in self.core.pretrained.parameters():
                p.requires_grad = False
        else:
            # Keep the relative-position parameters trainable.
            for p in self.get_enc_params_except_rel_pos():
                p.requires_grad = False
        return self
|
    def attach_hooks(self, midas):
        if len(self.handles) > 0:
            self.remove_hooks()
        if "out_conv" in self.layer_names:
            # Hook the 4th child of the MiDaS output head.
            out_conv_layer = list(midas.scratch.output_conv.children())[3]
            self.handles.append(out_conv_layer.register_forward_hook(
                get_activation("out_conv", self.core_out)))
        if "r4" in self.layer_names:
            self.handles.append(midas.scratch.refinenet4.register_forward_hook(
                get_activation("r4", self.core_out)))
        if "r3" in self.layer_names:
            self.handles.append(midas.scratch.refinenet3.register_forward_hook(
                get_activation("r3", self.core_out)))
        if "r2" in self.layer_names:
            self.handles.append(midas.scratch.refinenet2.register_forward_hook(
                get_activation("r2", self.core_out)))
        if "r1" in self.layer_names:
            self.handles.append(midas.scratch.refinenet1.register_forward_hook(
                get_activation("r1", self.core_out)))
        if "l4_rn" in self.layer_names:
            self.handles.append(midas.scratch.layer4_rn.register_forward_hook(
                get_activation("l4_rn", self.core_out)))

        return self

    def remove_hooks(self):
        for h in self.handles:
            h.remove()
        # Clear the list so set_fetch_features(True) can re-attach hooks later.
        self.handles = []
        return self
|
    def __del__(self):
        self.remove_hooks()

    def set_output_channels(self, model_type):
        self.output_channels = MIDAS_SETTINGS[model_type]
|
    @staticmethod
    def build(midas_model_type="DPT_BEiT_L_384", train_midas=False, use_pretrained_midas=True, fetch_features=False, freeze_bn=True, force_keep_ar=False, force_reload=False, **kwargs):
        if midas_model_type not in MIDAS_SETTINGS:
            raise ValueError(
                f"Invalid model type: {midas_model_type}. Must be one of {list(MIDAS_SETTINGS.keys())}")
        if "img_size" in kwargs:
            kwargs = MidasCore.parse_img_size(kwargs)
        img_size = kwargs.pop("img_size", [384, 384])
        print("img_size", img_size)
        # Load MiDaS from the local checkout of the MiDaS repository.
        midas_path = os.path.join(os.path.dirname(__file__), 'midas_repo')
        midas = torch.hub.load(midas_path, midas_model_type,
                               pretrained=use_pretrained_midas, force_reload=force_reload, source='local')
        kwargs.update({'keep_aspect_ratio': force_keep_ar})
        midas_core = MidasCore(midas, trainable=train_midas, fetch_features=fetch_features,
                               freeze_bn=freeze_bn, img_size=img_size, **kwargs)
        midas_core.set_output_channels(midas_model_type)
        return midas_core
|
    @staticmethod
    def build_from_config(config):
        return MidasCore.build(**config)

    @staticmethod
    def parse_img_size(config):
        assert 'img_size' in config
        if isinstance(config['img_size'], str):
            assert "," in config['img_size'], "img_size should be a string with comma separated img_size=H,W"
            config['img_size'] = list(map(int, config['img_size'].split(",")))
            assert len(config['img_size']) == 2, \
                "img_size should be a string with comma separated img_size=H,W"
        elif isinstance(config['img_size'], int):
            config['img_size'] = [config['img_size'], config['img_size']]
        else:
            assert isinstance(config['img_size'], list) and len(
                config['img_size']) == 2, "img_size should be a list of H,W"
        return config
|
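# Parsing examples (hypothetical configs) for MidasCore.parse_img_size:
#   {"img_size": "384,512"} -> {"img_size": [384, 512]}
#   {"img_size": 384}       -> {"img_size": [384, 384]}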
|
|
# Mapping from decoder output channel configurations to the MiDaS model types
# that produce them.
nchannels2models = {
    tuple([256]*5): ["DPT_BEiT_L_384", "DPT_BEiT_L_512", "DPT_BEiT_B_384", "DPT_SwinV2_L_384", "DPT_SwinV2_B_384", "DPT_SwinV2_T_256", "DPT_Large", "DPT_Hybrid"],
    (512, 256, 128, 64, 64): ["MiDaS_small"]
}

# Inverse map: model type -> tuple of output channels per feature scale.
MIDAS_SETTINGS = {m: k for k, v in nchannels2models.items()
                  for m in v
                  }
|
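# A minimal usage sketch. It assumes a local `midas_repo` checkout next to this
# file and downloadable pretrained weights, so treat it as illustrative rather
# than a guaranteed entry point.
if __name__ == "__main__":
    core = MidasCore.build(midas_model_type="DPT_BEiT_L_384",
                           fetch_features=True, img_size=[384, 384])
    x = torch.rand(1, 3, 384, 384)
    rel_depth, features = core(x, return_rel_depth=True)
    print("rel_depth:", tuple(rel_depth.shape))
    for name, f in zip(core.layer_names, features):
        print(name, tuple(f.shape))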
|