# Copyright (c) Facebook, Inc. and its affiliates.
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
from collections import OrderedDict

import torch
import torch.nn as nn

from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, dilation=1):
        super().__init__()

        # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
        self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)

        self.conv2 = nn.Conv2d(
            planes, planes, 3, padding=1 * dilation, bias=False, dilation=dilation
        )
        self.bn2 = nn.BatchNorm2d(planes)

        self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()

        self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)

        self.relu = nn.ReLU(inplace=True)
        self.downsample = None
        self.stride = stride

        if stride > 1 or inplanes != planes * Bottleneck.expansion:
            # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
            self.downsample = nn.Sequential(
                OrderedDict(
                    [
                        ("-1", nn.AvgPool2d(stride)),
                        (
                            "0",
                            nn.Conv2d(
                                inplanes,
                                planes * self.expansion,
                                1,
                                stride=1,
                                bias=False,
                            ),
                        ),
                        ("1", nn.BatchNorm2d(planes * self.expansion)),
                    ]
                )
            )

    def forward(self, x: torch.Tensor):
        identity = x

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.avgpool(out)
        out = self.bn3(self.conv3(out))

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)
        return out


class ModifiedResNet(nn.Module):
    """
    A ResNet class that is similar to torchvision's but contains the following changes:
    - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
    - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
    - The final pooling layer of the original design is a QKV attention instead of an average pool; this
      backbone variant omits it and instead returns the intermediate feature maps "res2" through "res5".
    """
    def __init__(self, layers, width=64, strides=[2, 1, 2, 2, 2], multi_grid=[1, 1, 1]):
        super().__init__()

        # the 3-layer stem
        self.conv1 = nn.Conv2d(
            3, width // 2, kernel_size=3, stride=2, padding=1, bias=False
        )
        self.bn1 = nn.BatchNorm2d(width // 2)
        self.conv2 = nn.Conv2d(
            width // 2, width // 2, kernel_size=3, padding=1, bias=False
        )
        self.bn2 = nn.BatchNorm2d(width // 2)
        self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(width)
        self.avgpool = nn.AvgPool2d(strides[0]) if strides[0] > 1 else nn.Identity()
        self.relu = nn.ReLU(inplace=True)

        # residual layers
        self._inplanes = width  # this is a *mutable* variable used during construction
        self.layer1 = self._make_layer(width, layers[0], stride=strides[1])
        self.layer2 = self._make_layer(width * 2, layers[1], stride=strides[2])
        self.layer3 = self._make_layer(width * 4, layers[2], stride=strides[3])
        self.layer4 = self._make_layer(
            width * 8, layers[3], stride=strides[4], dilations=multi_grid
        )

        self.num_features = [width * 4, width * 8, width * 16, width * 32]

    def _make_layer(self, planes, blocks, stride=1, dilations=None):
        if dilations is None:
            dilations = [1] * blocks
        layers = [Bottleneck(self._inplanes, planes, stride, dilation=dilations[0])]

        self._inplanes = planes * Bottleneck.expansion
        for i in range(1, blocks):
            layers.append(Bottleneck(self._inplanes, planes, dilation=dilations[i]))

        return nn.Sequential(*layers)

    def forward(self, x):
        def stem(x):
            for conv, bn in [
                (self.conv1, self.bn1),
                (self.conv2, self.bn2),
                (self.conv3, self.bn3),
            ]:
                x = self.relu(bn(conv(x)))
            x = self.avgpool(x)
            return x

        output = {}
        x = x.type(self.conv1.weight.dtype)
        x = stem(x)  # 1/4,1/4
        x = self.layer1(x)
        output["res2"] = x
        x = self.layer2(x)  # 1/8,1/8
        output["res3"] = x
        x = self.layer3(x)  # 1/16,1/16
        output["res4"] = x
        x = self.layer4(x)  # 1/32,1/32
        output["res5"] = x
        return output


# register with detectron2 so the backbone can be built from cfg.MODEL.BACKBONE.NAME
@BACKBONE_REGISTRY.register()
class D2ModifiedResNet(ModifiedResNet, Backbone):
    def __init__(self, cfg, input_shape):
        depth = cfg.MODEL.RESNETS.DEPTH
        num_groups = cfg.MODEL.RESNETS.NUM_GROUPS
        width_per_group = cfg.MODEL.RESNETS.WIDTH_PER_GROUP
        bottleneck_channels = num_groups * width_per_group

        num_blocks_per_stage = {
            18: [2, 2, 2, 2],
            34: [3, 4, 6, 3],
            50: [3, 4, 6, 3],
            101: [3, 4, 23, 3],
            152: [3, 8, 36, 3],
        }[depth]

        strides = [2, 1, 2, 2, 2]
        multi_grid = cfg.MODEL.RESNETS.RES5_MULTI_GRID
        if cfg.MODEL.RESNETS.STEM_TYPE == "deeplab":
            strides = [1, 1, 2, 2, 2]

        super().__init__(
            num_blocks_per_stage,
            bottleneck_channels,
            strides=strides,
            multi_grid=multi_grid,
        )

        self._out_features = cfg.MODEL.RESNETS.OUT_FEATURES

        self._out_feature_strides = {
            "res2": 4,
            "res3": 8,
            "res4": 16,
            "res5": 32,
        }
        self._out_feature_channels = {
            "res2": self.num_features[0],
            "res3": self.num_features[1],
            "res4": self.num_features[2],
            "res5": self.num_features[3],
        }
    def forward(self, x):
        """
        Args:
            x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.
        Returns:
            dict[str->Tensor]: names and the corresponding features
        """
        outputs = {}
        y = super().forward(x)
        for k in y.keys():
            if k in self._out_features:
                outputs[k] = y[k]
        return outputs

    def output_shape(self):
        return {
            name: ShapeSpec(
                channels=self._out_feature_channels[name],
                stride=self._out_feature_strides[name],
            )
            for name in self._out_features
        }
    @property
    def size_divisibility(self):
        return 32
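

if __name__ == "__main__":
    # Illustrative sanity check, not part of the original module: the layer counts,
    # width, and input size below are assumptions chosen only to demonstrate the
    # expected output strides of the plain ModifiedResNet. The D2ModifiedResNet
    # wrapper is normally built from a detectron2 config via BACKBONE_REGISTRY.
    model = ModifiedResNet(layers=[3, 4, 6, 3], width=64)
    model.eval()
    with torch.no_grad():
        feats = model(torch.randn(1, 3, 224, 224))
    for name, feat in feats.items():
        # Expected strides: res2 -> 1/4, res3 -> 1/8, res4 -> 1/16, res5 -> 1/32
        print(name, tuple(feat.shape))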