Spaces:

zhyever
/

PatchFusion

Runtime error

PatchFusion / zoedepth /models /layers /fusion_network.py

Zhenyu Li

update

78ab311 about 1 year ago

8.7 kB

	# MIT License

	# Copyright (c) 2022 Intelligent Systems Lab Org

	# Permission is hereby granted, free of charge, to any person obtaining a copy
	# of this software and associated documentation files (the "Software"), to deal
	# in the Software without restriction, including without limitation the rights
	# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	# copies of the Software, and to permit persons to whom the Software is
	# furnished to do so, subject to the following conditions:

	# The above copyright notice and this permission notice shall be included in all
	# copies or substantial portions of the Software.

	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	# SOFTWARE.

	# File author: Zhenyu Li


	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	from zoedepth.models.layers.swin_layers import G2LFusion
	from zoedepth.models.layers.transformer import TransformerEncoder, TransformerEncoderLayer
	from torchvision.ops import roi_align as torch_roi_align

	class DoubleConvWOBN(nn.Module):
	    """Two (3x3 convolution => ReLU) stages without batch normalisation.

	    Both convolutions use ``padding=1`` and carry a bias term, so the spatial
	    size of the input is preserved and only the channel count changes.

	    Args:
	        in_channels: Channel count of the input tensor.
	        out_channels: Channel count of the returned tensor.
	        mid_channels: Channel count between the two convolutions. Any falsy
	            value (``None`` or ``0``) falls back to ``out_channels``.
	    """

	    def __init__(self, in_channels, out_channels, mid_channels=None):
	        super().__init__()
	        hidden = mid_channels or out_channels
	        stages = [
	            nn.Conv2d(in_channels, hidden, kernel_size=3, padding=1, bias=True),
	            nn.ReLU(inplace=True),
	            nn.Conv2d(hidden, out_channels, kernel_size=3, padding=1, bias=True),
	            nn.ReLU(inplace=True),
	        ]
	        # The attribute name and the layer positions define the parameter
	        # names in the state dict ("double_conv.0.weight", "double_conv.2.weight").
	        self.double_conv = nn.Sequential(*stages)

	    def forward(self, x):
	        """Run ``x`` through both conv + ReLU stages and return the result."""
	        return self.double_conv(x)

	class DoubleConv(nn.Module):
	    """Two (3x3 convolution => BatchNorm => ReLU) stages.

	    The convolutions use ``padding=1`` and no bias (each is followed directly
	    by a batch-norm layer), so the spatial size of the input is preserved.

	    Args:
	        in_channels: Channel count of the input tensor.
	        out_channels: Channel count of the returned tensor.
	        mid_channels: Channel count between the two stages. Any falsy value
	            (``None`` or ``0``) falls back to ``out_channels``.
	    """

	    def __init__(self, in_channels, out_channels, mid_channels=None):
	        super().__init__()
	        if mid_channels:
	            hidden = mid_channels
	        else:
	            hidden = out_channels

	        first_stage = (
	            nn.Conv2d(in_channels, hidden, kernel_size=3, padding=1, bias=False),
	            nn.BatchNorm2d(hidden),
	            nn.ReLU(inplace=True),
	        )
	        second_stage = (
	            nn.Conv2d(hidden, out_channels, kernel_size=3, padding=1, bias=False),
	            nn.BatchNorm2d(out_channels),
	            nn.ReLU(inplace=True),
	        )
	        # Layer positions 0..5 inside ``double_conv`` define the state-dict keys.
	        self.double_conv = nn.Sequential(*first_stage, *second_stage)

	    def forward(self, x):
	        """Run ``x`` through both conv + BN + ReLU stages and return the result."""
	        return self.double_conv(x)


	class Down(nn.Module):
	    """Encoder step: 2x2 max pooling followed by a ``DoubleConv``.

	    The pooling halves the spatial resolution; the ``DoubleConv`` then maps
	    ``in_channels`` to ``out_channels``.

	    Args:
	        in_channels: Channel count of the input tensor.
	        out_channels: Channel count of the returned tensor.
	    """

	    def __init__(self, in_channels, out_channels):
	        super().__init__()
	        pool = nn.MaxPool2d(2)
	        conv = DoubleConv(in_channels, out_channels)
	        # Positions 0 (pool) and 1 (conv) define the state-dict key prefixes.
	        self.maxpool_conv = nn.Sequential(pool, conv)

	    def forward(self, x):
	        """Downsample ``x`` and return the convolved feature map."""
	        return self.maxpool_conv(x)

	class Upv1(nn.Module):
	    """Decoder step: 2x bilinear upsampling, skip concatenation, double conv.

	    Args:
	        in_channels: Channel count of the tensor fed to the convolution, i.e.
	            the channels of the upsampled input plus those of the skip tensor.
	        out_channels: Channel count of the returned tensor.
	        mid_channels: Channel count between the two convolutions. ``None``
	            selects ``in_channels``.
	    """

	    def __init__(self, in_channels, out_channels, mid_channels=None):
	        super().__init__()
	        self.up = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)

	        hidden = in_channels if mid_channels is None else mid_channels
	        self.conv = DoubleConvWOBN(in_channels, out_channels, hidden)

	    def forward(self, x1, x2):
	        """Upsample ``x1``, append it to ``x2`` along dim 1 and convolve.

	        Args:
	            x1: Lower-resolution tensor; it is upsampled by a factor of two.
	            x2: Skip tensor; it comes first in the channel concatenation.

	        Returns:
	            The output of the double convolution applied to the merged tensor.
	        """
	        upsampled = self.up(x1)
	        merged = torch.cat((x2, upsampled), dim=1)
	        return self.conv(merged)

	class UNetv1(nn.Module):
	    """Six-level U-Net whose decoder consumes guidance features.

	    The encoder (``inc`` plus ``down1``..``down5``) produces six feature maps:
	    level 0 has 32 channels at the input resolution, levels 1..5 have 256
	    channels with the resolution halved at every level. Each decoder step
	    concatenates the current feature map with one entry of ``guide_cat``,
	    upsamples it and merges it with the encoder feature of the next finer
	    level.

	    When ``g2l`` is truthy, every level additionally runs a ``G2LFusion``
	    module on the matching entry of ``coarse_feat_whole``, crops the result
	    with ``roi_align`` and merges the crop into the level's feature map
	    through a ``DoubleConvWOBN``.

	    Args:
	        n_channels: Channel count of the input tensor.
	        g2l: Truthy to build and run the global-to-local fusion branch.
	        pos_embed: Accepted for interface compatibility; not used.
	        use_area_prior: Accepted for interface compatibility; not used.
	    """

	    # roi_align output size (height, width) at the coarsest level (level 5).
	    # Every finer level doubles both sides, ending at (384, 512) for level 0.
	    _COARSEST_ROI_SIZE = (12, 16)
	    # The roi_align spatial scale of a level is its output height divided by
	    # this value (12/384 at level 5 ... 384/384 at level 0).
	    # NOTE(review): this suggests ``bbox`` is expressed in a 384-pixel-high
	    # frame - confirm against the caller.
	    _REFERENCE_HEIGHT = 384
	    # (level, num_heads, depth) per G2LFusion module, in construction order.
	    _G2L_STAGES = (
	        (5, 32, 4),
	        (4, 32, 4),
	        (3, 16, 3),
	        (2, 16, 3),
	        (1, 8, 2),
	        (0, 8, 2),
	    )

	    def __init__(self, n_channels, g2l, pos_embed=False, use_area_prior=True):
	        super().__init__()
	        self.n_channels = n_channels

	        self.inc = DoubleConv(n_channels, 32)
	        self.down1 = Down(32, 256)
	        self.down2 = Down(256, 256)
	        self.down3 = Down(256, 256)
	        self.down4 = Down(256, 256)
	        self.down5 = Down(256, 256)

	        # Input channels = decoder feature + guide_cat entry + encoder skip.
	        self.up1 = Upv1(256 + 256 + 256, 256, 384)
	        self.up2 = Upv1(256 + 256 + 256, 256, 384)
	        self.up3 = Upv1(256 + 256 + 256, 256, 384)
	        self.up4 = Upv1(256 + 256 + 256, 256, 384)
	        self.up5 = Upv1(256 + 32 + 256, 32, 272)

	        self.g2l = g2l

	        if self.g2l:
	            # Never populated or read in this class; kept so that the
	            # ``g2l_att`` attribute stays available to external code.
	            self.g2l_att = nn.ModuleList()
	            win = 12
	            in_channels = [32, 256, 256, 256, 256, 256]
	            crf_dims = [32, 256, 256, 256, 256, 256]

	            # Submodules keep their original names (g2l5..g2l0, conv5..conv0)
	            # and construction order, so state-dict keys and seeded
	            # initialisation are unchanged.
	            for level, num_heads, depth in self._G2L_STAGES:
	                height, width = self._roi_size(level)
	                fusion = G2LFusion(
	                    input_dim=in_channels[level],
	                    embed_dim=crf_dims[level],
	                    window_size=win,
	                    num_heads=num_heads,
	                    depth=depth,
	                    num_patches=height * width,
	                )
	                setattr(self, f"g2l{level}", fusion)

	            for level, _, _ in self._G2L_STAGES:
	                channels = in_channels[level]
	                merge = DoubleConvWOBN(channels * 2, channels, channels)
	                setattr(self, f"conv{level}", merge)

	    @classmethod
	    def _roi_size(cls, level):
	        """Return the (height, width) roi_align output size for ``level``."""
	        factor = 1 << (5 - level)
	        base_height, base_width = cls._COARSEST_ROI_SIZE
	        return base_height * factor, base_width * factor

	    def _fuse_global(self, feat, level, coarse_feat_whole, crop_area_resize, bbox):
	        """Merge the cropped G2LFusion output of ``level`` into ``feat``."""
	        # The per-level inputs are ordered coarsest first: index 0 <-> level 5.
	        index = 5 - level
	        out_size = self._roi_size(level)
	        fusion = getattr(self, f"g2l{level}")
	        merge = getattr(self, f"conv{level}")

	        global_feat = fusion(coarse_feat_whole[index], crop_area_resize[index])
	        global_feat = torch_roi_align(
	            global_feat,
	            bbox,
	            out_size,
	            out_size[0] / self._REFERENCE_HEIGHT,
	            aligned=True,
	        )
	        return merge(torch.cat([feat, global_feat], dim=1))

	    def forward(self,
	                input_tensor,
	                guide_plus,
	                guide_cat,
	                crop_area_resize=None,
	                bbox=None,
	                fine_feat_crop=None,
	                coarse_feat_whole=None,
	                coarse_feat_whole_hack=None,
	                coarse_feat_crop=None):
	        """Run the encoder/decoder and return the six per-level feature maps.

	        Args:
	            input_tensor: Input of shape (N, n_channels, H, W).
	            guide_plus: Accepted for interface compatibility; not used.
	            guide_cat: Sequence of five tensors, coarsest first. Entry ``i``
	                is concatenated along dim 1 with the decoder feature before
	                the ``i``-th upsampling step; the ``up*`` channel budgets
	                leave 256 channels for each entry.
	            crop_area_resize: Sequence of six per-level inputs, coarsest
	                first, passed as second argument to the G2LFusion modules.
	                Required when ``g2l`` is enabled.
	            bbox: Boxes handed to ``roi_align``. Required when ``g2l`` is
	                enabled.
	            fine_feat_crop: Accepted for interface compatibility; not used.
	            coarse_feat_whole: Sequence of six per-level inputs, coarsest
	                first, passed as first argument to the G2LFusion modules.
	                Required when ``g2l`` is enabled.
	            coarse_feat_whole_hack: If not ``None``, replaces
	                ``coarse_feat_whole``.
	            coarse_feat_crop: Accepted for interface compatibility; not used.

	        Returns:
	            ``[x1, x2, x3, x4, x5, x6]``: decoder features from the finest
	            level (32 channels) to the coarsest level (256 channels).

	        Raises:
	            TypeError: If ``g2l`` is enabled and ``coarse_feat_whole``,
	                ``crop_area_resize`` or ``bbox`` is ``None``.
	        """
	        # apply unscaled feat to swin
	        if coarse_feat_whole_hack is not None:
	            coarse_feat_whole = coarse_feat_whole_hack

	        if self.g2l:
	            # Fail before the encoder runs instead of part-way through the
	            # network with "'NoneType' object is not subscriptable".
	            required = (
	                ("coarse_feat_whole", coarse_feat_whole),
	                ("crop_area_resize", crop_area_resize),
	                ("bbox", bbox),
	            )
	            missing = [name for name, value in required if value is None]
	            if missing:
	                raise TypeError(
	                    "UNetv1 was built with g2l enabled, so forward() requires: "
	                    + ", ".join(missing)
	                )

	        # Encoder: features for levels 0 (finest) .. 5 (coarsest).
	        skips = [self.inc(input_tensor)]
	        for down in (self.down1, self.down2, self.down3, self.down4, self.down5):
	            skips.append(down(skips[-1]))

	        x = skips.pop()
	        if self.g2l:
	            x = self._fuse_global(x, 5, coarse_feat_whole, crop_area_resize, bbox)
	        decoded = [x]

	        # Decoder: walk from level 4 down to level 0. Popping the skip
	        # releases the encoder tensor as soon as it has been consumed.
	        ups = (self.up1, self.up2, self.up3, self.up4, self.up5)
	        for step, up in enumerate(ups):
	            level = 4 - step
	            x = up(torch.cat([x, guide_cat[step]], dim=1), skips.pop())
	            if self.g2l:
	                x = self._fuse_global(
	                    x, level, coarse_feat_whole, crop_area_resize, bbox)
	            decoded.append(x)

	        # ``decoded`` is coarsest first; callers expect finest first.
	        return decoded[::-1]