Spaces:

KyanChen
/

ai-photo-gallery

Runtime error

App Files Files Community

ai-photo-gallery / mmcls /models /backbones /mixmim.py

KyanChen

init

f549064 over 1 year ago

raw

history blame contribute delete

18.4 kB

	# Copyright (c) OpenMMLab. All rights reserved.
	from typing import List, Optional, Union

	import torch
	from mmcv.cnn import build_norm_layer
	from mmcv.cnn.bricks.drop import DropPath
	from mmcv.cnn.bricks.transformer import PatchEmbed, PatchMerging
	from mmengine.model import BaseModule
	from torch import nn
	from torch.utils.checkpoint import checkpoint

	from mmcls.models.backbones.base_backbone import BaseBackbone
	from mmcls.models.backbones.vision_transformer import TransformerEncoderLayer
	from mmcls.models.utils.attention import WindowMSA
	from mmcls.models.utils.helpers import to_2tuple
	from mmcls.registry import MODELS


	class MixMIMWindowAttention(WindowMSA):
	"""MixMIM Window Attention.

	Compared with WindowMSA, we add some modifications
	in ``forward`` to meet the requirement of MixMIM during
	pretraining.

	Implements one windown attention in MixMIM.
	Args:
	embed_dims (int): The feature dimension.
	window_size (list): The height and width of the window.
	num_heads (int): The number of head in attention.
	qkv_bias (bool): Whether to add bias for qkv in attention modules.
	Defaults to True.
	qk_scale (float, optional): Override default qk scale of
	``head_dim ** -0.5`` if set. Defaults to None.
	attn_drop_rate (float): attention drop rate.
	Defaults to 0.
	proj_drop_rate (float): Probability of an element to be zeroed.
	Defaults to 0.
	init_cfg (dict, optional): Initialization config dict.
	Defaults to None.
	"""

	def __init__(self,
	embed_dims,
	window_size,
	num_heads,
	qkv_bias=True,
	qk_scale=None,
	attn_drop_rate=0.,
	proj_drop_rate=0.,
	init_cfg=None):

	super().__init__(
	embed_dims=embed_dims,
	window_size=window_size,
	num_heads=num_heads,
	qkv_bias=qkv_bias,
	qk_scale=qk_scale,
	attn_drop=attn_drop_rate,
	proj_drop=proj_drop_rate,
	init_cfg=init_cfg)

	def forward(self, x, mask=None):

	B_, N, C = x.shape
	qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads,
	C // self.num_heads).permute(2, 0, 3, 1, 4)
	q, k, v = qkv[0], qkv[1], qkv[
	2] # make torchscript happy (cannot use tensor as tuple)

	q = q * self.scale
	attn = (q @ k.transpose(-2, -1))

	relative_position_bias = self.relative_position_bias_table[
	self.relative_position_index.view(-1)].view(
	self.window_size[0] * self.window_size[1],
	self.window_size[0] * self.window_size[1],
	-1) # WhWw,WhWw,nH
	relative_position_bias = relative_position_bias.permute(
	2, 0, 1).contiguous() # nH, WhWw, WhWw
	attn = attn + relative_position_bias.unsqueeze(0)

	if mask is not None:
	mask = mask.reshape(B_, 1, 1, N)
	mask_new = mask * mask.transpose(
	2, 3) + (1 - mask) * (1 - mask).transpose(2, 3)
	mask_new = 1 - mask_new

	if mask_new.dtype == torch.float16:
	attn = attn - 65500 * mask_new
	else:
	attn = attn - 1e30 * mask_new

	attn = self.softmax(attn)
	else:
	attn = self.softmax(attn)

	attn = self.attn_drop(attn)

	x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
	x = self.proj(x)
	x = self.proj_drop(x)
	return x


	class MixMIMBlock(TransformerEncoderLayer):
	"""MixMIM Block. Implements one block in MixMIM.

	Args:
	embed_dims (int): The feature dimension.
	input_resolution (tuple): Input resolution of this layer.
	num_heads (int): The number of head in attention,
	window_size (list): The height and width of the window.
	mlp_ratio (int): The MLP ration in FFN.
	num_fcs (int): The number of linear layers in a block.
	qkv_bias (bool): Whether to add bias for qkv in attention modules.
	Defaults to True.
	proj_drop_rate (float): Probability of an element to be zeroed.
	Defaults to 0.
	attn_drop_rate (float): attention drop rate.
	Defaults to 0.
	drop_path_rate (float): stochastic depth rate.
	Defaults to 0.
	norm_cfg (dict): Config dict for normalization layer.
	Defaults to ``dict(type='LN')``.
	init_cfg (dict, optional): Initialization config dict.
	Defaults to None.
	"""

	def __init__(self,
	embed_dims,
	input_resolution,
	num_heads,
	window_size=7,
	mlp_ratio=4.,
	num_fcs=2,
	qkv_bias=True,
	proj_drop_rate=0.,
	attn_drop_rate=0.,
	drop_path_rate=0.,
	act_cfg=dict(type='GELU'),
	norm_cfg=dict(type='LN'),
	init_cfg: Optional[Union[List[dict], dict]] = None) -> None:

	super().__init__(
	embed_dims=embed_dims,
	num_heads=num_heads,
	feedforward_channels=int(mlp_ratio * embed_dims),
	drop_rate=proj_drop_rate,
	attn_drop_rate=attn_drop_rate,
	drop_path_rate=drop_path_rate,
	num_fcs=num_fcs,
	qkv_bias=qkv_bias,
	act_cfg=act_cfg,
	norm_cfg=norm_cfg,
	init_cfg=init_cfg)

	self.embed_dims = embed_dims
	self.input_resolution = input_resolution
	self.num_heads = num_heads
	self.window_size = window_size
	self.mlp_ratio = mlp_ratio

	if min(self.input_resolution) <= self.window_size:
	self.window_size = min(self.input_resolution)

	self.attn = MixMIMWindowAttention(
	embed_dims=embed_dims,
	window_size=to_2tuple(self.window_size),
	num_heads=num_heads,
	qkv_bias=qkv_bias,
	attn_drop_rate=attn_drop_rate,
	proj_drop_rate=proj_drop_rate)

	self.drop_path = DropPath(
	drop_path_rate) if drop_path_rate > 0. else nn.Identity()

	@staticmethod
	def window_reverse(windows, H, W, window_size):
	B = int(windows.shape[0] / (H * W / window_size / window_size))
	x = windows.view(B, H // window_size, W // window_size, window_size,
	window_size, -1)
	x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
	return x

	@staticmethod
	def window_partition(x, window_size):
	B, H, W, C = x.shape
	x = x.view(B, H // window_size, window_size, W // window_size,
	window_size, C)
	windows = x.permute(0, 1, 3, 2, 4, 5).contiguous()
	windows = windows.view(-1, window_size, window_size, C)
	return windows

	def forward(self, x, attn_mask=None):
	H, W = self.input_resolution
	B, L, C = x.shape

	shortcut = x
	x = self.norm1(x)
	x = x.view(B, H, W, C)

	# partition windows
	x_windows = self.window_partition(
	x, self.window_size) # nW*B, window_size, window_size, C
	x_windows = x_windows.view(-1, self.window_size * self.window_size,
	C) # nWB, window_sizewindow_size, C
	if attn_mask is not None:
	attn_mask = attn_mask.repeat(B, 1, 1) # B, N, 1
	attn_mask = attn_mask.view(B, H, W, 1)
	attn_mask = self.window_partition(attn_mask, self.window_size)
	attn_mask = attn_mask.view(-1, self.window_size * self.window_size,
	1)

	# W-MSA/SW-MSA
	attn_windows = self.attn(
	x_windows, mask=attn_mask) # nWB, window_sizewindow_size, C

	# merge windows
	attn_windows = attn_windows.view(-1, self.window_size,
	self.window_size, C)
	x = self.window_reverse(attn_windows, H, W,
	self.window_size) # B H' W' C

	x = x.view(B, H * W, C)

	x = shortcut + self.drop_path(x)

	x = self.ffn(self.norm2(x), identity=x) # ffn contains DropPath

	return x


	class MixMIMLayer(BaseModule):
	"""Implements one MixMIM layer, which may contains several MixMIM blocks.

	Args:
	embed_dims (int): The feature dimension.
	input_resolution (tuple): Input resolution of this layer.
	depth (int): The number of blocks in this layer.
	num_heads (int): The number of head in attention,
	window_size (list): The height and width of the window.
	mlp_ratio (int): The MLP ration in FFN.
	qkv_bias (bool): Whether to add bias for qkv in attention modules.
	Defaults to True.
	proj_drop_rate (float): Probability of an element to be zeroed.
	Defaults to 0.
	attn_drop_rate (float): attention drop rate.
	Defaults to 0.
	drop_path_rate (float): stochastic depth rate.
	Defaults to 0.
	norm_cfg (dict): Config dict for normalization layer.
	Defaults to ``dict(type='LN')``.
	downsample (class, optional): Downsample the output of blocks b
	y patch merging.Defaults to None.
	use_checkpoint (bool): Whether use the checkpoint to
	reduce GPU memory cost.
	init_cfg (dict, optional): Initialization config dict.
	Defaults to None.
	"""

	def __init__(self,
	embed_dims: int,
	input_resolution: int,
	depth: int,
	num_heads: int,
	window_size: int,
	mlp_ratio=4.,
	qkv_bias=True,
	proj_drop_rate=0.,
	attn_drop_rate=0.,
	drop_path_rate=[0.],
	norm_cfg=dict(type='LN'),
	downsample=None,
	use_checkpoint=False,
	init_cfg: Optional[Union[List[dict], dict]] = None) -> None:
	super().__init__(init_cfg=init_cfg)
	self.embed_dims = embed_dims
	self.input_resolution = input_resolution
	self.depth = depth
	self.use_checkpoint = use_checkpoint

	# build blocks
	self.blocks = nn.ModuleList()
	for i in range(depth):
	self.blocks.append(
	MixMIMBlock(
	embed_dims=embed_dims,
	input_resolution=input_resolution,
	num_heads=num_heads,
	window_size=window_size,
	mlp_ratio=mlp_ratio,
	qkv_bias=qkv_bias,
	proj_drop_rate=proj_drop_rate,
	attn_drop_rate=attn_drop_rate,
	drop_path_rate=drop_path_rate[i],
	norm_cfg=norm_cfg))
	# patch merging layer
	if downsample is not None:
	self.downsample = downsample(
	in_channels=embed_dims,
	out_channels=2 * embed_dims,
	norm_cfg=norm_cfg)
	else:
	self.downsample = None

	def forward(self, x, attn_mask=None):
	for blk in self.blocks:
	if self.use_checkpoint:
	x = checkpoint(blk, x, attn_mask)
	else:
	x = blk(x, attn_mask=attn_mask)
	if self.downsample is not None:
	x, _ = self.downsample(x, self.input_resolution)
	return x

	def extra_repr(self) -> str:
	return f'dim={self.embed_dims}, \
	input_resolution={self.input_resolution}, depth={self.depth}'


	@MODELS.register_module()
	class MixMIMTransformer(BaseBackbone):
	"""MixMIM backbone.

	A PyTorch implement of : ` MixMIM: Mixed and Masked Image
	Modeling for Efficient Visual Representation Learning
	<https://arxiv.org/abs/2205.13137>`_

	Args:
	arch (str \| dict): MixMIM architecture. If use string,
	choose from 'base','large' and 'huge'.
	If use dict, it should have below keys:

	- embed_dims (int): The dimensions of embedding.
	- depths (int): The number of transformer encoder layers.
	- num_heads (int): The number of heads in attention modules.

	Defaults to 'base'.
	mlp_ratio (int): The mlp ratio in FFN. Defaults to 4.
	img_size (int \| tuple): The expected input image shape. Because we
	support dynamic input shape, just set the argument to mlp_ratio
	the most common input image shape. Defaults to 224.
	patch_size (int \| tuple): The patch size in patch embedding.
	Defaults to 16.
	in_channels (int): The num of input channels. Defaults to 3.
	window_size (list): The height and width of the window.
	qkv_bias (bool): Whether to add bias for qkv in attention modules.
	Defaults to True.
	patch_cfg (dict): Extra config dict for patch embedding.
	Defaults to an empty dict.
	norm_cfg (dict): Config dict for normalization layer.
	Defaults to ``dict(type='LN')``.
	drop_rate (float): Probability of an element to be zeroed.
	Defaults to 0.
	drop_path_rate (float): stochastic depth rate. Defaults to 0.
	attn_drop_rate (float): attention drop rate. Defaults to 0.
	use_checkpoint (bool): Whether use the checkpoint to
	reduce GPU memory cost.
	init_cfg (dict, optional): Initialization config dict.
	Defaults to None.
	"""
	arch_zoo = {
	**dict.fromkeys(
	['b', 'base'], {
	'embed_dims': 128,
	'depths': [2, 2, 18, 2],
	'num_heads': [4, 8, 16, 32]
	}),
	**dict.fromkeys(
	['l', 'large'], {
	'embed_dims': 192,
	'depths': [2, 2, 18, 2],
	'num_heads': [6, 12, 24, 48]
	}),
	**dict.fromkeys(
	['h', 'huge'], {
	'embed_dims': 352,
	'depths': [2, 2, 18, 2],
	'num_heads': [11, 22, 44, 88]
	}),
	}

	def __init__(
	self,
	arch='base',
	mlp_ratio=4,
	img_size=224,
	patch_size=4,
	in_channels=3,
	window_size=[14, 14, 14, 7],
	qkv_bias=True,
	patch_cfg=dict(),
	norm_cfg=dict(type='LN'),
	drop_rate=0.0,
	drop_path_rate=0.0,
	attn_drop_rate=0.0,
	use_checkpoint=False,
	init_cfg: Optional[dict] = None,
	) -> None:
	super(MixMIMTransformer, self).__init__(init_cfg=init_cfg)

	if isinstance(arch, str):
	arch = arch.lower()
	assert arch in set(self.arch_zoo), \
	f'Arch {arch} is not in default archs {set(self.arch_zoo)}'
	self.arch_settings = self.arch_zoo[arch]
	else:
	essential_keys = {
	'embed_dims', 'num_layers', 'num_heads', 'feedforward_channels'
	}
	assert isinstance(arch, dict) and essential_keys <= set(arch), \
	f'Custom arch needs a dict with keys {essential_keys}'
	self.arch_settings = arch

	self.embed_dims = self.arch_settings['embed_dims']
	self.depths = self.arch_settings['depths']
	self.num_heads = self.arch_settings['num_heads']

	self.encoder_stride = 32

	self.num_layers = len(self.depths)
	self.qkv_bias = qkv_bias
	self.drop_rate = drop_rate
	self.attn_drop_rate = attn_drop_rate
	self.use_checkpoint = use_checkpoint
	self.mlp_ratio = mlp_ratio
	self.window_size = window_size

	_patch_cfg = dict(
	in_channels=in_channels,
	input_size=img_size,
	embed_dims=self.embed_dims,
	conv_type='Conv2d',
	kernel_size=patch_size,
	stride=patch_size,
	norm_cfg=dict(type='LN'),
	)
	_patch_cfg.update(patch_cfg)
	self.patch_embed = PatchEmbed(**_patch_cfg)
	self.patch_resolution = self.patch_embed.init_out_size

	self.dpr = [
	x.item()
	for x in torch.linspace(0, drop_path_rate, sum(self.depths))
	]
	self.layers = nn.ModuleList()
	for i_layer in range(self.num_layers):
	self.layers.append(
	MixMIMLayer(
	embed_dims=int(self.embed_dims * 2**i_layer),
	input_resolution=(self.patch_resolution[0] // (2**i_layer),
	self.patch_resolution[1] //
	(2**i_layer)),
	depth=self.depths[i_layer],
	num_heads=self.num_heads[i_layer],
	window_size=self.window_size[i_layer],
	mlp_ratio=self.mlp_ratio,
	qkv_bias=self.qkv_bias,
	proj_drop_rate=self.drop_rate,
	attn_drop_rate=self.attn_drop_rate,
	drop_path_rate=self.dpr[sum(self.depths[:i_layer]
	):sum(self.depths[:i_layer +
	1])],
	norm_cfg=norm_cfg,
	downsample=PatchMerging if
	(i_layer < self.num_layers - 1) else None,
	use_checkpoint=self.use_checkpoint))

	self.num_features = int(self.embed_dims * 2**(self.num_layers - 1))
	self.drop_after_pos = nn.Dropout(p=self.drop_rate)

	self.avgpool = nn.AdaptiveAvgPool1d(1)
	self.num_patches = self.patch_resolution[0] * self.patch_resolution[1]
	self.absolute_pos_embed = nn.Parameter(
	torch.zeros(1, self.num_patches, self.embed_dims),
	requires_grad=False)

	_, self.norm = build_norm_layer(norm_cfg, self.num_features)

	def forward(self, x: torch.Tensor):
	x, _ = self.patch_embed(x)

	x = x + self.absolute_pos_embed
	x = self.drop_after_pos(x)

	for layer in self.layers:
	x = layer(x, attn_mask=None)

	x = self.norm(x)
	x = self.avgpool(x.transpose(1, 2)) # B C 1
	x = torch.flatten(x, 1)

	return (x, )