# Copyright (c) OpenMMLab. All rights reserved.
from typing import Sequence

import torch
import torch.nn as nn
from mmcv.cnn.bricks import DropPath, build_activation_layer, build_norm_layer
from mmengine.model import BaseModule

from mmcls.registry import MODELS
from .base_backbone import BaseBackbone


class PatchEmbed(nn.Module):
    """Patch Embedding module implemented by a layer of convolution.

    Input: tensor in shape [B, in_chans, H, W]
    Output: tensor in shape [B, embed_dim, H', W'], where H' and W' are the
    spatial sizes produced by the strided convolution.

    Args:
        patch_size (int): Patch size of the patch embedding, used as the
            kernel size of the convolution. Defaults to 16.
        stride (int): Stride of the patch embedding. Defaults to 16.
        padding (int): Padding of the patch embedding. Defaults to 0.
        in_chans (int): Input channels. Defaults to 3.
        embed_dim (int): Output dimension of the patch embedding.
            Defaults to 768.
        norm_layer (module): Normalization module. It is called as
            ``norm_layer(embed_dim)``. Defaults to None (not use).
    """

    def __init__(self,
                 patch_size=16,
                 stride=16,
                 padding=0,
                 in_chans=3,
                 embed_dim=768,
                 norm_layer=None):
        super().__init__()
        self.proj = nn.Conv2d(
            in_chans,
            embed_dim,
            kernel_size=patch_size,
            stride=stride,
            padding=padding)
        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

    def forward(self, x):
        x = self.proj(x)
        x = self.norm(x)
        return x


class Pooling(nn.Module):
    """Pooling module.

    Applies a stride-1 average pooling and returns the pooled tensor minus
    its input.

    Args:
        pool_size (int): Pooling size. Defaults to 3.
    """

    def __init__(self, pool_size=3):
        super().__init__()
        self.pool = nn.AvgPool2d(
            pool_size,
            stride=1,
            padding=pool_size // 2,
            count_include_pad=False)

    def forward(self, x):
        # PoolFormerBlock adds the result of this module back onto a residual
        # branch, so only the difference to the input is returned here.
        return self.pool(x) - x


class Mlp(nn.Module):
    """Mlp implemented with 1*1 convolutions.

    Input: Tensor with shape [B, C, H, W].
    Output: Tensor with shape [B, C, H, W].

    Args:
        in_features (int): Dimension of input features.
        hidden_features (int, optional): Dimension of hidden features.
            Falls back to ``in_features`` when not given.
        out_features (int, optional): Dimension of output features.
            Falls back to ``in_features`` when not given.
        act_cfg (dict): The config dict for activation between pointwise
            convolution. Defaults to ``dict(type='GELU')``.
        drop (float): Dropout rate. Defaults to 0.0.
    """

    def __init__(self,
                 in_features,
                 hidden_features=None,
                 out_features=None,
                 act_cfg=dict(type='GELU'),
                 drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Conv2d(in_features, hidden_features, 1)
        self.act = build_activation_layer(act_cfg)
        self.fc2 = nn.Conv2d(hidden_features, out_features, 1)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


class PoolFormerBlock(BaseModule):
    """PoolFormer Block.

    Two residual branches are applied in sequence: a pooling token mixer and
    an Mlp. Each branch is preceded by a norm layer, scaled per channel by a
    learnable layer-scale vector and wrapped in stochastic depth.

    Args:
        dim (int): Embedding dim.
        pool_size (int): Pooling size. Defaults to 3.
        mlp_ratio (float): Mlp expansion ratio. Defaults to 4.
        norm_cfg (dict): The config dict for norm layers.
            Defaults to ``dict(type='GN', num_groups=1)``.
        act_cfg (dict): The config dict for activation between pointwise
            convolution. Defaults to ``dict(type='GELU')``.
        drop (float): Dropout rate. Defaults to 0.
        drop_path (float): Stochastic depth rate. Defaults to 0.
        layer_scale_init_value (float): Init value for Layer Scale.
            Defaults to 1e-5.
    """

    def __init__(self,
                 dim,
                 pool_size=3,
                 mlp_ratio=4.,
                 norm_cfg=dict(type='GN', num_groups=1),
                 act_cfg=dict(type='GELU'),
                 drop=0.,
                 drop_path=0.,
                 layer_scale_init_value=1e-5):
        super().__init__()

        self.norm1 = build_norm_layer(norm_cfg, dim)[1]
        self.token_mixer = Pooling(pool_size=pool_size)
        self.norm2 = build_norm_layer(norm_cfg, dim)[1]
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_cfg=act_cfg,
            drop=drop)

        # The following two techniques are useful to train deep PoolFormers.
        self.drop_path = DropPath(drop_path) if drop_path > 0. \
            else nn.Identity()
        self.layer_scale_1 = nn.Parameter(
            layer_scale_init_value * torch.ones((dim)), requires_grad=True)
        self.layer_scale_2 = nn.Parameter(
            layer_scale_init_value * torch.ones((dim)), requires_grad=True)

    def forward(self, x):
        # The layer-scale vectors have shape (dim,); the two trailing
        # singleton axes let them broadcast over the spatial dimensions.
        x = x + self.drop_path(
            self.layer_scale_1.unsqueeze(-1).unsqueeze(-1) *
            self.token_mixer(self.norm1(x)))
        x = x + self.drop_path(
            self.layer_scale_2.unsqueeze(-1).unsqueeze(-1) *
            self.mlp(self.norm2(x)))
        return x


def basic_blocks(dim,
                 index,
                 layers,
                 pool_size=3,
                 mlp_ratio=4.,
                 norm_cfg=dict(type='GN', num_groups=1),
                 act_cfg=dict(type='GELU'),
                 drop_rate=.0,
                 drop_path_rate=0.,
                 layer_scale_init_value=1e-5):
    """Generate the PoolFormer blocks of one stage.

    The stochastic depth rate of each block grows linearly with the block's
    global position in the network, from 0 for the very first block up to
    ``drop_path_rate`` for the very last one.

    Args:
        dim (int): Embedding dim of the stage.
        index (int): Index of the stage inside ``layers``.
        layers (list[int]): Number of blocks at each stage.
        pool_size (int): Pooling size. Defaults to 3.
        mlp_ratio (float): Mlp expansion ratio. Defaults to 4.
        norm_cfg (dict): The config dict for norm layers.
            Defaults to ``dict(type='GN', num_groups=1)``.
        act_cfg (dict): The config dict for activation between pointwise
            convolution. Defaults to ``dict(type='GELU')``.
        drop_rate (float): Dropout rate. Defaults to 0.
        drop_path_rate (float): Maximum stochastic depth rate. Defaults to 0.
        layer_scale_init_value (float): Init value for Layer Scale.
            Defaults to 1e-5.

    Returns:
        nn.Sequential: The PoolFormer blocks of the stage.
    """
    blocks_before = sum(layers[:index])
    # A network with a single block has no depth to interpolate over; the
    # guard avoids a division by zero and yields a rate of 0 for that block.
    last_block_idx = max(sum(layers) - 1, 1)

    blocks = []
    for block_idx in range(layers[index]):
        block_dpr = drop_path_rate * (block_idx +
                                      blocks_before) / last_block_idx
        blocks.append(
            PoolFormerBlock(
                dim,
                pool_size=pool_size,
                mlp_ratio=mlp_ratio,
                norm_cfg=norm_cfg,
                act_cfg=act_cfg,
                drop=drop_rate,
                drop_path=block_dpr,
                layer_scale_init_value=layer_scale_init_value,
            ))
    return nn.Sequential(*blocks)


@MODELS.register_module()
class PoolFormer(BaseBackbone):
    """PoolFormer.

    A PyTorch implementation of PoolFormer introduced by:
    `MetaFormer is Actually What You Need for Vision <https://arxiv.org/abs/2111.11418>`_

    Modified from the `official repo
    <https://github.com/sail-sg/poolformer/blob/main/models/poolformer.py>`.

    Args:
        arch (str | dict): The model's architecture. If string, it should be
            one of architecture in ``PoolFormer.arch_settings``. And if dict,
            it must include the keys ``layers`` and ``embed_dims`` and may
            include the other two:

            - layers (list[int]): Number of blocks at each stage.
            - embed_dims (list[int]): The number of channels at each stage.
            - mlp_ratios (list[int]): Expansion ratio of MLPs.
              Defaults to ``[4, 4, 4, 4]``.
            - layer_scale_init_value (float): Init value for Layer Scale.
              Defaults to 1e-5.

            Defaults to 's12'.

        pool_size (int): Pooling size of the token mixers. Defaults to 3.
        norm_cfg (dict): The config dict for norm layers.
            Defaults to ``dict(type='GN', num_groups=1)``.
        act_cfg (dict): The config dict for activation between pointwise
            convolution. Defaults to ``dict(type='GELU')``.
        in_patch_size (int): The patch size of input image patch embedding.
            Defaults to 7.
        in_stride (int): The stride of input image patch embedding.
            Defaults to 4.
        in_pad (int): The padding of input image patch embedding.
            Defaults to 2.
        down_patch_size (int): The patch size of downsampling patch embedding.
            Defaults to 3.
        down_stride (int): The stride of downsampling patch embedding.
            Defaults to 2.
        down_pad (int): The padding of downsampling patch embedding.
            Defaults to 1.
        drop_rate (float): Dropout rate. Defaults to 0.
        drop_path_rate (float): Stochastic depth rate. Defaults to 0.
        out_indices (Sequence | int): Output from which network position.
            For the predefined architectures, index 0-6 respectively
            corresponds to
            [stage1, downsampling, stage2, downsampling, stage3, downsampling, stage4].
            Negative values count from the end of the network.
            Defaults to -1, means the last stage.
        frozen_stages (int): Number of leading network positions (stages and
            downsampling layers) to freeze (all param fixed). When positive,
            the input patch embedding is frozen as well.
            Defaults to 0, which means not freezing any parameters.
        init_cfg (dict, optional): Initialization config dict
    """  # noqa: E501

    # Predefined architectures. Each entry provides:
    # --layers: [x,x,x,x], numbers of layers for the four stages
    # --embed_dims, --mlp_ratios:
    #     embedding dims and mlp ratios for the four stages
    # --layer_scale_init_value: init value of the layer-scale parameters
    arch_settings = {
        's12': {
            'layers': [2, 2, 6, 2],
            'embed_dims': [64, 128, 320, 512],
            'mlp_ratios': [4, 4, 4, 4],
            'layer_scale_init_value': 1e-5,
        },
        's24': {
            'layers': [4, 4, 12, 4],
            'embed_dims': [64, 128, 320, 512],
            'mlp_ratios': [4, 4, 4, 4],
            'layer_scale_init_value': 1e-5,
        },
        's36': {
            'layers': [6, 6, 18, 6],
            'embed_dims': [64, 128, 320, 512],
            'mlp_ratios': [4, 4, 4, 4],
            'layer_scale_init_value': 1e-6,
        },
        'm36': {
            'layers': [6, 6, 18, 6],
            'embed_dims': [96, 192, 384, 768],
            'mlp_ratios': [4, 4, 4, 4],
            'layer_scale_init_value': 1e-6,
        },
        'm48': {
            'layers': [8, 8, 24, 8],
            'embed_dims': [96, 192, 384, 768],
            'mlp_ratios': [4, 4, 4, 4],
            'layer_scale_init_value': 1e-6,
        },
    }

    def __init__(self,
                 arch='s12',
                 pool_size=3,
                 norm_cfg=dict(type='GN', num_groups=1),
                 act_cfg=dict(type='GELU'),
                 in_patch_size=7,
                 in_stride=4,
                 in_pad=2,
                 down_patch_size=3,
                 down_stride=2,
                 down_pad=1,
                 drop_rate=0.,
                 drop_path_rate=0.,
                 out_indices=-1,
                 frozen_stages=0,
                 init_cfg=None):

        super().__init__(init_cfg=init_cfg)

        if isinstance(arch, str):
            assert arch in self.arch_settings, \
                f'Unavailable arch, please choose from ' \
                f'({set(self.arch_settings)}) or pass a dict.'
            arch = self.arch_settings[arch]
        elif isinstance(arch, dict):
            assert 'layers' in arch and 'embed_dims' in arch, \
                f'The arch dict must have "layers" and "embed_dims", ' \
                f'but got {list(arch.keys())}.'

        layers = arch['layers']
        embed_dims = arch['embed_dims']
        mlp_ratios = arch['mlp_ratios'] \
            if 'mlp_ratios' in arch else [4, 4, 4, 4]
        layer_scale_init_value = arch['layer_scale_init_value'] \
            if 'layer_scale_init_value' in arch else 1e-5

        self.patch_embed = PatchEmbed(
            patch_size=in_patch_size,
            stride=in_stride,
            padding=in_pad,
            in_chans=3,
            embed_dim=embed_dims[0])

        # set the main block in network
        network = []
        # Channel count of the feature map produced at every network
        # position. It is used below to size the output norm layers, so they
        # stay correct even when a downsampling layer is skipped.
        position_dims = []
        for i in range(len(layers)):
            stage = basic_blocks(
                embed_dims[i],
                i,
                layers,
                pool_size=pool_size,
                mlp_ratio=mlp_ratios[i],
                norm_cfg=norm_cfg,
                act_cfg=act_cfg,
                drop_rate=drop_rate,
                drop_path_rate=drop_path_rate,
                layer_scale_init_value=layer_scale_init_value)
            network.append(stage)
            position_dims.append(embed_dims[i])
            if i >= len(layers) - 1:
                break
            if embed_dims[i] != embed_dims[i + 1]:
                # downsampling between two stages
                network.append(
                    PatchEmbed(
                        patch_size=down_patch_size,
                        stride=down_stride,
                        padding=down_pad,
                        in_chans=embed_dims[i],
                        embed_dim=embed_dims[i + 1]))
                position_dims.append(embed_dims[i + 1])

        self.network = nn.ModuleList(network)

        if isinstance(out_indices, int):
            out_indices = [out_indices]
        assert isinstance(out_indices, Sequence), \
            f'"out_indices" must by a sequence or int, ' \
            f'get {type(out_indices)} instead.'
        # Work on a private copy: negative indices are rewritten below, which
        # must neither modify the caller's object nor fail on a tuple.
        out_indices = list(out_indices)
        # Negative indices count from the end of the network that was
        # actually built (7 positions for the predefined architectures).
        num_positions = len(self.network)
        for i, index in enumerate(out_indices):
            if index < 0:
                out_indices[i] = num_positions + index
                assert out_indices[i] >= 0, f'Invalid out_indices {index}'
        self.out_indices = out_indices
        if self.out_indices:
            for i_layer in self.out_indices:
                layer = build_norm_layer(norm_cfg, position_dims[i_layer])[1]
                layer_name = f'norm{i_layer}'
                self.add_module(layer_name, layer)

        self.frozen_stages = frozen_stages
        self._freeze_stages()

    def forward_embeddings(self, x):
        """Apply the input patch embedding to ``x``."""
        x = self.patch_embed(x)
        return x

    def forward_tokens(self, x):
        """Run ``x`` through the network and collect the requested outputs.

        Returns:
            tuple: The normalized feature maps of the positions listed in
            ``self.out_indices``, in network order.
        """
        outs = []
        for idx, block in enumerate(self.network):
            x = block(x)
            if idx in self.out_indices:
                norm_layer = getattr(self, f'norm{idx}')
                x_out = norm_layer(x)
                outs.append(x_out)
        return tuple(outs)

    def forward(self, x):
        # input embedding
        x = self.forward_embeddings(x)
        # through backbone
        x = self.forward_tokens(x)
        return x

    def _freeze_stages(self):
        """Put the frozen part of the model in eval mode and fix its params.

        Nothing is frozen unless ``self.frozen_stages`` is positive.
        """
        # Only freeze the input patch embedding when at least one network
        # position is frozen, so that ``frozen_stages=0`` freezes nothing.
        if self.frozen_stages > 0:
            self.patch_embed.eval()
            for param in self.patch_embed.parameters():
                param.requires_grad = False

        for i in range(self.frozen_stages):
            # Include both block and downsample layer.
            module = self.network[i]
            module.eval()
            for param in module.parameters():
                param.requires_grad = False
            if i in self.out_indices:
                norm_layer = getattr(self, f'norm{i}')
                norm_layer.eval()
                for param in norm_layer.parameters():
                    param.requires_grad = False

    def train(self, mode=True):
        super(PoolFormer, self).train(mode)
        # Switching the mode resets every submodule, so the frozen part has
        # to be put back into eval mode afterwards.
        self._freeze_stages()