# Copyright (c) OpenMMLab. All rights reserved.
import math
from collections import OrderedDict
from typing import List, Optional, Tuple

import torch
import torch.nn as nn
from mmcv.cnn import build_activation_layer
from mmengine.model import Sequential
from mmengine.model.weight_init import trunc_normal_

from mmcls.registry import MODELS
from .cls_head import ClsHead


@MODELS.register_module()
class VisionTransformerClsHead(ClsHead):
"""Vision Transformer classifier head.
Args:
num_classes (int): Number of categories excluding the background
category.
in_channels (int): Number of channels in the input feature map.
hidden_dim (int, optional): Number of the dimensions for hidden layer.
Defaults to None, which means no extra hidden layer.
act_cfg (dict): The activation config. Only available during
pre-training. Defaults to ``dict(type='Tanh')``.
init_cfg (dict): The extra initialization configs. Defaults to
``dict(type='Constant', layer='Linear', val=0)``.
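
    Examples:
        A minimal, illustrative sketch (the shapes and values below are
        assumptions for demonstration, not part of the original file):

        >>> import torch
        >>> head = VisionTransformerClsHead(num_classes=10, in_channels=768)
        >>> # ``feats`` mimics a ViT backbone output: a tuple of stage
        >>> # outputs, the last being ``[patch_token, cls_token]``.
        >>> feats = ([torch.rand(2, 196, 768), torch.rand(2, 768)],)
        >>> head(feats).shape
        torch.Size([2, 10])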
"""

    def __init__(self,
num_classes: int,
in_channels: int,
hidden_dim: Optional[int] = None,
act_cfg: dict = dict(type='Tanh'),
init_cfg: dict = dict(type='Constant', layer='Linear', val=0),
**kwargs):
super(VisionTransformerClsHead, self).__init__(
init_cfg=init_cfg, **kwargs)
self.in_channels = in_channels
self.num_classes = num_classes
self.hidden_dim = hidden_dim
self.act_cfg = act_cfg
if self.num_classes <= 0:
raise ValueError(
f'num_classes={num_classes} must be a positive integer')
self._init_layers()

    def _init_layers(self):
        """Init hidden layer if exists."""
if self.hidden_dim is None:
layers = [('head', nn.Linear(self.in_channels, self.num_classes))]
else:
layers = [
('pre_logits', nn.Linear(self.in_channels, self.hidden_dim)),
('act', build_activation_layer(self.act_cfg)),
('head', nn.Linear(self.hidden_dim, self.num_classes)),
]
self.layers = Sequential(OrderedDict(layers))

    def init_weights(self):
        """Init weights of hidden layer if exists."""
super(VisionTransformerClsHead, self).init_weights()
# Modified from ClassyVision
if hasattr(self.layers, 'pre_logits'):
            # Lecun normal init: truncated normal with std = sqrt(1 / fan_in)
trunc_normal_(
self.layers.pre_logits.weight,
std=math.sqrt(1 / self.layers.pre_logits.in_features))
nn.init.zeros_(self.layers.pre_logits.bias)

    def pre_logits(self, feats: Tuple[List[torch.Tensor]]) -> torch.Tensor:
        """The process before the final classification head.

        The input ``feats`` is a tuple of lists of tensors, where each
        tensor is the feature of a backbone stage. ``VisionTransformerClsHead``
        takes the feature of the last stage and, if a hidden layer exists,
        forwards it through that layer.
        """
_, cls_token = feats[-1]
if self.hidden_dim is None:
return cls_token
else:
x = self.layers.pre_logits(cls_token)
return self.layers.act(x)

    def forward(self, feats: Tuple[List[torch.Tensor]]) -> torch.Tensor:
"""The forward process."""
pre_logits = self.pre_logits(feats)
# The final classification head.
cls_score = self.layers.head(pre_logits)
return cls_score
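

# A minimal config sketch for reference (illustrative; the ``hidden_dim``
# value follows the common ViT-Base pre-training setup and is an
# assumption, not part of this file):
#
#     head=dict(
#         type='VisionTransformerClsHead',
#         num_classes=1000,
#         in_channels=768,
#         hidden_dim=3072,  # enables the pre_logits + Tanh branch
#     ),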