Spaces:
Runtime error
wjf5203 committed
Commit 2aac0e2
1 Parent(s): acefb81
Add application file
This view is limited to 50 files because it contains too many changes. See raw diff.
- .DS_Store +0 -0
- GLEE/.DS_Store +0 -0
- GLEE/clip_vit_base_patch32/config.json +157 -0
- GLEE/clip_vit_base_patch32/merges.txt +0 -0
- GLEE/clip_vit_base_patch32/preprocessor_config.json +19 -0
- GLEE/clip_vit_base_patch32/pytorch_model.bin +3 -0
- GLEE/clip_vit_base_patch32/special_tokens_map.json +1 -0
- GLEE/clip_vit_base_patch32/tokenizer.json +0 -0
- GLEE/clip_vit_base_patch32/tokenizer_config.json +1 -0
- GLEE/clip_vit_base_patch32/vocab.json +0 -0
- GLEE/configs/R50.yaml +71 -0
- GLEE/configs/SwinL.yaml +79 -0
- GLEE/glee/.DS_Store +0 -0
- GLEE/glee/__init__.py +12 -0
- GLEE/glee/backbone/__init__.py +7 -0
- GLEE/glee/backbone/backbone.py +51 -0
- GLEE/glee/backbone/build.py +11 -0
- GLEE/glee/backbone/davit.py +623 -0
- GLEE/glee/backbone/eva01.py +676 -0
- GLEE/glee/backbone/eva02-dino.py +598 -0
- GLEE/glee/backbone/eva02.py +647 -0
- GLEE/glee/backbone/eva_01_utils.py +222 -0
- GLEE/glee/backbone/eva_02_utils.py +356 -0
- GLEE/glee/backbone/internimage.py +737 -0
- GLEE/glee/backbone/registry.py +14 -0
- GLEE/glee/backbone/resnet.py +731 -0
- GLEE/glee/backbone/swin.py +783 -0
- GLEE/glee/backbone/vit.py +472 -0
- GLEE/glee/backbone/vit_utils.py +222 -0
- GLEE/glee/config.py +387 -0
- GLEE/glee/config_deeplab.py +28 -0
- GLEE/glee/models/.DS_Store +0 -0
- GLEE/glee/models/glee_model.py +296 -0
- GLEE/glee/models/pixel_decoder/__init__.py +1 -0
- GLEE/glee/models/pixel_decoder/__pycache__/__init__.cpython-38.pyc +0 -0
- GLEE/glee/models/pixel_decoder/__pycache__/__init__.cpython-39.pyc +0 -0
- GLEE/glee/models/pixel_decoder/__pycache__/early_fusion.cpython-38.pyc +0 -0
- GLEE/glee/models/pixel_decoder/__pycache__/early_fusion.cpython-39.pyc +0 -0
- GLEE/glee/models/pixel_decoder/__pycache__/maskdino_encoder.cpython-38.pyc +0 -0
- GLEE/glee/models/pixel_decoder/__pycache__/maskdino_encoder.cpython-39.pyc +0 -0
- GLEE/glee/models/pixel_decoder/__pycache__/position_encoding.cpython-38.pyc +0 -0
- GLEE/glee/models/pixel_decoder/__pycache__/position_encoding.cpython-39.pyc +0 -0
- GLEE/glee/models/pixel_decoder/early_fusion.py +230 -0
- GLEE/glee/models/pixel_decoder/maskdino_encoder.py +463 -0
- GLEE/glee/models/pixel_decoder/ops/functions/__init__.py +13 -0
- GLEE/glee/models/pixel_decoder/ops/functions/__pycache__/__init__.cpython-38.pyc +0 -0
- GLEE/glee/models/pixel_decoder/ops/functions/__pycache__/__init__.cpython-39.pyc +0 -0
- GLEE/glee/models/pixel_decoder/ops/functions/__pycache__/ms_deform_attn_func.cpython-38.pyc +0 -0
- GLEE/glee/models/pixel_decoder/ops/functions/__pycache__/ms_deform_attn_func.cpython-39.pyc +0 -0
- GLEE/glee/models/pixel_decoder/ops/functions/ms_deform_attn_func.py +72 -0
.DS_Store
ADDED
Binary file (6.15 kB). View file
GLEE/.DS_Store
ADDED
Binary file (6.15 kB). View file
GLEE/clip_vit_base_patch32/config.json
ADDED
@@ -0,0 +1,157 @@
{
  "_name_or_path": "openai/clip-vit-base-patch32",
  "architectures": [
    "CLIPModel"
  ],
  "initializer_factor": 1.0,
  "logit_scale_init_value": 2.6592,
  "model_type": "clip",
  "projection_dim": 512,
  "text_config": {
    "_name_or_path": "",
    "add_cross_attention": false,
    "architectures": null,
    "attention_dropout": 0.0,
    "bad_words_ids": null,
    "bos_token_id": 0,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "dropout": 0.0,
    "early_stopping": false,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": 2,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "hidden_act": "quick_gelu",
    "hidden_size": 512,
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1"
    },
    "initializer_factor": 1.0,
    "initializer_range": 0.02,
    "intermediate_size": 2048,
    "is_decoder": false,
    "is_encoder_decoder": false,
    "label2id": {
      "LABEL_0": 0,
      "LABEL_1": 1
    },
    "layer_norm_eps": 1e-05,
    "length_penalty": 1.0,
    "max_length": 20,
    "max_position_embeddings": 77,
    "min_length": 0,
    "model_type": "clip_text_model",
    "no_repeat_ngram_size": 0,
    "num_attention_heads": 8,
    "num_beam_groups": 1,
    "num_beams": 1,
    "num_hidden_layers": 12,
    "num_return_sequences": 1,
    "output_attentions": false,
    "output_hidden_states": false,
    "output_scores": false,
    "pad_token_id": 1,
    "prefix": null,
    "projection_dim": 512,
    "problem_type": null,
    "pruned_heads": {},
    "remove_invalid_values": false,
    "repetition_penalty": 1.0,
    "return_dict": true,
    "return_dict_in_generate": false,
    "sep_token_id": null,
    "task_specific_params": null,
    "temperature": 1.0,
    "tie_encoder_decoder": false,
    "tie_word_embeddings": true,
    "tokenizer_class": null,
    "top_k": 50,
    "top_p": 1.0,
    "torch_dtype": null,
    "torchscript": false,
    "transformers_version": "4.16.0.dev0",
    "use_bfloat16": false,
    "vocab_size": 49408
  },
  "text_config_dict": null,
  "transformers_version": null,
  "vision_config": {
    "_name_or_path": "",
    "add_cross_attention": false,
    "architectures": null,
    "attention_dropout": 0.0,
    "bad_words_ids": null,
    "bos_token_id": null,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "dropout": 0.0,
    "early_stopping": false,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": null,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "hidden_act": "quick_gelu",
    "hidden_size": 768,
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1"
    },
    "image_size": 224,
    "initializer_factor": 1.0,
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "is_decoder": false,
    "is_encoder_decoder": false,
    "label2id": {
      "LABEL_0": 0,
      "LABEL_1": 1
    },
    "layer_norm_eps": 1e-05,
    "length_penalty": 1.0,
    "max_length": 20,
    "min_length": 0,
    "model_type": "clip_vision_model",
    "no_repeat_ngram_size": 0,
    "num_attention_heads": 12,
    "num_beam_groups": 1,
    "num_beams": 1,
    "num_hidden_layers": 12,
    "num_return_sequences": 1,
    "output_attentions": false,
    "output_hidden_states": false,
    "output_scores": false,
    "pad_token_id": null,
    "patch_size": 32,
    "prefix": null,
    "projection_dim": 512,
    "problem_type": null,
    "pruned_heads": {},
    "remove_invalid_values": false,
    "repetition_penalty": 1.0,
    "return_dict": true,
    "return_dict_in_generate": false,
    "sep_token_id": null,
    "task_specific_params": null,
    "temperature": 1.0,
    "tie_encoder_decoder": false,
    "tie_word_embeddings": true,
    "tokenizer_class": null,
    "top_k": 50,
    "top_p": 1.0,
    "torch_dtype": null,
    "torchscript": false,
    "transformers_version": "4.16.0.dev0",
    "use_bfloat16": false
  },
  "vision_config_dict": null
}
GLEE/clip_vit_base_patch32/merges.txt
ADDED
The diff for this file is too large to render.
See raw diff
GLEE/clip_vit_base_patch32/preprocessor_config.json
ADDED
@@ -0,0 +1,19 @@
{
  "crop_size": 224,
  "do_center_crop": true,
  "do_normalize": true,
  "do_resize": true,
  "feature_extractor_type": "CLIPFeatureExtractor",
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "resample": 3,
  "size": 224
}
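For reference, the preprocessing this file describes can be approximated with plain torchvision transforms; resample: 3 is PIL's bicubic filter and the mean/std values are the standard CLIP statistics. A minimal sketch (the image path is only an example, not a file in this commit):

from PIL import Image
from torchvision import transforms

# Mirrors CLIPFeatureExtractor with the settings above: bicubic shortest-edge
# resize to 224, center crop to 224, then per-channel normalization.
clip_preprocess = transforms.Compose([
    transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],
                         std=[0.26862954, 0.26130258, 0.27577711]),
])
pixel_values = clip_preprocess(Image.open("example.jpg").convert("RGB")).unsqueeze(0)  # (1, 3, 224, 224)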
GLEE/clip_vit_base_patch32/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a63082132ba4f97a80bea76823f544493bffa8082296d62d71581a4feff1576f
size 605247071
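This is a Git LFS pointer, not the weights themselves; the ~605 MB binary is stored as an LFS object identified by the SHA-256 above. After cloning with LFS enabled, the download can be sanity-checked against that digest and size, e.g. with a small script like this sketch:

import hashlib
import os

def sha256_of(path, chunk_size=1 << 20):
    """Stream the file so the 605 MB checkpoint never sits fully in memory."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for block in iter(lambda: f.read(chunk_size), b""):
            digest.update(block)
    return digest.hexdigest()

path = "GLEE/clip_vit_base_patch32/pytorch_model.bin"
assert os.path.getsize(path) == 605247071
assert sha256_of(path) == "a63082132ba4f97a80bea76823f544493bffa8082296d62d71581a4feff1576f"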
GLEE/clip_vit_base_patch32/special_tokens_map.json
ADDED
@@ -0,0 +1 @@
{"bos_token": {"content": "<|startoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "pad_token": "<|endoftext|>"}
GLEE/clip_vit_base_patch32/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
GLEE/clip_vit_base_patch32/tokenizer_config.json
ADDED
@@ -0,0 +1 @@
{"unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<|startoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "pad_token": "<|endoftext|>", "add_prefix_space": false, "errors": "replace", "do_lower_case": true, "name_or_path": "./clip_ViT_B_32/"}
GLEE/clip_vit_base_patch32/vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
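Taken together, the files above are a self-contained local snapshot of openai/clip-vit-base-patch32, so the Space does not have to fetch it at runtime. A minimal sketch of loading the snapshot with Hugging Face transformers (path relative to the repository root; a transformers version compatible with the bundled 4.16-era config is assumed):

import torch
from transformers import CLIPModel, CLIPProcessor

local_dir = "GLEE/clip_vit_base_patch32"
model = CLIPModel.from_pretrained(local_dir)          # config.json + pytorch_model.bin
processor = CLIPProcessor.from_pretrained(local_dir)  # tokenizer + feature-extractor files

# Text features land in the 512-d projection space declared by "projection_dim".
inputs = processor(text=["a photo of a cat"], return_tensors="pt", padding=True)
with torch.no_grad():
    text_features = model.get_text_features(**inputs)
print(text_features.shape)  # torch.Size([1, 512])

GLEE points its language backbone at this checkpoint (ARCH: clip_teacher, LANG_DIM: 512 in the configs that follow); the snippet only shows that the bundled snapshot is a standard CLIP checkpoint.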
GLEE/configs/R50.yaml
ADDED
@@ -0,0 +1,71 @@
MODEL:
  META_ARCHITECTURE: "GLEE"
  MASK_ON: True
  BACKBONE:
    FREEZE_AT: 0
    NAME: "build_resnet_backbone"
  PIXEL_MEAN: [123.675, 116.280, 103.530]
  PIXEL_STD: [58.395, 57.120, 57.375]
  RESNETS:
    DEPTH: 50
    STEM_TYPE: "basic"  # not used
    STEM_OUT_CHANNELS: 64
    STRIDE_IN_1X1: False
    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
    # NORM: "SyncBN"
    RES5_MULTI_GRID: [1, 1, 1]  # not used
  SEM_SEG_HEAD:
    NAME: "MaskDINOHead"
    IGNORE_VALUE: 255
    NUM_CLASSES: 80
    LOSS_WEIGHT: 1.0
    CONVS_DIM: 256
    MASK_DIM: 256
    NORM: "GN"
    # pixel decoder
    PIXEL_DECODER_NAME: "MaskDINOEncoder"
    DIM_FEEDFORWARD: 2048
    NUM_FEATURE_LEVELS: 3
    TOTAL_NUM_FEATURE_LEVELS: 4
    IN_FEATURES: ["res2", "res3", "res4", "res5"]
    DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
    COMMON_STRIDE: 4
    TRANSFORMER_ENC_LAYERS: 6
    FEATURE_ORDER: "low2high"
  MaskDINO:
    TRANSFORMER_DECODER_NAME: "MaskDINODecoder"
    DEEP_SUPERVISION: True
    NO_OBJECT_WEIGHT: 0.1
    CLASS_WEIGHT: 4.0
    MASK_WEIGHT: 5.0
    DICE_WEIGHT: 5.0
    BOX_WEIGHT: 5.0
    GIOU_WEIGHT: 2.0
    HIDDEN_DIM: 256
    NUM_OBJECT_QUERIES: 300
    NHEADS: 8
    DROPOUT: 0.0
    DIM_FEEDFORWARD: 2048
    ENC_LAYERS: 0
    PRE_NORM: False
    ENFORCE_INPUT_PROJ: False
    SIZE_DIVISIBILITY: 32
    DEC_LAYERS: 9  # 9+1, 9 decoder layers, add one for the loss on learnable query
    TRAIN_NUM_POINTS: 12544
    OVERSAMPLE_RATIO: 3.0
    IMPORTANCE_SAMPLE_RATIO: 0.75
    INITIAL_PRED: True
    TWO_STAGE: True
    DN: "standard"
    DN_NUM: 100
    INITIALIZE_BOX_TYPE: "no"
    TEST:
      SEMANTIC_ON: False
      INSTANCE_ON: True
      PANOPTIC_ON: False
      OVERLAP_THRESHOLD: 0.8
      OBJECT_MASK_THRESHOLD: 0.25
  TEXT:
    ARCH: clip_teacher
  LANGUAGE_BACKBONE:
    LANG_DIM: 512
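A sketch of how a detectron2-style YAML like this is typically consumed, using the add_glee_config / add_deeplab_config helpers exported by GLEE/glee/__init__.py further down in this commit. The exact call site inside the application is not shown in this view, so the import path and the order of the add_*_config calls are assumptions:

from detectron2.config import get_cfg

# add_deeplab_config / add_glee_config register the GLEE-specific keys
# (MaskDINO, TEXT, LANGUAGE_BACKBONE, ...) before the YAML is merged.
from glee import add_deeplab_config, add_glee_config  # import path assumed

cfg = get_cfg()
add_deeplab_config(cfg)
add_glee_config(cfg)
cfg.merge_from_file("GLEE/configs/R50.yaml")
cfg.MODEL.WEIGHTS = "path/to/GLEE_R50_checkpoint.pth"  # hypothetical checkpoint path
cfg.freeze()

print(cfg.MODEL.MaskDINO.NUM_OBJECT_QUERIES)  # 300
print(cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES)     # 80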
GLEE/configs/SwinL.yaml
ADDED
@@ -0,0 +1,79 @@
MODEL:
  META_ARCHITECTURE: "GLEE"
  MASK_ON: True
  BACKBONE:
    NAME: "D2SwinTransformer"
  SWIN:
    EMBED_DIM: 192
    DEPTHS: [2, 2, 18, 2]
    NUM_HEADS: [6, 12, 24, 48]
    WINDOW_SIZE: 12
    APE: False
    DROP_PATH_RATE: 0.3
    PATCH_NORM: True
    PRETRAIN_IMG_SIZE: 384
  PIXEL_MEAN: [123.675, 116.280, 103.530]
  PIXEL_STD: [58.395, 57.120, 57.375]
  RESNETS:
    DEPTH: 50
    STEM_TYPE: "basic"  # not used
    STEM_OUT_CHANNELS: 64
    STRIDE_IN_1X1: False
    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
    # NORM: "SyncBN"
    RES5_MULTI_GRID: [1, 1, 1]  # not used
  SEM_SEG_HEAD:
    NAME: "MaskDINOHead"
    IGNORE_VALUE: 255
    NUM_CLASSES: 80
    LOSS_WEIGHT: 1.0
    CONVS_DIM: 256
    MASK_DIM: 256
    NORM: "GN"
    # pixel decoder
    PIXEL_DECODER_NAME: "MaskDINOEncoder"
    DIM_FEEDFORWARD: 2048
    NUM_FEATURE_LEVELS: 3
    TOTAL_NUM_FEATURE_LEVELS: 4
    IN_FEATURES: ["res2", "res3", "res4", "res5"]
    DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
    COMMON_STRIDE: 4
    TRANSFORMER_ENC_LAYERS: 6
    FEATURE_ORDER: "low2high"
  MaskDINO:
    TRANSFORMER_DECODER_NAME: "MaskDINODecoder"
    DEEP_SUPERVISION: True
    NO_OBJECT_WEIGHT: 0.1
    CLASS_WEIGHT: 4.0
    MASK_WEIGHT: 5.0
    DICE_WEIGHT: 5.0
    BOX_WEIGHT: 5.0
    GIOU_WEIGHT: 2.0
    HIDDEN_DIM: 256
    NUM_OBJECT_QUERIES: 300
    NHEADS: 8
    DROPOUT: 0.0
    DIM_FEEDFORWARD: 2048
    ENC_LAYERS: 0
    PRE_NORM: False
    ENFORCE_INPUT_PROJ: False
    SIZE_DIVISIBILITY: 32
    DEC_LAYERS: 9  # 9+1, 9 decoder layers, add one for the loss on learnable query
    TRAIN_NUM_POINTS: 12544
    OVERSAMPLE_RATIO: 3.0
    IMPORTANCE_SAMPLE_RATIO: 0.75
    INITIAL_PRED: True
    TWO_STAGE: True
    DN: "standard"
    DN_NUM: 100
    INITIALIZE_BOX_TYPE: "no"
    TEST:
      SEMANTIC_ON: False
      INSTANCE_ON: True
      PANOPTIC_ON: False
      OVERLAP_THRESHOLD: 0.8
      OBJECT_MASK_THRESHOLD: 0.25
  TEXT:
    ARCH: clip_teacher
  LANGUAGE_BACKBONE:
    LANG_DIM: 512
GLEE/glee/.DS_Store
ADDED
Binary file (6.15 kB). View file
GLEE/glee/__init__.py
ADDED
@@ -0,0 +1,12 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function


from .config import add_glee_config
from .config_deeplab import add_deeplab_config
# from .GLEE import GLEE
# from .data import build_detection_train_loader, build_detection_test_loader
from .backbone.swin import D2SwinTransformer
from .backbone.eva02 import D2_EVA02
GLEE/glee/backbone/__init__.py
ADDED
@@ -0,0 +1,7 @@
from .build import build_backbone

from .resnet import *
from .swin import *
# from .focal import *
# from .focal_dw import *
from .backbone import *
GLEE/glee/backbone/backbone.py
ADDED
@@ -0,0 +1,51 @@
# Copyright (c) Facebook, Inc. and its affiliates.
import torch.nn as nn

from detectron2.modeling import ShapeSpec

__all__ = ["Backbone"]


class Backbone(nn.Module):
    """
    Abstract base class for network backbones.
    """

    def __init__(self):
        """
        The `__init__` method of any subclass can specify its own set of arguments.
        """
        super().__init__()

    def forward(self):
        """
        Subclasses must override this method, but adhere to the same return type.

        Returns:
            dict[str->Tensor]: mapping from feature name (e.g., "res2") to tensor
        """
        pass

    @property
    def size_divisibility(self) -> int:
        """
        Some backbones require the input height and width to be divisible by a
        specific integer. This is typically true for encoder / decoder type networks
        with lateral connection (e.g., FPN) for which feature maps need to match
        dimension in the "bottom up" and "top down" paths. Set to 0 if no specific
        input size divisibility is required.
        """
        return 0

    def output_shape(self):
        """
        Returns:
            dict[str->ShapeSpec]
        """
        # this is a backward-compatible default
        return {
            name: ShapeSpec(
                channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
            )
            for name in self._out_features
        }
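The class above only fixes the contract: forward() returns a dict of named feature maps, and output_shape() describes them through the _out_feature_* attributes a subclass is expected to set. A toy subclass, just to make that contract concrete (not part of the repository; it assumes the Backbone class above is importable in scope):

import torch
import torch.nn as nn

class ToyBackbone(Backbone):
    """Single-stage example: one stride-4 stem producing a 64-channel "res2" map."""

    def __init__(self):
        super().__init__()
        self.stem = nn.Conv2d(3, 64, kernel_size=7, stride=4, padding=3)
        self._out_features = ["res2"]
        self._out_feature_channels = {"res2": 64}
        self._out_feature_strides = {"res2": 4}

    def forward(self, x):
        return {"res2": self.stem(x)}

feats = ToyBackbone()(torch.randn(2, 3, 64, 64))
assert feats["res2"].shape == (2, 64, 16, 16)          # stride 4, 64 channels
assert ToyBackbone().output_shape()["res2"].channels == 64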
GLEE/glee/backbone/build.py
ADDED
@@ -0,0 +1,11 @@
from .registry import model_entrypoints
from .registry import is_model

from .backbone import *

def build_backbone(config, **kwargs):
    model_name = config['MODEL']['BACKBONE']['NAME']
    if not is_model(model_name):
        raise ValueError(f'Unknown model: {model_name}')
    model = model_entrypoints(model_name)(config, **kwargs)
    return model
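build_backbone resolves the name in MODEL.BACKBONE.NAME through .registry, whose 14-line implementation is not rendered in this view. A plausible minimal sketch of such a registry, consistent with how build.py calls it and with the @register_backbone decorators used in davit.py (an assumption, not the actual file contents):

# registry.py sketch: a module-level dict keyed by entrypoint function name.
_entrypoints = {}

def register_backbone(fn):
    """Decorator used by backbone modules, e.g. @register_backbone on get_davit_backbone."""
    _entrypoints[fn.__name__] = fn
    return fn

def model_entrypoints(model_name):
    return _entrypoints[model_name]

def is_model(model_name):
    return model_name in _entrypoints

With a registry like this, setting MODEL.BACKBONE.NAME to a registered entrypoint name is enough for build_backbone(config) to construct the matching backbone.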
GLEE/glee/backbone/davit.py
ADDED
@@ -0,0 +1,623 @@
1 |
+
import os
|
2 |
+
import itertools
|
3 |
+
import logging
|
4 |
+
|
5 |
+
import torch
|
6 |
+
import torch.nn as nn
|
7 |
+
import torch.nn.functional as F
|
8 |
+
import torch.utils.checkpoint as checkpoint
|
9 |
+
from collections import OrderedDict
|
10 |
+
|
11 |
+
from einops import rearrange
|
12 |
+
from timm.models.layers import DropPath, trunc_normal_
|
13 |
+
|
14 |
+
from detectron2.utils.file_io import PathManager
|
15 |
+
from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec
|
16 |
+
|
17 |
+
from .registry import register_backbone
|
18 |
+
|
19 |
+
logger = logging.getLogger(__name__)
|
20 |
+
|
21 |
+
|
22 |
+
class MySequential(nn.Sequential):
|
23 |
+
def forward(self, *inputs):
|
24 |
+
for module in self._modules.values():
|
25 |
+
if type(inputs) == tuple:
|
26 |
+
inputs = module(*inputs)
|
27 |
+
else:
|
28 |
+
inputs = module(inputs)
|
29 |
+
return inputs
|
30 |
+
|
31 |
+
|
32 |
+
class PreNorm(nn.Module):
|
33 |
+
def __init__(self, norm, fn, drop_path=None):
|
34 |
+
super().__init__()
|
35 |
+
self.norm = norm
|
36 |
+
self.fn = fn
|
37 |
+
self.drop_path = drop_path
|
38 |
+
|
39 |
+
def forward(self, x, *args, **kwargs):
|
40 |
+
shortcut = x
|
41 |
+
if self.norm != None:
|
42 |
+
x, size = self.fn(self.norm(x), *args, **kwargs)
|
43 |
+
else:
|
44 |
+
x, size = self.fn(x, *args, **kwargs)
|
45 |
+
|
46 |
+
if self.drop_path:
|
47 |
+
x = self.drop_path(x)
|
48 |
+
|
49 |
+
x = shortcut + x
|
50 |
+
|
51 |
+
return x, size
|
52 |
+
|
53 |
+
|
54 |
+
class Mlp(nn.Module):
|
55 |
+
def __init__(
|
56 |
+
self,
|
57 |
+
in_features,
|
58 |
+
hidden_features=None,
|
59 |
+
out_features=None,
|
60 |
+
act_layer=nn.GELU,
|
61 |
+
):
|
62 |
+
super().__init__()
|
63 |
+
out_features = out_features or in_features
|
64 |
+
hidden_features = hidden_features or in_features
|
65 |
+
self.net = nn.Sequential(OrderedDict([
|
66 |
+
("fc1", nn.Linear(in_features, hidden_features)),
|
67 |
+
("act", act_layer()),
|
68 |
+
("fc2", nn.Linear(hidden_features, out_features))
|
69 |
+
]))
|
70 |
+
|
71 |
+
def forward(self, x, size):
|
72 |
+
return self.net(x), size
|
73 |
+
|
74 |
+
|
75 |
+
class DepthWiseConv2d(nn.Module):
|
76 |
+
def __init__(
|
77 |
+
self,
|
78 |
+
dim_in,
|
79 |
+
kernel_size,
|
80 |
+
padding,
|
81 |
+
stride,
|
82 |
+
bias=True,
|
83 |
+
):
|
84 |
+
super().__init__()
|
85 |
+
self.dw = nn.Conv2d(
|
86 |
+
dim_in, dim_in,
|
87 |
+
kernel_size=kernel_size,
|
88 |
+
padding=padding,
|
89 |
+
groups=dim_in,
|
90 |
+
stride=stride,
|
91 |
+
bias=bias
|
92 |
+
)
|
93 |
+
|
94 |
+
def forward(self, x, size):
|
95 |
+
B, N, C = x.shape
|
96 |
+
H, W = size
|
97 |
+
assert N == H * W
|
98 |
+
|
99 |
+
x = self.dw(x.transpose(1, 2).view(B, C, H, W))
|
100 |
+
size = (x.size(-2), x.size(-1))
|
101 |
+
x = x.flatten(2).transpose(1, 2)
|
102 |
+
return x, size
|
103 |
+
|
104 |
+
|
105 |
+
class ConvEmbed(nn.Module):
|
106 |
+
""" Image to Patch Embedding
|
107 |
+
"""
|
108 |
+
|
109 |
+
def __init__(
|
110 |
+
self,
|
111 |
+
patch_size=7,
|
112 |
+
in_chans=3,
|
113 |
+
embed_dim=64,
|
114 |
+
stride=4,
|
115 |
+
padding=2,
|
116 |
+
norm_layer=None,
|
117 |
+
pre_norm=True
|
118 |
+
):
|
119 |
+
super().__init__()
|
120 |
+
self.patch_size = patch_size
|
121 |
+
|
122 |
+
self.proj = nn.Conv2d(
|
123 |
+
in_chans, embed_dim,
|
124 |
+
kernel_size=patch_size,
|
125 |
+
stride=stride,
|
126 |
+
padding=padding
|
127 |
+
)
|
128 |
+
|
129 |
+
dim_norm = in_chans if pre_norm else embed_dim
|
130 |
+
self.norm = norm_layer(dim_norm) if norm_layer else None
|
131 |
+
|
132 |
+
self.pre_norm = pre_norm
|
133 |
+
|
134 |
+
def forward(self, x, size):
|
135 |
+
H, W = size
|
136 |
+
if len(x.size()) == 3:
|
137 |
+
if self.norm and self.pre_norm:
|
138 |
+
x = self.norm(x)
|
139 |
+
x = rearrange(
|
140 |
+
x, 'b (h w) c -> b c h w',
|
141 |
+
h=H, w=W
|
142 |
+
)
|
143 |
+
|
144 |
+
x = self.proj(x)
|
145 |
+
|
146 |
+
_, _, H, W = x.shape
|
147 |
+
x = rearrange(x, 'b c h w -> b (h w) c')
|
148 |
+
if self.norm and not self.pre_norm:
|
149 |
+
x = self.norm(x)
|
150 |
+
|
151 |
+
return x, (H, W)
|
152 |
+
|
153 |
+
|
154 |
+
class ChannelAttention(nn.Module):
|
155 |
+
|
156 |
+
def __init__(self, dim, groups=8, qkv_bias=True):
|
157 |
+
super().__init__()
|
158 |
+
|
159 |
+
self.groups = groups
|
160 |
+
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
|
161 |
+
self.proj = nn.Linear(dim, dim)
|
162 |
+
|
163 |
+
def forward(self, x, size):
|
164 |
+
B, N, C = x.shape
|
165 |
+
|
166 |
+
qkv = self.qkv(x).reshape(B, N, 3, self.groups, C // self.groups).permute(2, 0, 3, 1, 4)
|
167 |
+
q, k, v = qkv[0], qkv[1], qkv[2]
|
168 |
+
|
169 |
+
q = q * (N ** -0.5)
|
170 |
+
attention = q.transpose(-1, -2) @ k
|
171 |
+
attention = attention.softmax(dim=-1)
|
172 |
+
x = (attention @ v.transpose(-1, -2)).transpose(-1, -2)
|
173 |
+
x = x.transpose(1, 2).reshape(B, N, C)
|
174 |
+
x = self.proj(x)
|
175 |
+
return x, size
|
176 |
+
|
177 |
+
|
178 |
+
class ChannelBlock(nn.Module):
|
179 |
+
|
180 |
+
def __init__(self, dim, groups, mlp_ratio=4., qkv_bias=True,
|
181 |
+
drop_path_rate=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm,
|
182 |
+
conv_at_attn=True, conv_at_ffn=True):
|
183 |
+
super().__init__()
|
184 |
+
|
185 |
+
drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
|
186 |
+
|
187 |
+
self.conv1 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_attn else None
|
188 |
+
self.channel_attn = PreNorm(
|
189 |
+
norm_layer(dim),
|
190 |
+
ChannelAttention(dim, groups=groups, qkv_bias=qkv_bias),
|
191 |
+
drop_path
|
192 |
+
)
|
193 |
+
self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_ffn else None
|
194 |
+
self.ffn = PreNorm(
|
195 |
+
norm_layer(dim),
|
196 |
+
Mlp(in_features=dim, hidden_features=int(dim*mlp_ratio), act_layer=act_layer),
|
197 |
+
drop_path
|
198 |
+
)
|
199 |
+
|
200 |
+
def forward(self, x, size):
|
201 |
+
if self.conv1:
|
202 |
+
x, size = self.conv1(x, size)
|
203 |
+
x, size = self.channel_attn(x, size)
|
204 |
+
|
205 |
+
if self.conv2:
|
206 |
+
x, size = self.conv2(x, size)
|
207 |
+
x, size = self.ffn(x, size)
|
208 |
+
|
209 |
+
return x, size
|
210 |
+
|
211 |
+
|
212 |
+
def window_partition(x, window_size: int):
|
213 |
+
B, H, W, C = x.shape
|
214 |
+
x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
|
215 |
+
windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
|
216 |
+
return windows
|
217 |
+
|
218 |
+
|
219 |
+
def window_reverse(windows, window_size: int, H: int, W: int):
|
220 |
+
B = int(windows.shape[0] / (H * W / window_size / window_size))
|
221 |
+
x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
|
222 |
+
x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
|
223 |
+
return x
|
224 |
+
|
225 |
+
|
226 |
+
class WindowAttention(nn.Module):
|
227 |
+
def __init__(self, dim, num_heads, window_size, qkv_bias=True):
|
228 |
+
|
229 |
+
super().__init__()
|
230 |
+
self.dim = dim
|
231 |
+
self.window_size = window_size
|
232 |
+
self.num_heads = num_heads
|
233 |
+
head_dim = dim // num_heads
|
234 |
+
self.scale = head_dim ** -0.5
|
235 |
+
|
236 |
+
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
|
237 |
+
self.proj = nn.Linear(dim, dim)
|
238 |
+
|
239 |
+
self.softmax = nn.Softmax(dim=-1)
|
240 |
+
|
241 |
+
def forward(self, x, size):
|
242 |
+
|
243 |
+
H, W = size
|
244 |
+
B, L, C = x.shape
|
245 |
+
assert L == H * W, "input feature has wrong size"
|
246 |
+
|
247 |
+
x = x.view(B, H, W, C)
|
248 |
+
|
249 |
+
pad_l = pad_t = 0
|
250 |
+
pad_r = (self.window_size - W % self.window_size) % self.window_size
|
251 |
+
pad_b = (self.window_size - H % self.window_size) % self.window_size
|
252 |
+
x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
|
253 |
+
_, Hp, Wp, _ = x.shape
|
254 |
+
|
255 |
+
x = window_partition(x, self.window_size)
|
256 |
+
x = x.view(-1, self.window_size * self.window_size, C)
|
257 |
+
|
258 |
+
# W-MSA/SW-MSA
|
259 |
+
# attn_windows = self.attn(x_windows)
|
260 |
+
|
261 |
+
B_, N, C = x.shape
|
262 |
+
qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
|
263 |
+
q, k, v = qkv[0], qkv[1], qkv[2]
|
264 |
+
|
265 |
+
q = q * self.scale
|
266 |
+
attn = (q @ k.transpose(-2, -1))
|
267 |
+
attn = self.softmax(attn)
|
268 |
+
|
269 |
+
x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
|
270 |
+
x = self.proj(x)
|
271 |
+
|
272 |
+
# merge windows
|
273 |
+
x = x.view(
|
274 |
+
-1, self.window_size, self.window_size, C
|
275 |
+
)
|
276 |
+
x = window_reverse(x, self.window_size, Hp, Wp)
|
277 |
+
|
278 |
+
if pad_r > 0 or pad_b > 0:
|
279 |
+
x = x[:, :H, :W, :].contiguous()
|
280 |
+
|
281 |
+
x = x.view(B, H * W, C)
|
282 |
+
|
283 |
+
return x, size
|
284 |
+
|
285 |
+
|
286 |
+
class SpatialBlock(nn.Module):
|
287 |
+
|
288 |
+
def __init__(self, dim, num_heads, window_size,
|
289 |
+
mlp_ratio=4., qkv_bias=True, drop_path_rate=0., act_layer=nn.GELU,
|
290 |
+
norm_layer=nn.LayerNorm, conv_at_attn=True, conv_at_ffn=True):
|
291 |
+
super().__init__()
|
292 |
+
|
293 |
+
drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
|
294 |
+
|
295 |
+
self.conv1 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_attn else None
|
296 |
+
self.window_attn = PreNorm(
|
297 |
+
norm_layer(dim),
|
298 |
+
WindowAttention(dim, num_heads, window_size, qkv_bias=qkv_bias),
|
299 |
+
drop_path
|
300 |
+
)
|
301 |
+
self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_ffn else None
|
302 |
+
self.ffn = PreNorm(
|
303 |
+
norm_layer(dim),
|
304 |
+
Mlp(in_features=dim, hidden_features=int(dim*mlp_ratio), act_layer=act_layer),
|
305 |
+
drop_path
|
306 |
+
)
|
307 |
+
|
308 |
+
def forward(self, x, size):
|
309 |
+
if self.conv1:
|
310 |
+
x, size = self.conv1(x, size)
|
311 |
+
x, size = self.window_attn(x, size)
|
312 |
+
|
313 |
+
if self.conv2:
|
314 |
+
x, size = self.conv2(x, size)
|
315 |
+
x, size = self.ffn(x, size)
|
316 |
+
return x, size
|
317 |
+
|
318 |
+
|
319 |
+
class DaViT(nn.Module):
|
320 |
+
""" DaViT: Dual-Attention Transformer
|
321 |
+
|
322 |
+
Args:
|
323 |
+
img_size (int): Image size, Default: 224.
|
324 |
+
in_chans (int): Number of input image channels. Default: 3.
|
325 |
+
num_classes (int): Number of classes for classification head. Default: 1000.
|
326 |
+
patch_size (tuple(int)): Patch size of convolution in different stages. Default: (7, 2, 2, 2).
|
327 |
+
patch_stride (tuple(int)): Patch stride of convolution in different stages. Default: (4, 2, 2, 2).
|
328 |
+
patch_padding (tuple(int)): Patch padding of convolution in different stages. Default: (3, 0, 0, 0).
|
329 |
+
patch_prenorm (tuple(bool)): If True, perform norm before convlution layer. Default: (True, False, False, False).
|
330 |
+
embed_dims (tuple(int)): Patch embedding dimension in different stages. Default: (64, 128, 192, 256).
|
331 |
+
num_heads (tuple(int)): Number of spatial attention heads in different stages. Default: (4, 8, 12, 16).
|
332 |
+
num_groups (tuple(int)): Number of channel groups in different stages. Default: (4, 8, 12, 16).
|
333 |
+
window_size (int): Window size. Default: 7.
|
334 |
+
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
|
335 |
+
qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True.
|
336 |
+
drop_path_rate (float): Stochastic depth rate. Default: 0.1.
|
337 |
+
norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
|
338 |
+
enable_checkpoint (bool): If True, enable checkpointing. Default: False.
|
339 |
+
conv_at_attn (bool): If True, performe depthwise convolution before attention layer. Default: True.
|
340 |
+
conv_at_ffn (bool): If True, performe depthwise convolution before ffn layer. Default: True.
|
341 |
+
"""
|
342 |
+
|
343 |
+
def __init__(
|
344 |
+
self,
|
345 |
+
img_size=224,
|
346 |
+
in_chans=3,
|
347 |
+
num_classes=1000,
|
348 |
+
depths=(1, 1, 3, 1),
|
349 |
+
patch_size=(7, 2, 2, 2),
|
350 |
+
patch_stride=(4, 2, 2, 2),
|
351 |
+
patch_padding=(3, 0, 0, 0),
|
352 |
+
patch_prenorm=(False, False, False, False),
|
353 |
+
embed_dims=(64, 128, 192, 256),
|
354 |
+
num_heads=(3, 6, 12, 24),
|
355 |
+
num_groups=(3, 6, 12, 24),
|
356 |
+
window_size=7,
|
357 |
+
mlp_ratio=4.,
|
358 |
+
qkv_bias=True,
|
359 |
+
drop_path_rate=0.1,
|
360 |
+
norm_layer=nn.LayerNorm,
|
361 |
+
enable_checkpoint=False,
|
362 |
+
conv_at_attn=True,
|
363 |
+
conv_at_ffn=True,
|
364 |
+
out_indices=[],
|
365 |
+
):
|
366 |
+
super().__init__()
|
367 |
+
|
368 |
+
self.num_classes = num_classes
|
369 |
+
self.embed_dims = embed_dims
|
370 |
+
self.num_heads = num_heads
|
371 |
+
self.num_groups = num_groups
|
372 |
+
self.num_stages = len(self.embed_dims)
|
373 |
+
self.enable_checkpoint = enable_checkpoint
|
374 |
+
assert self.num_stages == len(self.num_heads) == len(self.num_groups)
|
375 |
+
|
376 |
+
num_stages = len(embed_dims)
|
377 |
+
self.img_size = img_size
|
378 |
+
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)*2)]
|
379 |
+
|
380 |
+
|
381 |
+
depth_offset = 0
|
382 |
+
convs = []
|
383 |
+
blocks = []
|
384 |
+
for i in range(num_stages):
|
385 |
+
conv_embed = ConvEmbed(
|
386 |
+
patch_size=patch_size[i],
|
387 |
+
stride=patch_stride[i],
|
388 |
+
padding=patch_padding[i],
|
389 |
+
in_chans=in_chans if i == 0 else self.embed_dims[i - 1],
|
390 |
+
embed_dim=self.embed_dims[i],
|
391 |
+
norm_layer=norm_layer,
|
392 |
+
pre_norm=patch_prenorm[i]
|
393 |
+
)
|
394 |
+
convs.append(conv_embed)
|
395 |
+
|
396 |
+
print(f'=> Depth offset in stage {i}: {depth_offset}')
|
397 |
+
block = MySequential(
|
398 |
+
*[
|
399 |
+
MySequential(OrderedDict([
|
400 |
+
(
|
401 |
+
'spatial_block', SpatialBlock(
|
402 |
+
embed_dims[i],
|
403 |
+
num_heads[i],
|
404 |
+
window_size,
|
405 |
+
drop_path_rate=dpr[depth_offset+j*2],
|
406 |
+
qkv_bias=qkv_bias,
|
407 |
+
mlp_ratio=mlp_ratio,
|
408 |
+
conv_at_attn=conv_at_attn,
|
409 |
+
conv_at_ffn=conv_at_ffn,
|
410 |
+
)
|
411 |
+
),
|
412 |
+
(
|
413 |
+
'channel_block', ChannelBlock(
|
414 |
+
embed_dims[i],
|
415 |
+
num_groups[i],
|
416 |
+
drop_path_rate=dpr[depth_offset+j*2+1],
|
417 |
+
qkv_bias=qkv_bias,
|
418 |
+
mlp_ratio=mlp_ratio,
|
419 |
+
conv_at_attn=conv_at_attn,
|
420 |
+
conv_at_ffn=conv_at_ffn,
|
421 |
+
)
|
422 |
+
)
|
423 |
+
])) for j in range(depths[i])
|
424 |
+
]
|
425 |
+
)
|
426 |
+
blocks.append(block)
|
427 |
+
depth_offset += depths[i]*2
|
428 |
+
|
429 |
+
self.convs = nn.ModuleList(convs)
|
430 |
+
self.blocks = nn.ModuleList(blocks)
|
431 |
+
|
432 |
+
self.out_indices = out_indices
|
433 |
+
# self.norms = norm_layer(self.embed_dims[-1])
|
434 |
+
# self.avgpool = nn.AdaptiveAvgPool1d(1)
|
435 |
+
# self.head = nn.Linear(self.embed_dims[-1], num_classes) if num_classes > 0 else nn.Identity()
|
436 |
+
self.apply(self._init_weights)
|
437 |
+
|
438 |
+
@property
|
439 |
+
def dim_out(self):
|
440 |
+
return self.embed_dims[-1]
|
441 |
+
|
442 |
+
def _init_weights(self, m):
|
443 |
+
if isinstance(m, nn.Linear):
|
444 |
+
trunc_normal_(m.weight, std=0.02)
|
445 |
+
if m.bias is not None:
|
446 |
+
nn.init.constant_(m.bias, 0)
|
447 |
+
elif isinstance(m, nn.Conv2d):
|
448 |
+
nn.init.normal_(m.weight, std=0.02)
|
449 |
+
for name, _ in m.named_parameters():
|
450 |
+
if name in ['bias']:
|
451 |
+
nn.init.constant_(m.bias, 0)
|
452 |
+
elif isinstance(m, nn.LayerNorm):
|
453 |
+
nn.init.constant_(m.weight, 1.0)
|
454 |
+
nn.init.constant_(m.bias, 0)
|
455 |
+
elif isinstance(m, nn.BatchNorm2d):
|
456 |
+
nn.init.constant_(m.weight, 1.0)
|
457 |
+
nn.init.constant_(m.bias, 0)
|
458 |
+
|
459 |
+
def _try_remap_keys(self, pretrained_dict):
|
460 |
+
remap_keys = {
|
461 |
+
"conv_embeds": "convs",
|
462 |
+
"main_blocks": "blocks",
|
463 |
+
"0.cpe.0.proj": "spatial_block.conv1.fn.dw",
|
464 |
+
"0.attn": "spatial_block.window_attn.fn",
|
465 |
+
"0.cpe.1.proj": "spatial_block.conv2.fn.dw",
|
466 |
+
"0.mlp": "spatial_block.ffn.fn.net",
|
467 |
+
"1.cpe.0.proj": "channel_block.conv1.fn.dw",
|
468 |
+
"1.attn": "channel_block.channel_attn.fn",
|
469 |
+
"1.cpe.1.proj": "channel_block.conv2.fn.dw",
|
470 |
+
"1.mlp": "channel_block.ffn.fn.net",
|
471 |
+
"0.norm1": "spatial_block.window_attn.norm",
|
472 |
+
"0.norm2": "spatial_block.ffn.norm",
|
473 |
+
"1.norm1": "channel_block.channel_attn.norm",
|
474 |
+
"1.norm2": "channel_block.ffn.norm"
|
475 |
+
}
|
476 |
+
|
477 |
+
full_key_mappings = {}
|
478 |
+
for k in pretrained_dict.keys():
|
479 |
+
old_k = k
|
480 |
+
for remap_key in remap_keys.keys():
|
481 |
+
if remap_key in k:
|
482 |
+
print(f'=> Repace {remap_key} with {remap_keys[remap_key]}')
|
483 |
+
k = k.replace(remap_key, remap_keys[remap_key])
|
484 |
+
|
485 |
+
full_key_mappings[old_k] = k
|
486 |
+
|
487 |
+
return full_key_mappings
|
488 |
+
|
489 |
+
def from_state_dict(self, pretrained_dict, pretrained_layers=[], verbose=True):
|
490 |
+
model_dict = self.state_dict()
|
491 |
+
stripped_key = lambda x: x[14:] if x.startswith('image_encoder.') else x
|
492 |
+
full_key_mappings = self._try_remap_keys(pretrained_dict)
|
493 |
+
|
494 |
+
pretrained_dict = {
|
495 |
+
stripped_key(full_key_mappings[k]): v for k, v in pretrained_dict.items()
|
496 |
+
if stripped_key(full_key_mappings[k]) in model_dict.keys()
|
497 |
+
}
|
498 |
+
need_init_state_dict = {}
|
499 |
+
for k, v in pretrained_dict.items():
|
500 |
+
need_init = (
|
501 |
+
k.split('.')[0] in pretrained_layers
|
502 |
+
or pretrained_layers[0] == '*'
|
503 |
+
)
|
504 |
+
if need_init:
|
505 |
+
if verbose:
|
506 |
+
print(f'=> init {k} from pretrained state dict')
|
507 |
+
|
508 |
+
need_init_state_dict[k] = v
|
509 |
+
self.load_state_dict(need_init_state_dict, strict=False)
|
510 |
+
|
511 |
+
def from_pretrained(self, pretrained='', pretrained_layers=[], verbose=True):
|
512 |
+
if os.path.isfile(pretrained):
|
513 |
+
print(f'=> loading pretrained model {pretrained}')
|
514 |
+
pretrained_dict = torch.load(pretrained, map_location='cpu')
|
515 |
+
|
516 |
+
self.from_state_dict(pretrained_dict, pretrained_layers, verbose)
|
517 |
+
|
518 |
+
def forward_features(self, x):
|
519 |
+
input_size = (x.size(2), x.size(3))
|
520 |
+
|
521 |
+
outs = {}
|
522 |
+
for i, (conv, block) in enumerate(zip(self.convs, self.blocks)):
|
523 |
+
x, input_size = conv(x, input_size)
|
524 |
+
if self.enable_checkpoint:
|
525 |
+
x, input_size = checkpoint.checkpoint(block, x, input_size)
|
526 |
+
else:
|
527 |
+
x, input_size = block(x, input_size)
|
528 |
+
if i in self.out_indices:
|
529 |
+
out = x.view(-1, *input_size, self.embed_dims[i]).permute(0, 3, 1, 2).contiguous()
|
530 |
+
outs["res{}".format(i + 2)] = out
|
531 |
+
|
532 |
+
if len(self.out_indices) == 0:
|
533 |
+
outs["res5"] = x.view(-1, *input_size, self.embed_dims[-1]).permute(0, 3, 1, 2).contiguous()
|
534 |
+
|
535 |
+
return outs
|
536 |
+
|
537 |
+
def forward(self, x):
|
538 |
+
x = self.forward_features(x)
|
539 |
+
# x = self.head(x)
|
540 |
+
return x
|
541 |
+
|
542 |
+
class D2DaViT(DaViT, Backbone):
|
543 |
+
def __init__(self, cfg, input_shape):
|
544 |
+
|
545 |
+
spec = cfg['BACKBONE']['DAVIT']
|
546 |
+
|
547 |
+
super().__init__(
|
548 |
+
num_classes=0,
|
549 |
+
depths=spec['DEPTHS'],
|
550 |
+
embed_dims=spec['DIM_EMBED'],
|
551 |
+
num_heads=spec['NUM_HEADS'],
|
552 |
+
num_groups=spec['NUM_GROUPS'],
|
553 |
+
patch_size=spec['PATCH_SIZE'],
|
554 |
+
patch_stride=spec['PATCH_STRIDE'],
|
555 |
+
patch_padding=spec['PATCH_PADDING'],
|
556 |
+
patch_prenorm=spec['PATCH_PRENORM'],
|
557 |
+
drop_path_rate=spec['DROP_PATH_RATE'],
|
558 |
+
img_size=input_shape,
|
559 |
+
window_size=spec.get('WINDOW_SIZE', 7),
|
560 |
+
enable_checkpoint=spec.get('ENABLE_CHECKPOINT', False),
|
561 |
+
conv_at_attn=spec.get('CONV_AT_ATTN', True),
|
562 |
+
conv_at_ffn=spec.get('CONV_AT_FFN', True),
|
563 |
+
out_indices=spec.get('OUT_INDICES', []),
|
564 |
+
)
|
565 |
+
|
566 |
+
self._out_features = cfg['BACKBONE']['DAVIT']['OUT_FEATURES']
|
567 |
+
|
568 |
+
self._out_feature_strides = {
|
569 |
+
"res2": 4,
|
570 |
+
"res3": 8,
|
571 |
+
"res4": 16,
|
572 |
+
"res5": 32,
|
573 |
+
}
|
574 |
+
self._out_feature_channels = {
|
575 |
+
"res2": self.embed_dims[0],
|
576 |
+
"res3": self.embed_dims[1],
|
577 |
+
"res4": self.embed_dims[2],
|
578 |
+
"res5": self.embed_dims[3],
|
579 |
+
}
|
580 |
+
|
581 |
+
def forward(self, x):
|
582 |
+
"""
|
583 |
+
Args:
|
584 |
+
x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.
|
585 |
+
Returns:
|
586 |
+
dict[str->Tensor]: names and the corresponding features
|
587 |
+
"""
|
588 |
+
assert (
|
589 |
+
x.dim() == 4
|
590 |
+
), f"SwinTransformer takes an input of shape (N, C, H, W). Got {x.shape} instead!"
|
591 |
+
outputs = {}
|
592 |
+
y = super().forward(x)
|
593 |
+
|
594 |
+
for k in y.keys():
|
595 |
+
if k in self._out_features:
|
596 |
+
outputs[k] = y[k]
|
597 |
+
return outputs
|
598 |
+
|
599 |
+
def output_shape(self):
|
600 |
+
return {
|
601 |
+
name: ShapeSpec(
|
602 |
+
channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
|
603 |
+
)
|
604 |
+
for name in self._out_features
|
605 |
+
}
|
606 |
+
|
607 |
+
@property
|
608 |
+
def size_divisibility(self):
|
609 |
+
return 32
|
610 |
+
|
611 |
+
@register_backbone
|
612 |
+
def get_davit_backbone(cfg):
|
613 |
+
davit = D2DaViT(cfg['MODEL'], 224)
|
614 |
+
|
615 |
+
if cfg['MODEL']['BACKBONE']['LOAD_PRETRAINED'] is True:
|
616 |
+
filename = cfg['MODEL']['BACKBONE']['PRETRAINED']
|
617 |
+
logger.info(f'=> init from {filename}')
|
618 |
+
davit.from_pretrained(
|
619 |
+
filename,
|
620 |
+
cfg['MODEL']['BACKBONE']['DAVIT'].get('PRETRAINED_LAYERS', ['*']),
|
621 |
+
cfg['VERBOSE'])
|
622 |
+
|
623 |
+
return davit
|
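davit.py's window attention relies on the window_partition / window_reverse helpers defined above; partitioning into non-overlapping windows and reversing is an exact round trip once H and W are multiples of the window size (WindowAttention pads to that case before partitioning). A small check, assuming the two functions are importable from glee.backbone.davit:

import torch
from glee.backbone.davit import window_partition, window_reverse  # import path assumed

B, H, W, C, ws = 2, 14, 14, 96, 7
x = torch.randn(B, H, W, C)

windows = window_partition(x, ws)              # (B * H/ws * W/ws, ws, ws, C)
assert windows.shape == (B * (H // ws) * (W // ws), ws, ws, C)

restored = window_reverse(windows, ws, H, W)   # back to (B, H, W, C)
assert torch.equal(restored, x)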
GLEE/glee/backbone/eva01.py
ADDED
@@ -0,0 +1,676 @@
1 |
+
import logging
|
2 |
+
import math
|
3 |
+
from functools import partial
|
4 |
+
|
5 |
+
import fvcore.nn.weight_init as weight_init
|
6 |
+
import torch
|
7 |
+
import torch.nn as nn
|
8 |
+
import torch.nn.functional as F
|
9 |
+
from torch import Tensor, Size
|
10 |
+
from typing import Union, List
|
11 |
+
from torch.nn.parameter import Parameter
|
12 |
+
import numbers
|
13 |
+
|
14 |
+
from detectron2.layers import CNNBlockBase, Conv2d, get_norm
|
15 |
+
from detectron2.modeling.backbone.fpn import _assert_strides_are_log2_contiguous
|
16 |
+
|
17 |
+
from fairscale.nn.checkpoint import checkpoint_wrapper
|
18 |
+
from timm.models.layers import DropPath, Mlp, trunc_normal_
|
19 |
+
|
20 |
+
# from detectron2.modeling.backbone import Backbone
|
21 |
+
from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec
|
22 |
+
|
23 |
+
from .eva_01_utils import (
|
24 |
+
PatchEmbed,
|
25 |
+
add_decomposed_rel_pos,
|
26 |
+
get_abs_pos,
|
27 |
+
window_partition,
|
28 |
+
window_unpartition,
|
29 |
+
)
|
30 |
+
from detectron2.modeling.backbone.fpn import LastLevelMaxPool
|
31 |
+
|
32 |
+
logger = logging.getLogger(__name__)
|
33 |
+
|
34 |
+
|
35 |
+
__all__ = ["EVAViT", "SimpleFeaturePyramid", "get_vit_lr_decay_rate"]
|
36 |
+
|
37 |
+
|
38 |
+
_shape_t = Union[int, List[int], Size]
|
39 |
+
|
40 |
+
|
41 |
+
# steal from beit https://github.com/microsoft/unilm/tree/master/beit
|
42 |
+
class LayerNormWithForceFP32(nn.Module):
|
43 |
+
__constants__ = ['normalized_shape', 'eps', 'elementwise_affine']
|
44 |
+
normalized_shape: _shape_t
|
45 |
+
eps: float
|
46 |
+
elementwise_affine: bool
|
47 |
+
|
48 |
+
def __init__(self, normalized_shape: _shape_t, eps: float = 1e-5, elementwise_affine: bool = True) -> None:
|
49 |
+
super(LayerNormWithForceFP32, self).__init__()
|
50 |
+
if isinstance(normalized_shape, numbers.Integral):
|
51 |
+
normalized_shape = (normalized_shape,)
|
52 |
+
self.normalized_shape = tuple(normalized_shape)
|
53 |
+
self.eps = eps
|
54 |
+
self.elementwise_affine = elementwise_affine
|
55 |
+
if self.elementwise_affine:
|
56 |
+
self.weight = Parameter(torch.Tensor(*normalized_shape))
|
57 |
+
self.bias = Parameter(torch.Tensor(*normalized_shape))
|
58 |
+
else:
|
59 |
+
self.register_parameter('weight', None)
|
60 |
+
self.register_parameter('bias', None)
|
61 |
+
self.reset_parameters()
|
62 |
+
|
63 |
+
def reset_parameters(self) -> None:
|
64 |
+
if self.elementwise_affine:
|
65 |
+
nn.init.ones_(self.weight)
|
66 |
+
nn.init.zeros_(self.bias)
|
67 |
+
|
68 |
+
def forward(self, input: Tensor) -> Tensor:
|
69 |
+
return F.layer_norm(
|
70 |
+
input.float(), self.normalized_shape, self.weight.float(), self.bias.float(), self.eps).type_as(input)
|
71 |
+
|
72 |
+
def extra_repr(self) -> Tensor:
|
73 |
+
return '{normalized_shape}, eps={eps}, ' \
|
74 |
+
'elementwise_affine={elementwise_affine}'.format(**self.__dict__)
|
75 |
+
|
76 |
+
|
77 |
+
class Attention(nn.Module):
|
78 |
+
"""Multi-head Attention block with relative position embeddings."""
|
79 |
+
|
80 |
+
def __init__(
|
81 |
+
self,
|
82 |
+
dim,
|
83 |
+
num_heads=8,
|
84 |
+
qkv_bias=True,
|
85 |
+
beit_like_qkv_bias=False,
|
86 |
+
use_rel_pos=False,
|
87 |
+
rel_pos_zero_init=True,
|
88 |
+
input_size=None,
|
89 |
+
interp_type="vitdet",
|
90 |
+
):
|
91 |
+
"""
|
92 |
+
Args:
|
93 |
+
dim (int): Number of input channels.
|
94 |
+
num_heads (int): Number of attention heads.
|
95 |
+
qkv_bias (bool: If True, add a learnable bias to query, key, value.
|
96 |
+
rel_pos (bool): If True, add relative positional embeddings to the attention map.
|
97 |
+
rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
|
98 |
+
input_size (int or None): Input resolution for calculating the relative positional
|
99 |
+
parameter size.
|
100 |
+
"""
|
101 |
+
super().__init__()
|
102 |
+
self.num_heads = num_heads
|
103 |
+
head_dim = dim // num_heads
|
104 |
+
self.scale = head_dim**-0.5
|
105 |
+
|
106 |
+
self.beit_like_qkv_bias = beit_like_qkv_bias
|
107 |
+
if beit_like_qkv_bias:
|
108 |
+
self.q_bias = nn.Parameter(torch.zeros(dim))
|
109 |
+
self.v_bias = nn.Parameter(torch.zeros(dim))
|
110 |
+
|
111 |
+
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
|
112 |
+
self.proj = nn.Linear(dim, dim)
|
113 |
+
|
114 |
+
self.use_rel_pos = use_rel_pos
|
115 |
+
self.interp_type = interp_type
|
116 |
+
if self.use_rel_pos:
|
117 |
+
# initialize relative positional embeddings
|
118 |
+
self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
|
119 |
+
self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))
|
120 |
+
|
121 |
+
if not rel_pos_zero_init:
|
122 |
+
trunc_normal_(self.rel_pos_h, std=0.02)
|
123 |
+
trunc_normal_(self.rel_pos_w, std=0.02)
|
124 |
+
self.qk_float = False
|
125 |
+
|
126 |
+
def forward(self, x):
|
127 |
+
B, H, W, _ = x.shape
|
128 |
+
# qkv with shape (3, B, nHead, H * W, C)
|
129 |
+
if self.beit_like_qkv_bias:
|
130 |
+
qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias))
|
131 |
+
qkv = torch.nn.functional.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
|
132 |
+
qkv = qkv.reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
|
133 |
+
else:
|
134 |
+
qkv = self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
|
135 |
+
# q, k, v with shape (B * nHead, H * W, C)
|
136 |
+
q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(0)
|
137 |
+
|
138 |
+
if self.qk_float:
|
139 |
+
attn = (q.float() * self.scale) @ k.float().transpose(-2, -1)
|
140 |
+
if self.use_rel_pos:
|
141 |
+
attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W), self.interp_type)
|
142 |
+
attn = attn.softmax(dim=-1).type_as(x)
|
143 |
+
else:
|
144 |
+
attn = (q * self.scale) @ k.transpose(-2, -1)
|
145 |
+
if self.use_rel_pos:
|
146 |
+
attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W), self.interp_type)
|
147 |
+
attn = attn.softmax(dim=-1)
|
148 |
+
x = (attn @ v).view(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1)
|
149 |
+
x = self.proj(x)
|
150 |
+
|
151 |
+
return x
|
152 |
+
|
153 |
+
|
154 |
+
class ResBottleneckBlock(CNNBlockBase):
|
155 |
+
"""
|
156 |
+
The standard bottleneck residual block without the last activation layer.
|
157 |
+
It contains 3 conv layers with kernels 1x1, 3x3, 1x1.
|
158 |
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        bottleneck_channels,
        norm="LN",
        act_layer=nn.GELU,
    ):
        """
        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            bottleneck_channels (int): number of output channels for the 3x3
                "bottleneck" conv layers.
            norm (str or callable): normalization for all conv layers.
                See :func:`layers.get_norm` for supported format.
            act_layer (callable): activation for all conv layers.
        """
        super().__init__(in_channels, out_channels, 1)

        self.conv1 = Conv2d(in_channels, bottleneck_channels, 1, bias=False)
        self.norm1 = get_norm(norm, bottleneck_channels)
        self.act1 = act_layer()

        self.conv2 = Conv2d(
            bottleneck_channels,
            bottleneck_channels,
            3,
            padding=1,
            bias=False,
        )
        self.norm2 = get_norm(norm, bottleneck_channels)
        self.act2 = act_layer()

        self.conv3 = Conv2d(bottleneck_channels, out_channels, 1, bias=False)
        self.norm3 = get_norm(norm, out_channels)

        for layer in [self.conv1, self.conv2, self.conv3]:
            weight_init.c2_msra_fill(layer)
        for layer in [self.norm1, self.norm2]:
            layer.weight.data.fill_(1.0)
            layer.bias.data.zero_()
        # zero init last norm layer.
        self.norm3.weight.data.zero_()
        self.norm3.bias.data.zero_()

    def forward(self, x):
        out = x
        for layer in self.children():
            out = layer(out)

        out = x + out
        return out


class Block(nn.Module):
    """Transformer blocks with support of window attention and residual propagation blocks"""

    def __init__(
        self,
        dim,
        num_heads,
        mlp_ratio=4.0,
        qkv_bias=True,
        drop_path=0.0,
        norm_layer=LayerNormWithForceFP32,
        act_layer=nn.GELU,
        use_rel_pos=False,
        rel_pos_zero_init=True,
        window_size=0,
        use_residual_block=False,
        input_size=None,
        beit_like_qkv_bias=False,
        beit_like_gamma=False,
        interp_type="vitdet",
    ):
        """
        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of attention heads in each ViT block.
            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            drop_path (float): Stochastic depth rate.
            norm_layer (nn.Module): Normalization layer.
            act_layer (nn.Module): Activation layer.
            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            window_size (int): Window size for window attention blocks. If it equals 0, then not
                use window attention.
            use_residual_block (bool): If True, use a residual block after the MLP block.
            input_size (int or None): Input resolution for calculating the relative positional
                parameter size.
            beit_like_qkv_bias (bool)
            beit_like_gamma (bool)
        """
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            use_rel_pos=use_rel_pos,
            rel_pos_zero_init=rel_pos_zero_init,
            input_size=input_size if window_size == 0 else (window_size, window_size),
            beit_like_qkv_bias=beit_like_qkv_bias,
            interp_type=interp_type,
        )

        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer)

        self.window_size = window_size

        self.use_residual_block = use_residual_block
        if use_residual_block:
            # Use a residual block with bottleneck channel as dim // 2
            self.residual = ResBottleneckBlock(
                in_channels=dim,
                out_channels=dim,
                bottleneck_channels=dim // 2,
                norm="LN",
                act_layer=act_layer,
            )

        self.beit_like_gamma = beit_like_gamma
        if beit_like_gamma:
            self.gamma_1 = nn.Parameter(torch.ones((dim)), requires_grad=True)
            self.gamma_2 = nn.Parameter(torch.ones((dim)), requires_grad=True)

    def forward(self, x):
        shortcut = x
        x = self.norm1(x)
        # Window partition
        if self.window_size > 0:
            H, W = x.shape[1], x.shape[2]
            x, pad_hw = window_partition(x, self.window_size)

        x = self.attn(x)
        # Reverse window partition
        if self.window_size > 0:
            x = window_unpartition(x, self.window_size, pad_hw, (H, W))

        if self.beit_like_gamma:
            x = shortcut + self.drop_path(self.gamma_1 * x)
            x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
        else:
            x = shortcut + self.drop_path(x)
            x = x + self.drop_path(self.mlp(self.norm2(x)))

        if self.use_residual_block:
            x = self.residual(x.permute(0, 3, 1, 2)).permute(0, 2, 3, 1)

        return x


class EVAViT(Backbone):
    """
    This module implements Vision Transformer (ViT) backbone in :paper:`vitdet`.
    "Exploring Plain Vision Transformer Backbones for Object Detection",
    https://arxiv.org/abs/2203.16527
    """

    def __init__(
        self,
        img_size=1024,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4.0,
        qkv_bias=True,
        drop_path_rate=0.0,
        norm_layer=LayerNormWithForceFP32,
        act_layer=nn.GELU,
        use_abs_pos=True,
        use_rel_pos=False,
        rel_pos_zero_init=True,
        window_size=0,
        window_block_indexes=(),
        residual_block_indexes=(),
        use_act_checkpoint=False,
        pretrain_img_size=224,
        pretrain_use_cls_token=True,
        out_feature="last_feat",
        beit_like_qkv_bias=True,
        beit_like_gamma=False,
        freeze_patch_embed=False,
        interp_type="vitdet",
    ):
        """
        Args:
            img_size (int): Input image size.
            patch_size (int): Patch size.
            in_chans (int): Number of input image channels.
            embed_dim (int): Patch embedding dimension.
            depth (int): Depth of ViT.
            num_heads (int): Number of attention heads in each ViT block.
            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            drop_path_rate (float): Stochastic depth rate.
            norm_layer (nn.Module): Normalization layer.
            act_layer (nn.Module): Activation layer.
            use_abs_pos (bool): If True, use absolute positional embeddings.
            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            window_size (int): Window size for window attention blocks.
            window_block_indexes (list): Indexes for blocks using window attention.
            residual_block_indexes (list): Indexes for blocks using conv propagation.
            use_act_checkpoint (bool): If True, use activation checkpointing.
            pretrain_img_size (int): input image size for pretraining models.
            pretrain_use_cls_token (bool): If True, pretraining models use class token.
            out_feature (str): name of the feature from the last block.
            beit_like_qkv_bias (bool): beit_like_model that has gamma_1 and gamma_2 in blocks and qkv_bias=False
            beit_like_gamma (bool)
            freeze_patch_embed (bool)
            interp_type: "vitdet" for training / fine-tuning, "beit" for eval (slight improvement at a higher res)
        """
        super().__init__()
        self.pretrain_use_cls_token = pretrain_use_cls_token

        self.patch_embed = PatchEmbed(
            kernel_size=(patch_size, patch_size),
            stride=(patch_size, patch_size),
            in_chans=in_chans,
            embed_dim=embed_dim,
        )

        if use_abs_pos:
            # Initialize absolute positional embedding with pretrain image size.
            num_patches = (pretrain_img_size // patch_size) * (pretrain_img_size // patch_size)
            num_positions = (num_patches + 1) if pretrain_use_cls_token else num_patches
            self.pos_embed = nn.Parameter(torch.zeros(1, num_positions, embed_dim))
        else:
            self.pos_embed = None

        # stochastic depth decay rule
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]

        self.blocks = nn.ModuleList()
        if beit_like_qkv_bias:
            qkv_bias = False
        for i in range(depth):
            block = Block(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                drop_path=dpr[i],
                norm_layer=norm_layer,
                act_layer=act_layer,
                use_rel_pos=use_rel_pos,
                rel_pos_zero_init=rel_pos_zero_init,
                window_size=window_size if i in window_block_indexes else 0,
                use_residual_block=i in residual_block_indexes,
                input_size=(img_size // patch_size, img_size // patch_size),
                beit_like_qkv_bias=beit_like_qkv_bias,
                beit_like_gamma=beit_like_gamma,
                interp_type=interp_type,
            )
            if use_act_checkpoint:
                block = checkpoint_wrapper(block)
            self.blocks.append(block)

        self._out_feature_channels = {out_feature: embed_dim}
        self._out_feature_strides = {out_feature: patch_size}
        self._out_features = [out_feature]

        if self.pos_embed is not None:
            trunc_normal_(self.pos_embed, std=0.02)

        self.freeze_patch_embed = freeze_patch_embed
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=0.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, LayerNormWithForceFP32):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

        if self.freeze_patch_embed:
            for n, p in self.patch_embed.named_parameters():
                p.requires_grad = False

    def forward(self, x):
        x = self.patch_embed(x)
        if self.pos_embed is not None:
            x = x + get_abs_pos(
                self.pos_embed, self.pretrain_use_cls_token, (x.shape[1], x.shape[2])
            )

        for blk in self.blocks:
            x = blk(x)

        outputs = {self._out_features[0]: x.permute(0, 3, 1, 2)}
        return outputs
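
# Editor's note: a minimal shape sketch for the forward pass above (illustrative only,
# not part of the committed file; the import path is an assumption).
#
#     import torch
#     from glee.backbone.eva01 import EVAViT  # assumed import path
#
#     vit = EVAViT(img_size=1024, patch_size=16, embed_dim=768, depth=12, num_heads=12)
#     x = torch.randn(1, 3, 1024, 1024)
#     feat = vit(x)["last_feat"]
#     # patch_embed yields (1, 64, 64, 768) in channels-last layout; the blocks keep
#     # that shape, and forward() permutes it to (1, 768, 64, 64) before returning.
#     assert feat.shape == (1, 768, 64, 64)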


class SimpleFeaturePyramid(Backbone):
    """
    This module implements SimpleFeaturePyramid in :paper:`vitdet`.
    It creates pyramid features built on top of the input feature map.
    """

    def __init__(
        self,
        net,
        in_feature,
        out_channels,
        scale_factors,
        top_block=None,
        norm="LN",
        square_pad=0,
    ):
        """
        Args:
            net (Backbone): module representing the subnetwork backbone.
                Must be a subclass of :class:`Backbone`.
            in_feature (str): names of the input feature maps coming
                from the net.
            out_channels (int): number of channels in the output feature maps.
            scale_factors (list[float]): list of scaling factors to upsample or downsample
                the input features for creating pyramid features.
            top_block (nn.Module or None): if provided, an extra operation will
                be performed on the output of the last (smallest resolution)
                pyramid output, and the result will extend the result list. The top_block
                further downsamples the feature map. It must have an attribute
                "num_levels", meaning the number of extra pyramid levels added by
                this block, and "in_feature", which is a string representing
                its input feature (e.g., p5).
            norm (str): the normalization to use.
            square_pad (int): If > 0, require input images to be padded to specific square size.
        """
        super(SimpleFeaturePyramid, self).__init__()
        assert isinstance(net, Backbone)
        self.scale_factors = scale_factors

        input_shapes = net.output_shape()
        strides = [int(input_shapes[in_feature].stride / scale) for scale in scale_factors]
        _assert_strides_are_log2_contiguous(strides)

        dim = input_shapes[in_feature].channels
        self.stages = []
        use_bias = norm == ""
        for idx, scale in enumerate(scale_factors):
            out_dim = dim
            if scale == 4.0:
                layers = [
                    nn.ConvTranspose2d(dim, dim // 2, kernel_size=2, stride=2),
                    get_norm(norm, dim // 2),
                    nn.GELU(),
                    nn.ConvTranspose2d(dim // 2, dim // 4, kernel_size=2, stride=2),
                ]
                out_dim = dim // 4
            elif scale == 2.0:
                layers = [nn.ConvTranspose2d(dim, dim // 2, kernel_size=2, stride=2)]
                out_dim = dim // 2
            elif scale == 1.0:
                layers = []
            elif scale == 0.5:
                layers = [nn.MaxPool2d(kernel_size=2, stride=2)]
            else:
                raise NotImplementedError(f"scale_factor={scale} is not supported yet.")

            layers.extend(
                [
                    Conv2d(
                        out_dim,
                        out_channels,
                        kernel_size=1,
                        bias=use_bias,
                        norm=get_norm(norm, out_channels),
                    ),
                    Conv2d(
                        out_channels,
                        out_channels,
                        kernel_size=3,
                        padding=1,
                        bias=use_bias,
                        norm=get_norm(norm, out_channels),
                    ),
                ]
            )
            layers = nn.Sequential(*layers)

            stage = int(math.log2(strides[idx]))
            self.add_module(f"simfp_{stage}", layers)
            self.stages.append(layers)

        self.net = net
        self.in_feature = in_feature
        self.top_block = top_block
        # Return feature names are "p<stage>", like ["p2", "p3", ..., "p6"]
        self._out_feature_strides = {"p{}".format(int(math.log2(s))): s for s in strides}
        # top block output feature maps.
        if self.top_block is not None:
            for s in range(stage, stage + self.top_block.num_levels):
                self._out_feature_strides["p{}".format(s + 1)] = 2 ** (s + 1)

        self._out_features = list(self._out_feature_strides.keys())
        self._out_feature_channels = {k: out_channels for k in self._out_features}
        self._size_divisibility = strides[-1]
        self._square_pad = square_pad

    @property
    def padding_constraints(self):
        return {
            "size_divisiblity": self._size_divisibility,
            "square_size": self._square_pad,
        }

    def forward(self, x):
        """
        Args:
            x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.

        Returns:
            dict[str->Tensor]:
                mapping from feature map name to pyramid feature map tensor
                in high to low resolution order. Returned feature names follow the FPN
                convention: "p<stage>", where stage has stride = 2 ** stage e.g.,
                ["p2", "p3", ..., "p6"].
        """
        bottom_up_features = self.net(x)
        features = bottom_up_features[self.in_feature]
        results = []

        for stage in self.stages:
            results.append(stage(features))

        if self.top_block is not None:
            if self.top_block.in_feature in bottom_up_features:
                top_block_in_feature = bottom_up_features[self.top_block.in_feature]
            else:
                top_block_in_feature = results[self._out_features.index(self.top_block.in_feature)]
            results.extend(self.top_block(top_block_in_feature))
        assert len(self._out_features) == len(results)
        return {f: res for f, res in zip(self._out_features, results)}
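
# Editor's note: a worked example of the stride bookkeeping above (numbers mirror how
# D2_EVA01 below instantiates the pyramid; this snippet is illustrative, not committed code).
#
#     strides = [int(16 / s) for s in (2.0, 1.0, 0.5)]            # [8, 16, 32]
#     names = ["p{}".format(int(math.log2(s))) for s in strides]  # ['p3', 'p4', 'p5']
#     # a LastLevelMaxPool top_block then appends 'p6' at stride 64, so forward()
#     # returns {'p3': ..., 'p4': ..., 'p5': ..., 'p6': ...}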



@BACKBONE_REGISTRY.register()
class D2_EVA01(SimpleFeaturePyramid):
    def __init__(self, cfg, input_shape):

        super().__init__(
            net = EVAViT(
                img_size= cfg.MODEL.EVA01.IMAGE_SIZE,
                patch_size=cfg.MODEL.EVA01.PATCH_SIZE,
                window_size= cfg.MODEL.EVA01.WINDOW_SIZE,
                embed_dim= cfg.MODEL.EVA01.DMBED_DIM,
                depth= cfg.MODEL.EVA01.DEPTH,
                num_heads= cfg.MODEL.EVA01.NUM_HEADS,
                drop_path_rate= cfg.MODEL.EVA01.DROP_PATH_RATE,
                mlp_ratio= cfg.MODEL.EVA01.MLP_RATIO,
                qkv_bias=True,
                norm_layer=partial(nn.LayerNorm, eps=1e-6),
                window_block_indexes= cfg.MODEL.EVA01.WINDOW_BLOCK_INDEXES,
                residual_block_indexes=[],
                use_act_checkpoint = True,
                use_rel_pos = True,
                out_feature="last_feat",
                beit_like_qkv_bias=cfg.MODEL.EVA01.BEIT_LIKE_QKV_BIAS,
                beit_like_gamma= cfg.MODEL.EVA01.BEIT_LIKE_GAMMA,
                freeze_patch_embed= cfg.MODEL.EVA01.FREEZE_PATH_EMBED,
            ),
            in_feature = "last_feat",
            out_channels=256,
            scale_factors=(2.0, 1.0, 0.5),  # (4.0, 2.0, 1.0, 0.5) in ViTDet
            top_block=LastLevelMaxPool(),
            norm="LN",
            square_pad=cfg.MODEL.EVA01.IMAGE_SIZE,
        )
        pretrained_weight = cfg.MODEL.EVA01.PRETRAINED_WEIGHT
        if pretrained_weight:
            checkpoint = torch.load(pretrained_weight, map_location='cpu')
            print(f'\nload pretrain weight from {pretrained_weight} \n')
            self.load_state_dict(checkpoint['model'], strict=False)

    def output_shape(self):
        return {
            name: ShapeSpec(
                channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
            )
            for name in self._out_features
        }

    @property
    def size_divisibility(self):
        return 32


def get_vit_lr_decay_rate(name, lr_decay_rate=1.0, num_layers=12):
    """
    Calculate lr decay rate for different ViT blocks.
    Args:
        name (string): parameter name.
        lr_decay_rate (float): base lr decay rate.
        num_layers (int): number of ViT blocks.

    Returns:
        lr decay rate for the given parameter.
    """
    layer_id = num_layers + 1
    if 'backbone' in name:  # name.startswith("backbone"):
        if ".pos_embed" in name or ".patch_embed" in name:
            layer_id = 0
        elif ".blocks." in name and ".residual." not in name:
            layer_id = int(name[name.find(".blocks.") :].split(".")[2]) + 1

    return lr_decay_rate ** (num_layers + 1 - layer_id)
GLEE/glee/backbone/eva02-dino.py
ADDED
@@ -0,0 +1,598 @@
import logging
import math
from functools import partial

import fvcore.nn.weight_init as weight_init
import torch
import torch.nn as nn
import torch.nn.functional as F

from detectron2.layers import CNNBlockBase, Conv2d, get_norm
from detectron2.modeling.backbone.fpn import _assert_strides_are_log2_contiguous

from detectron2.modeling.backbone import Backbone
from .eva_02_utils import (
    PatchEmbed,
    add_decomposed_rel_pos,
    get_abs_pos,
    window_partition,
    window_unpartition,
    VisionRotaryEmbeddingFast,
)

try:
    import xformers.ops as xops
    HAS_XFORMER = True
except:
    HAS_XFORMER = False
    pass


logger = logging.getLogger(__name__)


__all__ = ["EVA02_ViT", "SimpleFeaturePyramid", "get_vit_lr_decay_rate"]


class SwiGLU(nn.Module):
    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.SiLU, drop=0.,
                 norm_layer=nn.LayerNorm, subln=False
                 ):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features

        self.w1 = nn.Linear(in_features, hidden_features)
        self.w2 = nn.Linear(in_features, hidden_features)

        self.act = act_layer()
        self.ffn_ln = norm_layer(hidden_features) if subln else nn.Identity()
        self.w3 = nn.Linear(hidden_features, out_features)

        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x1 = self.w1(x)
        x2 = self.w2(x)
        hidden = self.act(x1) * x2
        x = self.ffn_ln(hidden)
        x = self.w3(x)
        x = self.drop(x)
        return x
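
# Editor's note: the feed-forward above computes w3(ffn_ln(SiLU(w1(x)) * w2(x))), i.e. a
# gated MLP with an optional sub-LayerNorm. A minimal shape check (illustrative only):
#
#     ffn = SwiGLU(in_features=768, hidden_features=512, subln=True)
#     y = ffn(torch.randn(2, 10, 768))
#     assert y.shape == (2, 10, 768)  # the output keeps the input feature width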


class Attention(nn.Module):
    def __init__(
        self,
        dim,
        num_heads=8,
        qkv_bias=True,
        qk_scale=None,
        attn_head_dim=None,
        rope=None,
        xattn=True,
    ):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        if attn_head_dim is not None:
            head_dim = attn_head_dim
        all_head_dim = head_dim * self.num_heads
        self.scale = qk_scale or head_dim ** -0.5

        self.q_proj = nn.Linear(dim, all_head_dim, bias=False)
        self.k_proj = nn.Linear(dim, all_head_dim, bias=False)
        self.v_proj = nn.Linear(dim, all_head_dim, bias=False)

        if qkv_bias:
            self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
            self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
        else:
            self.q_bias = None
            self.v_bias = None

        self.rope = rope
        self.xattn = xattn
        self.proj = nn.Linear(all_head_dim, dim)

        if not HAS_XFORMER:
            self.xattn = False

    def forward(self, x):
        B, H, W, C = x.shape
        x = x.view(B, -1, C)
        N = H * W

        q = F.linear(input=x, weight=self.q_proj.weight, bias=self.q_bias)
        k = F.linear(input=x, weight=self.k_proj.weight, bias=None)
        v = F.linear(input=x, weight=self.v_proj.weight, bias=self.v_bias)

        q = q.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3)  # B, num_heads, N, C
        k = k.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3)
        v = v.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3)

        ## rope
        q = self.rope(q).type_as(v)
        k = self.rope(k).type_as(v)

        if self.xattn:
            q = q.permute(0, 2, 1, 3)  # B, num_heads, N, C -> B, N, num_heads, C
            k = k.permute(0, 2, 1, 3)
            v = v.permute(0, 2, 1, 3)

            x = xops.memory_efficient_attention(q, k, v)
            x = x.reshape(B, N, -1)
        else:
            q = q * self.scale
            attn = (q @ k.transpose(-2, -1))
            attn = attn.softmax(dim=-1).type_as(x)
            x = (attn @ v).transpose(1, 2).reshape(B, N, -1)

        x = self.proj(x)
        x = x.view(B, H, W, C)

        return x


class ResBottleneckBlock(CNNBlockBase):
    """
    The standard bottleneck residual block without the last activation layer.
    It contains 3 conv layers with kernels 1x1, 3x3, 1x1.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        bottleneck_channels,
        norm="LN",
        act_layer=nn.GELU,
    ):
        """
        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            bottleneck_channels (int): number of output channels for the 3x3
                "bottleneck" conv layers.
            norm (str or callable): normalization for all conv layers.
                See :func:`layers.get_norm` for supported format.
            act_layer (callable): activation for all conv layers.
        """
        super().__init__(in_channels, out_channels, 1)

        self.conv1 = Conv2d(in_channels, bottleneck_channels, 1, bias=False)
        self.norm1 = get_norm(norm, bottleneck_channels)
        self.act1 = act_layer()

        self.conv2 = Conv2d(
            bottleneck_channels,
            bottleneck_channels,
            3,
            padding=1,
            bias=False,
        )
        self.norm2 = get_norm(norm, bottleneck_channels)
        self.act2 = act_layer()

        self.conv3 = Conv2d(bottleneck_channels, out_channels, 1, bias=False)
        self.norm3 = get_norm(norm, out_channels)

        for layer in [self.conv1, self.conv2, self.conv3]:
            weight_init.c2_msra_fill(layer)
        for layer in [self.norm1, self.norm2]:
            layer.weight.data.fill_(1.0)
            layer.bias.data.zero_()
        # zero init last norm layer.
        self.norm3.weight.data.zero_()
        self.norm3.bias.data.zero_()

    def forward(self, x):
        out = x
        for layer in self.children():
            out = layer(out)

        out = x + out
        return out


class Block(nn.Module):
    """Transformer blocks with support of window attention and residual propagation blocks"""

    def __init__(
        self,
        dim,
        num_heads,
        mlp_ratio=4*2/3,
        qkv_bias=True,
        drop_path=0.0,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
        window_size=0,
        use_residual_block=False,
        rope=None,
        xattn=True,
    ):
        """
        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of attention heads in each ViT block.
            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            drop_path (float): Stochastic depth rate.
            norm_layer (nn.Module): Normalization layer.
            act_layer (nn.Module): Activation layer.
            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            window_size (int): Window size for window attention blocks. If it equals 0, then not
                use window attention.
            use_residual_block (bool): If True, use a residual block after the MLP block.
            input_size (int or None): Input resolution for calculating the relative positional
                parameter size.
        """
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            rope=rope,
            xattn=xattn,
        )

        from timm.models.layers import DropPath

        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        self.norm2 = norm_layer(dim)
        self.mlp = SwiGLU(
            in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            subln=True,
            norm_layer=norm_layer,
        )

        self.window_size = window_size

        self.use_residual_block = use_residual_block
        if use_residual_block:
            # Use a residual block with bottleneck channel as dim // 2
            self.residual = ResBottleneckBlock(
                in_channels=dim,
                out_channels=dim,
                bottleneck_channels=dim // 2,
                norm="LN",
            )

    def forward(self, x):
        shortcut = x
        x = self.norm1(x)

        # Window partition
        if self.window_size > 0:
            H, W = x.shape[1], x.shape[2]
            x, pad_hw = window_partition(x, self.window_size)

        x = self.attn(x)

        # Reverse window partition
        if self.window_size > 0:
            x = window_unpartition(x, self.window_size, pad_hw, (H, W))

        x = shortcut + self.drop_path(x)
        x = x + self.drop_path(self.mlp(self.norm2(x)))

        if self.use_residual_block:
            x = self.residual(x.permute(0, 3, 1, 2)).permute(0, 2, 3, 1)

        return x


class EVA02_ViT(Backbone):
    """
    This module implements Vision Transformer (ViT) backbone in :paper:`vitdet`.
    "Exploring Plain Vision Transformer Backbones for Object Detection",
    https://arxiv.org/abs/2203.16527
    """

    def __init__(
        self,
        img_size=1024,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4*2/3,
        qkv_bias=True,
        drop_path_rate=0.0,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
        act_layer=nn.GELU,
        use_abs_pos=True,
        use_rel_pos=False,
        rope=True,
        pt_hw_seq_len=16,
        intp_freq=True,
        window_size=0,
        window_block_indexes=(),
        residual_block_indexes=(),
        use_act_checkpoint=False,
        pretrain_img_size=224,
        pretrain_use_cls_token=True,
        out_feature="last_feat",
        xattn=True,
    ):
        """
        Args:
            img_size (int): Input image size.
            patch_size (int): Patch size.
            in_chans (int): Number of input image channels.
            embed_dim (int): Patch embedding dimension.
            depth (int): Depth of ViT.
            num_heads (int): Number of attention heads in each ViT block.
            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            drop_path_rate (float): Stochastic depth rate.
            norm_layer (nn.Module): Normalization layer.
            act_layer (nn.Module): Activation layer.
            use_abs_pos (bool): If True, use absolute positional embeddings.
            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            window_size (int): Window size for window attention blocks.
            window_block_indexes (list): Indexes for blocks using window attention.
            residual_block_indexes (list): Indexes for blocks using conv propagation.
            use_act_checkpoint (bool): If True, use activation checkpointing.
            pretrain_img_size (int): input image size for pretraining models.
            pretrain_use_cls_token (bool): If True, pretraining models use class token.
            out_feature (str): name of the feature from the last block.
        """
        super().__init__()
        self.pretrain_use_cls_token = pretrain_use_cls_token

        self.patch_embed = PatchEmbed(
            kernel_size=(patch_size, patch_size),
            stride=(patch_size, patch_size),
            in_chans=in_chans,
            embed_dim=embed_dim,
        )

        if use_abs_pos:
            # Initialize absolute positional embedding with pretrain image size.
            num_patches = (pretrain_img_size // patch_size) * (pretrain_img_size // patch_size)
            num_positions = (num_patches + 1) if pretrain_use_cls_token else num_patches
            self.pos_embed = nn.Parameter(torch.zeros(1, num_positions, embed_dim))
        else:
            self.pos_embed = None

        half_head_dim = embed_dim // num_heads // 2
        hw_seq_len = img_size // patch_size

        self.rope_win = VisionRotaryEmbeddingFast(
            dim=half_head_dim,
            pt_seq_len=pt_hw_seq_len,
            ft_seq_len=window_size if intp_freq else None,
        )
        self.rope_glb = VisionRotaryEmbeddingFast(
            dim=half_head_dim,
            pt_seq_len=pt_hw_seq_len,
            ft_seq_len=hw_seq_len if intp_freq else None,
        )

        # stochastic depth decay rule
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]

        self.blocks = nn.ModuleList()
        for i in range(depth):
            block = Block(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                drop_path=dpr[i],
                norm_layer=norm_layer,
                window_size=window_size if i in window_block_indexes else 0,
                use_residual_block=i in residual_block_indexes,
                rope=self.rope_win if i in window_block_indexes else self.rope_glb,
                xattn=xattn
            )
            if use_act_checkpoint:
                # TODO: use torch.utils.checkpoint
                from fairscale.nn.checkpoint import checkpoint_wrapper

                block = checkpoint_wrapper(block)
            self.blocks.append(block)

        self._out_feature_channels = {out_feature: embed_dim}
        self._out_feature_strides = {out_feature: patch_size}
        self._out_features = [out_feature]

        if self.pos_embed is not None:
            nn.init.trunc_normal_(self.pos_embed, std=0.02)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            nn.init.trunc_normal_(m.weight, std=0.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def forward(self, x):
        x = self.patch_embed(x)
        if self.pos_embed is not None:
            x = x + get_abs_pos(
                self.pos_embed, self.pretrain_use_cls_token, (x.shape[1], x.shape[2])
            )

        for blk in self.blocks:
            x = blk(x)

        outputs = {self._out_features[0]: x.permute(0, 3, 1, 2)}
        return outputs


class SimpleFeaturePyramid(Backbone):
    """
    This module implements SimpleFeaturePyramid in :paper:`vitdet`.
    It creates pyramid features built on top of the input feature map.
    """

    def __init__(
        self,
        net,
        in_feature,
        out_channels,
        scale_factors,
        top_block=None,
        norm="LN",
        square_pad=0,
    ):
        """
        Args:
            net (Backbone): module representing the subnetwork backbone.
                Must be a subclass of :class:`Backbone`.
            in_feature (str): names of the input feature maps coming
                from the net.
            out_channels (int): number of channels in the output feature maps.
            scale_factors (list[float]): list of scaling factors to upsample or downsample
                the input features for creating pyramid features.
            top_block (nn.Module or None): if provided, an extra operation will
                be performed on the output of the last (smallest resolution)
                pyramid output, and the result will extend the result list. The top_block
                further downsamples the feature map. It must have an attribute
                "num_levels", meaning the number of extra pyramid levels added by
                this block, and "in_feature", which is a string representing
                its input feature (e.g., p5).
            norm (str): the normalization to use.
            square_pad (int): If > 0, require input images to be padded to specific square size.
        """
        super(SimpleFeaturePyramid, self).__init__()
        assert isinstance(net, Backbone)

        self.scale_factors = scale_factors

        input_shapes = net.output_shape()
        strides = [int(input_shapes[in_feature].stride / scale) for scale in scale_factors]
        _assert_strides_are_log2_contiguous(strides)

        dim = input_shapes[in_feature].channels
        self.stages = []
        use_bias = norm == ""
        for idx, scale in enumerate(scale_factors):
            out_dim = dim
            if scale == 4.0:
                layers = [
                    nn.ConvTranspose2d(dim, dim // 2, kernel_size=2, stride=2),
                    get_norm(norm, dim // 2),
                    nn.GELU(),
                    nn.ConvTranspose2d(dim // 2, dim // 4, kernel_size=2, stride=2),
                ]
                out_dim = dim // 4
            elif scale == 2.0:
                layers = [nn.ConvTranspose2d(dim, dim // 2, kernel_size=2, stride=2)]
                out_dim = dim // 2
            elif scale == 1.0:
                layers = []
            elif scale == 0.5:
                layers = [nn.MaxPool2d(kernel_size=2, stride=2)]
            else:
                raise NotImplementedError(f"scale_factor={scale} is not supported yet.")

            layers.extend(
                [
                    Conv2d(
                        out_dim,
                        out_channels,
                        kernel_size=1,
                        bias=use_bias,
                        norm=get_norm(norm, out_channels),
                    ),
                    Conv2d(
                        out_channels,
                        out_channels,
                        kernel_size=3,
                        padding=1,
                        bias=use_bias,
                        norm=get_norm(norm, out_channels),
                    ),
                ]
            )
            layers = nn.Sequential(*layers)

            stage = int(math.log2(strides[idx]))
            self.add_module(f"simfp_{stage}", layers)
            self.stages.append(layers)

        self.net = net
        self.in_feature = in_feature
        self.top_block = top_block
        # Return feature names are "p<stage>", like ["p2", "p3", ..., "p6"]
        self._out_feature_strides = {"p{}".format(int(math.log2(s))): s for s in strides}
        # top block output feature maps.
        if self.top_block is not None:
            for s in range(stage, stage + self.top_block.num_levels):
                self._out_feature_strides["p{}".format(s + 1)] = 2 ** (s + 1)

        self._out_features = list(self._out_feature_strides.keys())
        self._out_feature_channels = {k: out_channels for k in self._out_features}
        self._size_divisibility = strides[-1]
        self._square_pad = square_pad

    @property
    def padding_constraints(self):
        return {
            "size_divisiblity": self._size_divisibility,
            "square_size": self._square_pad,
        }

    def forward(self, x):
        """
        Args:
            x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.

        Returns:
            dict[str->Tensor]:
                mapping from feature map name to pyramid feature map tensor
                in high to low resolution order. Returned feature names follow the FPN
                convention: "p<stage>", where stage has stride = 2 ** stage e.g.,
                ["p2", "p3", ..., "p6"].
        """
        bottom_up_features = self.net(x)
        features = bottom_up_features[self.in_feature]
        results = []

        for stage in self.stages:
            results.append(stage(features))

        if self.top_block is not None:
            if self.top_block.in_feature in bottom_up_features:
                top_block_in_feature = bottom_up_features[self.top_block.in_feature]
            else:
                top_block_in_feature = results[self._out_features.index(self.top_block.in_feature)]
            results.extend(self.top_block(top_block_in_feature))
        assert len(self._out_features) == len(results)
        return {f: res for f, res in zip(self._out_features, results)}


def get_vit_lr_decay_rate(name, lr_decay_rate=1.0, num_layers=12):
    """
    Calculate lr decay rate for different ViT blocks.
    Args:
        name (string): parameter name.
        lr_decay_rate (float): base lr decay rate.
        num_layers (int): number of ViT blocks.

    Returns:
        lr decay rate for the given parameter.
    """
    layer_id = num_layers + 1
    if name.startswith("backbone"):
        if ".pos_embed" in name or ".patch_embed" in name:
            layer_id = 0
        elif ".blocks." in name and ".residual." not in name:
            layer_id = int(name[name.find(".blocks.") :].split(".")[2]) + 1

    return lr_decay_rate ** (num_layers + 1 - layer_id)
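# Editor's note: a minimal wiring sketch for the classes in this file (illustrative only,
# not code from the commit; the window/global block split and sizes follow the usual
# ViT-Base ViTDet recipe, and xformers is optional — without it the plain softmax
# attention path in Attention above is used).
#
#     from detectron2.modeling.backbone.fpn import LastLevelMaxPool
#
#     vit = EVA02_ViT(
#         img_size=1024, patch_size=16, embed_dim=768, depth=12, num_heads=12,
#         window_size=16, window_block_indexes=[0, 1, 3, 4, 6, 7, 9, 10],
#     )
#     backbone = SimpleFeaturePyramid(
#         net=vit, in_feature="last_feat", out_channels=256,
#         scale_factors=(4.0, 2.0, 1.0, 0.5), top_block=LastLevelMaxPool(), norm="LN",
#     )
#     feats = backbone(torch.randn(1, 3, 1024, 1024))
#     # feats maps 'p2'..'p6' to 256-channel maps with strides 4, 8, 16, 32, 64.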
GLEE/glee/backbone/eva02.py
ADDED
@@ -0,0 +1,647 @@
# --------------------------------------------------------
# EVA02
# --------------------------------------------------------
import logging
import math
from functools import partial

import fvcore.nn.weight_init as weight_init
import torch
import torch.nn as nn
import torch.nn.functional as F

from detectron2.layers import CNNBlockBase, Conv2d, get_norm
from detectron2.modeling.backbone.fpn import _assert_strides_are_log2_contiguous

from detectron2.modeling.backbone import Backbone
from timm.models.layers import DropPath, Mlp, trunc_normal_
from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec


from .eva_02_utils import (
    PatchEmbed,
    add_decomposed_rel_pos,
    get_abs_pos,
    window_partition,
    window_unpartition,
    VisionRotaryEmbeddingFast,
)
from detectron2.modeling.backbone.fpn import LastLevelMaxPool


try:
    import xformers.ops as xops
    HAS_XFORMER = True
except:
    HAS_XFORMER = False
    pass

try:
    from apex.normalization import FusedLayerNorm
except:
    pass

logger = logging.getLogger(__name__)


__all__ = ["EVA02_ViT", "SimpleFeaturePyramid", "get_vit_lr_decay_rate"]


class SwiGLU(nn.Module):
    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.SiLU, drop=0.,
                 norm_layer=nn.LayerNorm, subln=False
                 ):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features

        self.w1 = nn.Linear(in_features, hidden_features)
        self.w2 = nn.Linear(in_features, hidden_features)

        self.act = act_layer()
        self.ffn_ln = norm_layer(hidden_features) if subln else nn.Identity()
        self.w3 = nn.Linear(hidden_features, out_features)

        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x1 = self.w1(x)
        x2 = self.w2(x)
        hidden = self.act(x1) * x2
        x = self.ffn_ln(hidden)
        x = self.w3(x)
        x = self.drop(x)
        return x


class Attention(nn.Module):
    def __init__(
        self,
        dim,
        num_heads=8,
        qkv_bias=True,
        qk_scale=None,
        attn_head_dim=None,
        rope=None,
        xattn=True,
    ):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        if attn_head_dim is not None:
            head_dim = attn_head_dim
        all_head_dim = head_dim * self.num_heads
        self.scale = qk_scale or head_dim ** -0.5

        self.q_proj = nn.Linear(dim, all_head_dim, bias=False)
        self.k_proj = nn.Linear(dim, all_head_dim, bias=False)
        self.v_proj = nn.Linear(dim, all_head_dim, bias=False)

        if qkv_bias:
            self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
            self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
        else:
            self.q_bias = None
            self.v_bias = None

        self.rope = rope
        self.xattn = xattn
        self.proj = nn.Linear(all_head_dim, dim)
        if not HAS_XFORMER:
            self.xattn = False

    def forward(self, x):
        B, H, W, C = x.shape
        x = x.view(B, -1, C)
        N = H * W

        q = F.linear(input=x, weight=self.q_proj.weight, bias=self.q_bias)
        k = F.linear(input=x, weight=self.k_proj.weight, bias=None)
        v = F.linear(input=x, weight=self.v_proj.weight, bias=self.v_bias)

        q = q.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3)  # B, num_heads, N, C
        k = k.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3)
        v = v.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3)

        ## rope
        q = self.rope(q).type_as(v)
        k = self.rope(k).type_as(v)

        if self.xattn:
            q = q.permute(0, 2, 1, 3)  # B, num_heads, N, C -> B, N, num_heads, C
            k = k.permute(0, 2, 1, 3)
            v = v.permute(0, 2, 1, 3)

            x = xops.memory_efficient_attention(q, k, v)
            x = x.reshape(B, N, -1)
        else:
            q = q * self.scale
            attn = (q @ k.transpose(-2, -1))
            attn = attn.softmax(dim=-1).type_as(x)
            x = (attn @ v).transpose(1, 2).reshape(B, N, -1)

        x = self.proj(x)
        x = x.view(B, H, W, C)

        return x


class ResBottleneckBlock(CNNBlockBase):
    """
    The standard bottleneck residual block without the last activation layer.
    It contains 3 conv layers with kernels 1x1, 3x3, 1x1.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        bottleneck_channels,
        norm="LN",
        act_layer=nn.GELU,
    ):
        """
        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            bottleneck_channels (int): number of output channels for the 3x3
                "bottleneck" conv layers.
            norm (str or callable): normalization for all conv layers.
                See :func:`layers.get_norm` for supported format.
            act_layer (callable): activation for all conv layers.
        """
        super().__init__(in_channels, out_channels, 1)

        self.conv1 = Conv2d(in_channels, bottleneck_channels, 1, bias=False)
        self.norm1 = get_norm(norm, bottleneck_channels)
        self.act1 = act_layer()

        self.conv2 = Conv2d(
            bottleneck_channels,
            bottleneck_channels,
            3,
            padding=1,
            bias=False,
        )
        self.norm2 = get_norm(norm, bottleneck_channels)
        self.act2 = act_layer()

        self.conv3 = Conv2d(bottleneck_channels, out_channels, 1, bias=False)
        self.norm3 = get_norm(norm, out_channels)

        for layer in [self.conv1, self.conv2, self.conv3]:
            weight_init.c2_msra_fill(layer)
        for layer in [self.norm1, self.norm2]:
            layer.weight.data.fill_(1.0)
            layer.bias.data.zero_()
        # zero init last norm layer.
        self.norm3.weight.data.zero_()
        self.norm3.bias.data.zero_()

    def forward(self, x):
        out = x
        for layer in self.children():
            out = layer(out)

        out = x + out
        return out


class Block(nn.Module):
    """Transformer blocks with support of window attention and residual propagation blocks"""

    def __init__(
        self,
        dim,
        num_heads,
        mlp_ratio=4*2/3,
        qkv_bias=True,
        drop_path=0.0,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
        window_size=0,
        use_residual_block=False,
        rope=None,
        xattn=True,
    ):
        """
        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of attention heads in each ViT block.
            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            drop_path (float): Stochastic depth rate.
            norm_layer (nn.Module): Normalization layer.
            act_layer (nn.Module): Activation layer.
            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            window_size (int): Window size for window attention blocks. If it equals 0, then not
                use window attention.
            use_residual_block (bool): If True, use a residual block after the MLP block.
            input_size (int or None): Input resolution for calculating the relative positional
                parameter size.
        """
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            rope=rope,
            xattn=xattn,
        )

        from timm.models.layers import DropPath

        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        self.norm2 = norm_layer(dim)
        self.mlp = SwiGLU(
            in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            subln=True,
            norm_layer=norm_layer,
        )

        self.window_size = window_size

        self.use_residual_block = use_residual_block
        if use_residual_block:
            # Use a residual block with bottleneck channel as dim // 2
            self.residual = ResBottleneckBlock(
                in_channels=dim,
                out_channels=dim,
                bottleneck_channels=dim // 2,
                norm="LN",
            )

    def forward(self, x):
        shortcut = x
        x = self.norm1(x)

        # Window partition
        if self.window_size > 0:
            H, W = x.shape[1], x.shape[2]
            x, pad_hw = window_partition(x, self.window_size)

        x = self.attn(x)

        # Reverse window partition
        if self.window_size > 0:
            x = window_unpartition(x, self.window_size, pad_hw, (H, W))

        x = shortcut + self.drop_path(x)
        x = x + self.drop_path(self.mlp(self.norm2(x)))

        if self.use_residual_block:
            x = self.residual(x.permute(0, 3, 1, 2)).permute(0, 2, 3, 1)

        return x


class EVA02_ViT(Backbone):
    """
    This module implements Vision Transformer (ViT) backbone in :paper:`vitdet`.
    "Exploring Plain Vision Transformer Backbones for Object Detection",
    https://arxiv.org/abs/2203.16527
    """

    def __init__(
        self,
        img_size=1024,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4*2/3,
        qkv_bias=True,
        drop_path_rate=0.0,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
        act_layer=nn.GELU,
        use_abs_pos=True,
        use_rel_pos=False,
        rope=True,
        pt_hw_seq_len=16,
        intp_freq=True,
        window_size=0,
        window_block_indexes=(),
        residual_block_indexes=(),
        use_act_checkpoint=False,
        pretrain_img_size=224,
        pretrain_use_cls_token=True,
        out_feature="last_feat",
        xattn=True,
    ):
        """
        Args:
            img_size (int): Input image size.
            patch_size (int): Patch size.
            in_chans (int): Number of input image channels.
            embed_dim (int): Patch embedding dimension.
            depth (int): Depth of ViT.
            num_heads (int): Number of attention heads in each ViT block.
            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            drop_path_rate (float): Stochastic depth rate.
            norm_layer (nn.Module): Normalization layer.
            act_layer (nn.Module): Activation layer.
            use_abs_pos (bool): If True, use absolute positional embeddings.
            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            window_size (int): Window size for window attention blocks.
            window_block_indexes (list): Indexes for blocks using window attention.
            residual_block_indexes (list): Indexes for blocks using conv propagation.
            use_act_checkpoint (bool): If True, use activation checkpointing.
            pretrain_img_size (int): input image size for pretraining models.
            pretrain_use_cls_token (bool): If True, pretraining models use class token.
            out_feature (str): name of the feature from the last block.
        """
        super().__init__()
        self.pretrain_use_cls_token = pretrain_use_cls_token

        self.patch_embed = PatchEmbed(
            kernel_size=(patch_size, patch_size),
            stride=(patch_size, patch_size),
            in_chans=in_chans,
            embed_dim=embed_dim,
        )

        if use_abs_pos:
            # Initialize absolute positional embedding with pretrain image size.
            num_patches = (pretrain_img_size // patch_size) * (pretrain_img_size // patch_size)
            num_positions = (num_patches + 1) if pretrain_use_cls_token else num_patches
            self.pos_embed = nn.Parameter(torch.zeros(1, num_positions, embed_dim))
        else:
            self.pos_embed = None

        half_head_dim = embed_dim // num_heads // 2
        hw_seq_len = img_size // patch_size

        self.rope_win = VisionRotaryEmbeddingFast(
            dim=half_head_dim,
            pt_seq_len=pt_hw_seq_len,
            ft_seq_len=window_size if intp_freq else None,
        )
        self.rope_glb = VisionRotaryEmbeddingFast(
            dim=half_head_dim,
            pt_seq_len=pt_hw_seq_len,
            ft_seq_len=hw_seq_len if intp_freq else None,
        )

        # stochastic depth decay rule
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]

        self.blocks = nn.ModuleList()
        for i in range(depth):
            block = Block(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                drop_path=dpr[i],
                norm_layer=norm_layer,
                window_size=window_size if i in window_block_indexes else 0,
                use_residual_block=i in residual_block_indexes,
                rope=self.rope_win if i in window_block_indexes else self.rope_glb,
                xattn=xattn
            )
            if use_act_checkpoint:
                # TODO: use torch.utils.checkpoint
                from fairscale.nn.checkpoint import checkpoint_wrapper

                block = checkpoint_wrapper(block)
            self.blocks.append(block)

        self._out_feature_channels = {out_feature: embed_dim}
        self._out_feature_strides = {out_feature: patch_size}
        self._out_features = [out_feature]

        if self.pos_embed is not None:
            nn.init.trunc_normal_(self.pos_embed, std=0.02)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            nn.init.trunc_normal_(m.weight, std=0.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def forward(self, x):
        x = self.patch_embed(x)
        if self.pos_embed is not None:
            x = x + get_abs_pos(
                self.pos_embed, self.pretrain_use_cls_token, (x.shape[1], x.shape[2])
            )

        for blk in self.blocks:
            x = blk(x)

        outputs = {self._out_features[0]: x.permute(0, 3, 1, 2)}
        return outputs


class SimpleFeaturePyramid(Backbone):
    """
    This module implements SimpleFeaturePyramid in :paper:`vitdet`.
    It creates pyramid features built on top of the input feature map.
    """

    def __init__(
        self,
        net,
        in_feature,
        out_channels,
        scale_factors,
        top_block=None,
        norm="LN",
        square_pad=0,
    ):
        """
        Args:
            net (Backbone): module representing the subnetwork backbone.
                Must be a subclass of :class:`Backbone`.
            in_feature (str): names of the input feature maps coming
                from the net.
            out_channels (int): number of channels in the output feature maps.
            scale_factors (list[float]): list of scaling factors to upsample or downsample
                the input features for creating pyramid features.
            top_block (nn.Module or None): if provided, an extra operation will
                be performed on the output of the last (smallest resolution)
be performed on the output of the last (smallest resolution)
|
476 |
+
pyramid output, and the result will extend the result list. The top_block
|
477 |
+
further downsamples the feature map. It must have an attribute
|
478 |
+
"num_levels", meaning the number of extra pyramid levels added by
|
479 |
+
this block, and "in_feature", which is a string representing
|
480 |
+
its input feature (e.g., p5).
|
481 |
+
norm (str): the normalization to use.
|
482 |
+
square_pad (int): If > 0, require input images to be padded to specific square size.
|
483 |
+
"""
|
484 |
+
super(SimpleFeaturePyramid, self).__init__()
|
485 |
+
assert isinstance(net, Backbone)
|
486 |
+
|
487 |
+
self.scale_factors = scale_factors
|
488 |
+
|
489 |
+
input_shapes = net.output_shape()
|
490 |
+
strides = [int(input_shapes[in_feature].stride / scale) for scale in scale_factors]
|
491 |
+
_assert_strides_are_log2_contiguous(strides)
|
492 |
+
|
493 |
+
dim = input_shapes[in_feature].channels
|
494 |
+
self.stages = []
|
495 |
+
use_bias = norm == ""
|
496 |
+
for idx, scale in enumerate(scale_factors):
|
497 |
+
out_dim = dim
|
498 |
+
if scale == 4.0:
|
499 |
+
layers = [
|
500 |
+
nn.ConvTranspose2d(dim, dim // 2, kernel_size=2, stride=2),
|
501 |
+
get_norm(norm, dim // 2),
|
502 |
+
nn.GELU(),
|
503 |
+
nn.ConvTranspose2d(dim // 2, dim // 4, kernel_size=2, stride=2),
|
504 |
+
]
|
505 |
+
out_dim = dim // 4
|
506 |
+
elif scale == 2.0:
|
507 |
+
layers = [nn.ConvTranspose2d(dim, dim // 2, kernel_size=2, stride=2)]
|
508 |
+
out_dim = dim // 2
|
509 |
+
elif scale == 1.0:
|
510 |
+
layers = []
|
511 |
+
elif scale == 0.5:
|
512 |
+
layers = [nn.MaxPool2d(kernel_size=2, stride=2)]
|
513 |
+
else:
|
514 |
+
raise NotImplementedError(f"scale_factor={scale} is not supported yet.")
|
515 |
+
|
516 |
+
layers.extend(
|
517 |
+
[
|
518 |
+
Conv2d(
|
519 |
+
out_dim,
|
520 |
+
out_channels,
|
521 |
+
kernel_size=1,
|
522 |
+
bias=use_bias,
|
523 |
+
norm=get_norm(norm, out_channels),
|
524 |
+
),
|
525 |
+
Conv2d(
|
526 |
+
out_channels,
|
527 |
+
out_channels,
|
528 |
+
kernel_size=3,
|
529 |
+
padding=1,
|
530 |
+
bias=use_bias,
|
531 |
+
norm=get_norm(norm, out_channels),
|
532 |
+
),
|
533 |
+
]
|
534 |
+
)
|
535 |
+
layers = nn.Sequential(*layers)
|
536 |
+
|
537 |
+
stage = int(math.log2(strides[idx]))
|
538 |
+
self.add_module(f"simfp_{stage}", layers)
|
539 |
+
self.stages.append(layers)
|
540 |
+
|
541 |
+
self.net = net
|
542 |
+
self.in_feature = in_feature
|
543 |
+
self.top_block = top_block
|
544 |
+
# Return feature names are "p<stage>", like ["p2", "p3", ..., "p6"]
|
545 |
+
self._out_feature_strides = {"p{}".format(int(math.log2(s))): s for s in strides}
|
546 |
+
# top block output feature maps.
|
547 |
+
if self.top_block is not None:
|
548 |
+
for s in range(stage, stage + self.top_block.num_levels):
|
549 |
+
self._out_feature_strides["p{}".format(s + 1)] = 2 ** (s + 1)
|
550 |
+
|
551 |
+
self._out_features = list(self._out_feature_strides.keys())
|
552 |
+
self._out_feature_channels = {k: out_channels for k in self._out_features}
|
553 |
+
self._size_divisibility = strides[-1]
|
554 |
+
self._square_pad = square_pad
|
555 |
+
|
556 |
+
@property
|
557 |
+
def padding_constraints(self):
|
558 |
+
return {
|
559 |
+
"size_divisiblity": self._size_divisibility,
|
560 |
+
"square_size": self._square_pad,
|
561 |
+
}
|
562 |
+
|
563 |
+
def forward(self, x):
|
564 |
+
"""
|
565 |
+
Args:
|
566 |
+
x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.
|
567 |
+
|
568 |
+
Returns:
|
569 |
+
dict[str->Tensor]:
|
570 |
+
mapping from feature map name to pyramid feature map tensor
|
571 |
+
in high to low resolution order. Returned feature names follow the FPN
|
572 |
+
convention: "p<stage>", where stage has stride = 2 ** stage e.g.,
|
573 |
+
["p2", "p3", ..., "p6"].
|
574 |
+
"""
|
575 |
+
bottom_up_features = self.net(x)
|
576 |
+
features = bottom_up_features[self.in_feature]
|
577 |
+
results = []
|
578 |
+
|
579 |
+
for stage in self.stages:
|
580 |
+
results.append(stage(features))
|
581 |
+
|
582 |
+
if self.top_block is not None:
|
583 |
+
if self.top_block.in_feature in bottom_up_features:
|
584 |
+
top_block_in_feature = bottom_up_features[self.top_block.in_feature]
|
585 |
+
else:
|
586 |
+
top_block_in_feature = results[self._out_features.index(self.top_block.in_feature)]
|
587 |
+
results.extend(self.top_block(top_block_in_feature))
|
588 |
+
assert len(self._out_features) == len(results)
|
589 |
+
return {f: res for f, res in zip(self._out_features, results)}
|
590 |
+
|
591 |
+
|
592 |
+
|
593 |
+
@BACKBONE_REGISTRY.register()
|
594 |
+
class D2_EVA02(SimpleFeaturePyramid):
|
595 |
+
def __init__(self, cfg, input_shape):
|
596 |
+
|
597 |
+
super().__init__(
|
598 |
+
|
599 |
+
net = EVA02_ViT(
|
600 |
+
img_size= cfg.MODEL.EVA02.IMAGE_SIZE,
|
601 |
+
patch_size=cfg.MODEL.EVA02.PATCH_SIZE,
|
602 |
+
window_size= cfg.MODEL.EVA02.WINDOW_SIZE,
|
603 |
+
embed_dim= cfg.MODEL.EVA02.DMBED_DIM,
|
604 |
+
depth= cfg.MODEL.EVA02.DEPTH,
|
605 |
+
num_heads= cfg.MODEL.EVA02.NUM_HEADS ,
|
606 |
+
drop_path_rate= cfg.MODEL.EVA02.DROP_PATH_RATE,
|
607 |
+
mlp_ratio= cfg.MODEL.EVA02.MLP_RATIO,
|
608 |
+
# qkv_bias=True,
|
609 |
+
norm_layer=partial(nn.LayerNorm, eps=1e-6),
|
610 |
+
window_block_indexes= cfg.MODEL.EVA02.WINDOW_BLOCK_INDEXES,
|
611 |
+
# residual_block_indexes=[],
|
612 |
+
# use_rel_pos=False,
|
613 |
+
use_act_checkpoint = cfg.MODEL.EVA02.CHECKPOINT,
|
614 |
+
out_feature="last_feat",
|
615 |
+
# intp_freq=True,
|
616 |
+
),
|
617 |
+
in_feature = "last_feat",
|
618 |
+
out_channels=256,
|
619 |
+
scale_factors=(2.0, 1.0, 0.5), # (4.0, 2.0, 1.0, 0.5) in ViTDet
|
620 |
+
top_block=LastLevelMaxPool(),
|
621 |
+
norm="LN",
|
622 |
+
square_pad=cfg.MODEL.EVA02.IMAGE_SIZE,
|
623 |
+
|
624 |
+
)
|
625 |
+
|
626 |
+
pretrained_weight = cfg.MODEL.EVA02.PRETRAINED_WEIGHT
|
627 |
+
if pretrained_weight:
|
628 |
+
checkpoint = torch.load(pretrained_weight, map_location='cpu')
|
629 |
+
print(f'\nload pretrain weight from {pretrained_weight} \n')
|
630 |
+
|
631 |
+
self.load_state_dict(checkpoint['model'], strict=False)
|
632 |
+
|
633 |
+
|
634 |
+
|
635 |
+
def output_shape(self):
|
636 |
+
return {
|
637 |
+
name: ShapeSpec(
|
638 |
+
channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
|
639 |
+
)
|
640 |
+
for name in self._out_features
|
641 |
+
}
|
642 |
+
|
643 |
+
@property
|
644 |
+
def size_divisibility(self):
|
645 |
+
return 32
|
646 |
+
|
647 |
+
|
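For orientation, D2_EVA02 above is simply an EVA02_ViT wrapped in SimpleFeaturePyramid with config-driven arguments. The sketch below is illustrative only and not part of eva02.py: it assumes the Block/attention code earlier in this file accepts xattn=False (so xformers is not required) and builds a small backbone directly, printing the pyramid shapes.

# Illustrative sketch, not part of eva02.py: mirror what D2_EVA02 builds from cfg.
import torch
from detectron2.modeling.backbone.fpn import LastLevelMaxPool

net = EVA02_ViT(
    img_size=256, patch_size=16, embed_dim=192, depth=4, num_heads=3,
    window_size=16, window_block_indexes=(0, 1, 2), xattn=False,  # xattn=False avoids xformers
)
fpn = SimpleFeaturePyramid(
    net=net, in_feature="last_feat", out_channels=256,
    scale_factors=(2.0, 1.0, 0.5), top_block=LastLevelMaxPool(), norm="LN",
)
feats = fpn(torch.randn(1, 3, 256, 256))
print({k: tuple(v.shape) for k, v in feats.items()})  # p3 (stride 8) ... p6 (stride 64), 256 channels each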
GLEE/glee/backbone/eva_01_utils.py
ADDED
@@ -0,0 +1,222 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import math
import numpy as np
from scipy import interpolate
import torch
import torch.nn as nn
import torch.nn.functional as F

__all__ = [
    "window_partition",
    "window_unpartition",
    "add_decomposed_rel_pos",
    "get_abs_pos",
    "PatchEmbed",
]


def window_partition(x, window_size):
    """
    Partition into non-overlapping windows with padding if needed.
    Args:
        x (tensor): input tokens with [B, H, W, C].
        window_size (int): window size.

    Returns:
        windows: windows after partition with [B * num_windows, window_size, window_size, C].
        (Hp, Wp): padded height and width before partition
    """
    B, H, W, C = x.shape

    pad_h = (window_size - H % window_size) % window_size
    pad_w = (window_size - W % window_size) % window_size
    if pad_h > 0 or pad_w > 0:
        x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
    Hp, Wp = H + pad_h, W + pad_w

    x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C)
    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
    return windows, (Hp, Wp)


def window_unpartition(windows, window_size, pad_hw, hw):
    """
    Window unpartition into original sequences and removing padding.
    Args:
        x (tensor): input tokens with [B * num_windows, window_size, window_size, C].
        window_size (int): window size.
        pad_hw (Tuple): padded height and width (Hp, Wp).
        hw (Tuple): original height and width (H, W) before padding.

    Returns:
        x: unpartitioned sequences with [B, H, W, C].
    """
    Hp, Wp = pad_hw
    H, W = hw
    B = windows.shape[0] // (Hp * Wp // window_size // window_size)
    x = windows.view(B, Hp // window_size, Wp // window_size, window_size, window_size, -1)
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1)

    if Hp > H or Wp > W:
        x = x[:, :H, :W, :].contiguous()
    return x


def get_rel_pos(q_size, k_size, rel_pos, interp_type):
    """
    Get relative positional embeddings according to the relative positions of
    query and key sizes.
    Args:
        q_size (int): size of query q.
        k_size (int): size of key k.
        rel_pos (Tensor): relative position embeddings (L, C).

    Returns:
        Extracted positional embeddings according to relative positions.
    """
    max_rel_dist = int(2 * max(q_size, k_size) - 1)
    # Interpolate rel pos if needed.
    if rel_pos.shape[0] != max_rel_dist:
        if interp_type == "vitdet":
            # the vitdet impl:
            # https://github.com/facebookresearch/detectron2/blob/96c752ce821a3340e27edd51c28a00665dd32a30/detectron2/modeling/backbone/utils.py#L77.

            rel_pos_resized = F.interpolate(
                rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
                size=max_rel_dist,
                mode="linear",
            )
            rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)
        elif interp_type == "beit":
            # steal from beit https://github.com/microsoft/unilm/tree/master/beit
            # modified by Yuxin Fang

            src_size = rel_pos.shape[0]
            dst_size = max_rel_dist

            q = 1.0903078
            dis = []

            cur = 1
            for i in range(src_size // 2):
                dis.append(cur)
                cur += q ** (i + 1)

            r_ids = [-_ for _ in reversed(dis)]
            x = r_ids + [0] + dis
            t = dst_size // 2.0
            dx = np.arange(-t, t + 0.1, 1.0)

            all_rel_pos_bias = []
            for i in range(rel_pos.shape[1]):
                # a hack from https://github.com/baaivision/EVA/issues/8,
                # could also be used in fine-tuning but the performance hasn't been tested.
                z = rel_pos[:, i].view(src_size).cpu().float().detach().numpy()
                f = interpolate.interp1d(x, z, kind='cubic', fill_value="extrapolate")
                all_rel_pos_bias.append(
                    torch.Tensor(f(dx)).contiguous().view(-1, 1).to(rel_pos.device))
            rel_pos_resized = torch.cat(all_rel_pos_bias, dim=-1)
        else:
            raise NotImplementedError()
    else:
        rel_pos_resized = rel_pos

    # Scale the coords with short length if shapes for q and k are different.
    q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
    k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
    relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)

    return rel_pos_resized[relative_coords.long()]


def add_decomposed_rel_pos(attn, q, rel_pos_h, rel_pos_w, q_size, k_size, interp_type):
    """
    Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
    https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py # noqa B950
    Args:
        attn (Tensor): attention map.
        q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
        rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
        rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
        q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
        k_size (Tuple): spatial sequence size of key k with (k_h, k_w).

    Returns:
        attn (Tensor): attention map with added relative positional embeddings.
    """
    q_h, q_w = q_size
    k_h, k_w = k_size
    Rh = get_rel_pos(q_h, k_h, rel_pos_h, interp_type)
    Rw = get_rel_pos(q_w, k_w, rel_pos_w, interp_type)

    B, _, dim = q.shape
    r_q = q.reshape(B, q_h, q_w, dim)
    rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh)
    rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw)

    attn = (
        attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]
    ).view(B, q_h * q_w, k_h * k_w)

    return attn


def get_abs_pos(abs_pos, has_cls_token, hw):
    """
    Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token
    dimension for the original embeddings.
    Args:
        abs_pos (Tensor): absolute positional embeddings with (1, num_position, C).
        has_cls_token (bool): If true, has 1 embedding in abs_pos for cls token.
        hw (Tuple): size of input image tokens.

    Returns:
        Absolute positional embeddings after processing with shape (1, H, W, C)
    """
    h, w = hw
    if has_cls_token:
        abs_pos = abs_pos[:, 1:]
    xy_num = abs_pos.shape[1]
    size = int(math.sqrt(xy_num))
    assert size * size == xy_num

    if size != h or size != w:
        new_abs_pos = F.interpolate(
            abs_pos.reshape(1, size, size, -1).permute(0, 3, 1, 2),
            size=(h, w),
            mode="bicubic",
            align_corners=False,
        )

        return new_abs_pos.permute(0, 2, 3, 1)
    else:
        return abs_pos.reshape(1, h, w, -1)


class PatchEmbed(nn.Module):
    """
    Image to Patch Embedding.
    """

    def __init__(
        self, kernel_size=(16, 16), stride=(16, 16), padding=(0, 0), in_chans=3, embed_dim=768
    ):
        """
        Args:
            kernel_size (Tuple): kernel size of the projection layer.
            stride (Tuple): stride of the projection layer.
            padding (Tuple): padding size of the projection layer.
            in_chans (int): Number of input image channels.
            embed_dim (int): Patch embedding dimension.
        """
        super().__init__()

        self.proj = nn.Conv2d(
            in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding
        )

    def forward(self, x):
        x = self.proj(x)
        # B C H W -> B H W C
        x = x.permute(0, 2, 3, 1)
        return x

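A quick way to see what window_partition and window_unpartition do is the round-trip below; it is an illustrative sketch, not part of eva_01_utils.py.

# Illustrative round-trip for window_partition / window_unpartition.
import torch

tokens = torch.randn(2, 14, 14, 96)                    # [B, H, W, C]; 14 is not divisible by 8
windows, pad_hw = window_partition(tokens, window_size=8)
print(windows.shape, pad_hw)                           # torch.Size([8, 8, 8, 96]) (16, 16)
restored = window_unpartition(windows, 8, pad_hw, (14, 14))
assert torch.equal(restored, tokens)                   # padding is stripped again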
GLEE/glee/backbone/eva_02_utils.py
ADDED
@@ -0,0 +1,356 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import math
import numpy as np
from scipy import interpolate
import torch
import torch.nn as nn
import torch.nn.functional as F

__all__ = [
    "window_partition",
    "window_unpartition",
    "add_decomposed_rel_pos",
    "get_abs_pos",
    "PatchEmbed",
    "VisionRotaryEmbeddingFast",
]


def window_partition(x, window_size):
    """
    Partition into non-overlapping windows with padding if needed.
    Args:
        x (tensor): input tokens with [B, H, W, C].
        window_size (int): window size.

    Returns:
        windows: windows after partition with [B * num_windows, window_size, window_size, C].
        (Hp, Wp): padded height and width before partition
    """
    B, H, W, C = x.shape

    pad_h = (window_size - H % window_size) % window_size
    pad_w = (window_size - W % window_size) % window_size
    if pad_h > 0 or pad_w > 0:
        x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
    Hp, Wp = H + pad_h, W + pad_w

    x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C)
    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
    return windows, (Hp, Wp)


def window_unpartition(windows, window_size, pad_hw, hw):
    """
    Window unpartition into original sequences and removing padding.
    Args:
        x (tensor): input tokens with [B * num_windows, window_size, window_size, C].
        window_size (int): window size.
        pad_hw (Tuple): padded height and width (Hp, Wp).
        hw (Tuple): original height and width (H, W) before padding.

    Returns:
        x: unpartitioned sequences with [B, H, W, C].
    """
    Hp, Wp = pad_hw
    H, W = hw
    B = windows.shape[0] // (Hp * Wp // window_size // window_size)
    x = windows.view(B, Hp // window_size, Wp // window_size, window_size, window_size, -1)
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1)

    if Hp > H or Wp > W:
        x = x[:, :H, :W, :].contiguous()
    return x


def get_rel_pos(q_size, k_size, rel_pos):
    """
    Get relative positional embeddings according to the relative positions of
    query and key sizes.
    Args:
        q_size (int): size of query q.
        k_size (int): size of key k.
        rel_pos (Tensor): relative position embeddings (L, C).

    Returns:
        Extracted positional embeddings according to relative positions.
    """
    max_rel_dist = int(2 * max(q_size, k_size) - 1)
    use_log_interpolation = True

    # Interpolate rel pos if needed.
    if rel_pos.shape[0] != max_rel_dist:
        if not use_log_interpolation:
            # Interpolate rel pos.
            rel_pos_resized = F.interpolate(
                rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
                size=max_rel_dist,
                mode="linear",
            )
            rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)
        else:
            src_size = rel_pos.shape[0]
            dst_size = max_rel_dist

            # q = 1.13492
            q = 1.0903078
            dis = []

            cur = 1
            for i in range(src_size // 2):
                dis.append(cur)
                cur += q ** (i + 1)

            r_ids = [-_ for _ in reversed(dis)]
            x = r_ids + [0] + dis
            t = dst_size // 2.0
            dx = np.arange(-t, t + 0.1, 1.0)
            # print("x = %s" % str(x))
            # print("dx = %s" % str(dx))
            all_rel_pos_bias = []
            for i in range(rel_pos.shape[1]):
                z = rel_pos[:, i].view(src_size).cpu().float().numpy()
                f = interpolate.interp1d(x, z, kind='cubic', fill_value="extrapolate")
                all_rel_pos_bias.append(
                    torch.Tensor(f(dx)).contiguous().view(-1, 1).to(rel_pos.device))
            rel_pos_resized = torch.cat(all_rel_pos_bias, dim=-1)
    else:
        rel_pos_resized = rel_pos

    # Scale the coords with short length if shapes for q and k are different.
    q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
    k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
    relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)

    return rel_pos_resized[relative_coords.long()]


def add_decomposed_rel_pos(attn, q, rel_pos_h, rel_pos_w, q_size, k_size):
    """
    Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
    https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py # noqa B950
    Args:
        attn (Tensor): attention map.
        q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
        rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
        rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
        q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
        k_size (Tuple): spatial sequence size of key k with (k_h, k_w).

    Returns:
        attn (Tensor): attention map with added relative positional embeddings.
    """
    q_h, q_w = q_size
    k_h, k_w = k_size
    Rh = get_rel_pos(q_h, k_h, rel_pos_h)
    Rw = get_rel_pos(q_w, k_w, rel_pos_w)

    B, _, dim = q.shape
    r_q = q.reshape(B, q_h, q_w, dim)
    rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh)
    rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw)

    attn = (
        attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]
    ).view(B, q_h * q_w, k_h * k_w)

    return attn


def get_abs_pos(abs_pos, has_cls_token, hw):
    """
    Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token
    dimension for the original embeddings.
    Args:
        abs_pos (Tensor): absolute positional embeddings with (1, num_position, C).
        has_cls_token (bool): If true, has 1 embedding in abs_pos for cls token.
        hw (Tuple): size of input image tokens.

    Returns:
        Absolute positional embeddings after processing with shape (1, H, W, C)
    """
    h, w = hw
    if has_cls_token:
        abs_pos = abs_pos[:, 1:]
    xy_num = abs_pos.shape[1]
    size = int(math.sqrt(xy_num))
    assert size * size == xy_num

    if size != h or size != w:
        new_abs_pos = F.interpolate(
            abs_pos.reshape(1, size, size, -1).permute(0, 3, 1, 2),
            size=(h, w),
            mode="bicubic",
            align_corners=False,
        )

        return new_abs_pos.permute(0, 2, 3, 1)
    else:
        return abs_pos.reshape(1, h, w, -1)


class PatchEmbed(nn.Module):
    """
    Image to Patch Embedding.
    """

    def __init__(
        self, kernel_size=(16, 16), stride=(16, 16), padding=(0, 0), in_chans=3, embed_dim=768
    ):
        """
        Args:
            kernel_size (Tuple): kernel size of the projection layer.
            stride (Tuple): stride of the projection layer.
            padding (Tuple): padding size of the projection layer.
            in_chans (int): Number of input image channels.
            embed_dim (int): Patch embedding dimension.
        """
        super().__init__()

        self.proj = nn.Conv2d(
            in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding
        )

    def forward(self, x):
        x = self.proj(x)
        # B C H W -> B H W C
        x = x.permute(0, 2, 3, 1)
        return x


from math import pi

import torch
from torch import nn

from einops import rearrange, repeat


def broadcat(tensors, dim=-1):
    num_tensors = len(tensors)
    shape_lens = set(list(map(lambda t: len(t.shape), tensors)))
    assert len(shape_lens) == 1, 'tensors must all have the same number of dimensions'
    shape_len = list(shape_lens)[0]
    dim = (dim + shape_len) if dim < 0 else dim
    dims = list(zip(*map(lambda t: list(t.shape), tensors)))
    expandable_dims = [(i, val) for i, val in enumerate(dims) if i != dim]
    assert all([*map(lambda t: len(set(t[1])) <= 2, expandable_dims)]), 'invalid dimensions for broadcastable concatenation'
    max_dims = list(map(lambda t: (t[0], max(t[1])), expandable_dims))
    expanded_dims = list(map(lambda t: (t[0], (t[1],) * num_tensors), max_dims))
    expanded_dims.insert(dim, (dim, dims[dim]))
    expandable_shapes = list(zip(*map(lambda t: t[1], expanded_dims)))
    tensors = list(map(lambda t: t[0].expand(*t[1]), zip(tensors, expandable_shapes)))
    return torch.cat(tensors, dim=dim)


def rotate_half(x):
    x = rearrange(x, '... (d r) -> ... d r', r=2)
    x1, x2 = x.unbind(dim=-1)
    x = torch.stack((-x2, x1), dim=-1)
    return rearrange(x, '... d r -> ... (d r)')


class VisionRotaryEmbedding(nn.Module):
    def __init__(
        self,
        dim,
        pt_seq_len,
        ft_seq_len=None,
        custom_freqs=None,
        freqs_for='lang',
        theta=10000,
        max_freq=10,
        num_freqs=1,
    ):
        super().__init__()
        if custom_freqs:
            freqs = custom_freqs
        elif freqs_for == 'lang':
            freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim))
        elif freqs_for == 'pixel':
            freqs = torch.linspace(1., max_freq / 2, dim // 2) * pi
        elif freqs_for == 'constant':
            freqs = torch.ones(num_freqs).float()
        else:
            raise ValueError(f'unknown modality {freqs_for}')

        if ft_seq_len is None:
            ft_seq_len = pt_seq_len
        t = torch.arange(ft_seq_len) / ft_seq_len * pt_seq_len

        freqs_h = torch.einsum('..., f -> ... f', t, freqs)
        freqs_h = repeat(freqs_h, '... n -> ... (n r)', r=2)

        freqs_w = torch.einsum('..., f -> ... f', t, freqs)
        freqs_w = repeat(freqs_w, '... n -> ... (n r)', r=2)

        freqs = broadcat((freqs_h[:, None, :], freqs_w[None, :, :]), dim=-1)

        self.register_buffer("freqs_cos", freqs.cos())
        self.register_buffer("freqs_sin", freqs.sin())

        print('======== shape of rope freq', self.freqs_cos.shape, '========')

    def forward(self, t, start_index=0):
        rot_dim = self.freqs_cos.shape[-1]
        end_index = start_index + rot_dim
        assert rot_dim <= t.shape[-1], f'feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}'
        t_left, t, t_right = t[..., :start_index], t[..., start_index:end_index], t[..., end_index:]
        t = (t * self.freqs_cos) + (rotate_half(t) * self.freqs_sin)
        return torch.cat((t_left, t, t_right), dim=-1)


class VisionRotaryEmbeddingFast(nn.Module):
    def __init__(
        self,
        dim,
        pt_seq_len=16,
        ft_seq_len=None,
        custom_freqs=None,
        freqs_for='lang',
        theta=10000,
        max_freq=10,
        num_freqs=1,
    ):
        super().__init__()
        if custom_freqs:
            freqs = custom_freqs
        elif freqs_for == 'lang':
            freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim))
        elif freqs_for == 'pixel':
            freqs = torch.linspace(1., max_freq / 2, dim // 2) * pi
        elif freqs_for == 'constant':
            freqs = torch.ones(num_freqs).float()
        else:
            raise ValueError(f'unknown modality {freqs_for}')

        if ft_seq_len is None:
            ft_seq_len = pt_seq_len
        t = torch.arange(ft_seq_len) / ft_seq_len * pt_seq_len

        freqs = torch.einsum('..., f -> ... f', t, freqs)
        freqs = repeat(freqs, '... n -> ... (n r)', r=2)
        freqs = broadcat((freqs[:, None, :], freqs[None, :, :]), dim=-1)

        freqs_cos = freqs.cos().view(-1, freqs.shape[-1])
        freqs_sin = freqs.sin().view(-1, freqs.shape[-1])

        self.register_buffer("freqs_cos", freqs_cos)
        self.register_buffer("freqs_sin", freqs_sin)

        print('======== shape of rope freq', self.freqs_cos.shape, '========')

    # def forward(self, t): return t * self.freqs_cos + rotate_half(t) * self.freqs_sin
    def forward(self, t):
        if t.shape[2] != self.freqs_cos.shape[0]:
            t_len = t.shape[2]
            output = t * self.freqs_cos[:t_len] + rotate_half(t) * self.freqs_sin[:t_len]
        else:
            output = t * self.freqs_cos + rotate_half(t) * self.freqs_sin
        return output

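VisionRotaryEmbeddingFast is applied to query/key tensors shaped [B, num_heads, H*W, head_dim] inside the EVA-02 attention blocks. The snippet below is an illustrative shape check only, not part of eva_02_utils.py; the head_dim here must equal twice the dim passed to the embedding.

# Illustrative use of VisionRotaryEmbeddingFast on [B, heads, H*W, head_dim] tensors.
import torch

rope = VisionRotaryEmbeddingFast(dim=32, pt_seq_len=16, ft_seq_len=16)  # dim = head_dim // 2
q = torch.randn(1, 6, 16 * 16, 64)         # 16x16 tokens, head_dim = 64
q_rot = rope(q)                             # rotates channel pairs according to 2D position
print(q_rot.shape)                          # torch.Size([1, 6, 256, 64]) -- shape is unchanged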
GLEE/glee/backbone/internimage.py
ADDED
@@ -0,0 +1,737 @@
1 |
+
# --------------------------------------------------------
|
2 |
+
# InternImage
|
3 |
+
# Copyright (c) 2022 OpenGVLab
|
4 |
+
# Licensed under The MIT License [see LICENSE for details]
|
5 |
+
# --------------------------------------------------------
|
6 |
+
|
7 |
+
import torch
|
8 |
+
import torch.nn as nn
|
9 |
+
import torch.nn.functional as F
|
10 |
+
import torch.utils.checkpoint as checkpoint
|
11 |
+
from timm.models.layers import trunc_normal_, DropPath
|
12 |
+
|
13 |
+
from detectron2.utils.logger import setup_logger
|
14 |
+
from detectron2.modeling.backbone import Backbone
|
15 |
+
|
16 |
+
|
17 |
+
from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec
|
18 |
+
from .ops_dcnv3 import modules as opsm
|
19 |
+
|
20 |
+
|
21 |
+
|
22 |
+
class to_channels_first(nn.Module):
|
23 |
+
|
24 |
+
def __init__(self):
|
25 |
+
super().__init__()
|
26 |
+
|
27 |
+
def forward(self, x):
|
28 |
+
return x.permute(0, 3, 1, 2)
|
29 |
+
|
30 |
+
|
31 |
+
class to_channels_last(nn.Module):
|
32 |
+
|
33 |
+
def __init__(self):
|
34 |
+
super().__init__()
|
35 |
+
|
36 |
+
def forward(self, x):
|
37 |
+
return x.permute(0, 2, 3, 1)
|
38 |
+
|
39 |
+
|
40 |
+
def build_norm_layer(dim,
|
41 |
+
norm_layer,
|
42 |
+
in_format='channels_last',
|
43 |
+
out_format='channels_last',
|
44 |
+
eps=1e-6):
|
45 |
+
layers = []
|
46 |
+
if norm_layer == 'BN':
|
47 |
+
if in_format == 'channels_last':
|
48 |
+
layers.append(to_channels_first())
|
49 |
+
layers.append(nn.BatchNorm2d(dim))
|
50 |
+
if out_format == 'channels_last':
|
51 |
+
layers.append(to_channels_last())
|
52 |
+
elif norm_layer == 'LN':
|
53 |
+
if in_format == 'channels_first':
|
54 |
+
layers.append(to_channels_last())
|
55 |
+
layers.append(nn.LayerNorm(dim, eps=eps))
|
56 |
+
if out_format == 'channels_first':
|
57 |
+
layers.append(to_channels_first())
|
58 |
+
else:
|
59 |
+
raise NotImplementedError(
|
60 |
+
f'build_norm_layer does not support {norm_layer}')
|
61 |
+
return nn.Sequential(*layers)
|
62 |
+
|
63 |
+
|
64 |
+
def build_act_layer(act_layer):
|
65 |
+
if act_layer == 'ReLU':
|
66 |
+
return nn.ReLU(inplace=True)
|
67 |
+
elif act_layer == 'SiLU':
|
68 |
+
return nn.SiLU(inplace=True)
|
69 |
+
elif act_layer == 'GELU':
|
70 |
+
return nn.GELU()
|
71 |
+
|
72 |
+
raise NotImplementedError(f'build_act_layer does not support {act_layer}')
|
73 |
+
|
74 |
+
|
75 |
+
class CrossAttention(nn.Module):
|
76 |
+
r""" Cross Attention Module
|
77 |
+
Args:
|
78 |
+
dim (int): Number of input channels.
|
79 |
+
num_heads (int): Number of attention heads. Default: 8
|
80 |
+
qkv_bias (bool, optional): If True, add a learnable bias to q, k, v.
|
81 |
+
Default: False.
|
82 |
+
qk_scale (float | None, optional): Override default qk scale of
|
83 |
+
head_dim ** -0.5 if set. Default: None.
|
84 |
+
attn_drop (float, optional): Dropout ratio of attention weight.
|
85 |
+
Default: 0.0
|
86 |
+
proj_drop (float, optional): Dropout ratio of output. Default: 0.0
|
87 |
+
attn_head_dim (int, optional): Dimension of attention head.
|
88 |
+
out_dim (int, optional): Dimension of output.
|
89 |
+
"""
|
90 |
+
|
91 |
+
def __init__(self,
|
92 |
+
dim,
|
93 |
+
num_heads=8,
|
94 |
+
qkv_bias=False,
|
95 |
+
qk_scale=None,
|
96 |
+
attn_drop=0.,
|
97 |
+
proj_drop=0.,
|
98 |
+
attn_head_dim=None,
|
99 |
+
out_dim=None):
|
100 |
+
super().__init__()
|
101 |
+
if out_dim is None:
|
102 |
+
out_dim = dim
|
103 |
+
self.num_heads = num_heads
|
104 |
+
head_dim = dim // num_heads
|
105 |
+
if attn_head_dim is not None:
|
106 |
+
head_dim = attn_head_dim
|
107 |
+
all_head_dim = head_dim * self.num_heads
|
108 |
+
self.scale = qk_scale or head_dim ** -0.5
|
109 |
+
assert all_head_dim == dim
|
110 |
+
|
111 |
+
self.q = nn.Linear(dim, all_head_dim, bias=False)
|
112 |
+
self.k = nn.Linear(dim, all_head_dim, bias=False)
|
113 |
+
self.v = nn.Linear(dim, all_head_dim, bias=False)
|
114 |
+
|
115 |
+
if qkv_bias:
|
116 |
+
self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
|
117 |
+
self.k_bias = nn.Parameter(torch.zeros(all_head_dim))
|
118 |
+
self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
|
119 |
+
else:
|
120 |
+
self.q_bias = None
|
121 |
+
self.k_bias = None
|
122 |
+
self.v_bias = None
|
123 |
+
|
124 |
+
self.attn_drop = nn.Dropout(attn_drop)
|
125 |
+
self.proj = nn.Linear(all_head_dim, out_dim)
|
126 |
+
self.proj_drop = nn.Dropout(proj_drop)
|
127 |
+
|
128 |
+
def forward(self, x, k=None, v=None):
|
129 |
+
B, N, C = x.shape
|
130 |
+
N_k = k.shape[1]
|
131 |
+
N_v = v.shape[1]
|
132 |
+
|
133 |
+
q_bias, k_bias, v_bias = None, None, None
|
134 |
+
if self.q_bias is not None:
|
135 |
+
q_bias = self.q_bias
|
136 |
+
k_bias = self.k_bias
|
137 |
+
v_bias = self.v_bias
|
138 |
+
|
139 |
+
q = F.linear(input=x, weight=self.q.weight, bias=q_bias)
|
140 |
+
q = q.reshape(B, N, 1, self.num_heads,
|
141 |
+
-1).permute(2, 0, 3, 1,
|
142 |
+
4).squeeze(0) # (B, N_head, N_q, dim)
|
143 |
+
|
144 |
+
k = F.linear(input=k, weight=self.k.weight, bias=k_bias)
|
145 |
+
k = k.reshape(B, N_k, 1, self.num_heads, -1).permute(2, 0, 3, 1,
|
146 |
+
4).squeeze(0)
|
147 |
+
|
148 |
+
v = F.linear(input=v, weight=self.v.weight, bias=v_bias)
|
149 |
+
v = v.reshape(B, N_v, 1, self.num_heads, -1).permute(2, 0, 3, 1,
|
150 |
+
4).squeeze(0)
|
151 |
+
|
152 |
+
q = q * self.scale
|
153 |
+
attn = (q @ k.transpose(-2, -1)) # (B, N_head, N_q, N_k)
|
154 |
+
|
155 |
+
attn = attn.softmax(dim=-1)
|
156 |
+
attn = self.attn_drop(attn)
|
157 |
+
|
158 |
+
x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
|
159 |
+
x = self.proj(x)
|
160 |
+
x = self.proj_drop(x)
|
161 |
+
|
162 |
+
return x
|
163 |
+
|
164 |
+
|
165 |
+
class AttentiveBlock(nn.Module):
|
166 |
+
r"""Attentive Block
|
167 |
+
Args:
|
168 |
+
dim (int): Number of input channels.
|
169 |
+
num_heads (int): Number of attention heads. Default: 8
|
170 |
+
qkv_bias (bool, optional): If True, add a learnable bias to q, k, v.
|
171 |
+
Default: False.
|
172 |
+
qk_scale (float | None, optional): Override default qk scale of
|
173 |
+
head_dim ** -0.5 if set. Default: None.
|
174 |
+
drop (float, optional): Dropout rate. Default: 0.0.
|
175 |
+
attn_drop (float, optional): Attention dropout rate. Default: 0.0.
|
176 |
+
drop_path (float | tuple[float], optional): Stochastic depth rate.
|
177 |
+
Default: 0.0.
|
178 |
+
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm.
|
179 |
+
attn_head_dim (int, optional): Dimension of attention head. Default: None.
|
180 |
+
out_dim (int, optional): Dimension of output. Default: None.
|
181 |
+
"""
|
182 |
+
|
183 |
+
def __init__(self,
|
184 |
+
dim,
|
185 |
+
num_heads,
|
186 |
+
qkv_bias=False,
|
187 |
+
qk_scale=None,
|
188 |
+
drop=0.,
|
189 |
+
attn_drop=0.,
|
190 |
+
drop_path=0.,
|
191 |
+
norm_layer="LN",
|
192 |
+
attn_head_dim=None,
|
193 |
+
out_dim=None):
|
194 |
+
super().__init__()
|
195 |
+
|
196 |
+
self.norm1_q = build_norm_layer(dim, norm_layer, eps=1e-6)
|
197 |
+
self.norm1_k = build_norm_layer(dim, norm_layer, eps=1e-6)
|
198 |
+
self.norm1_v = build_norm_layer(dim, norm_layer, eps=1e-6)
|
199 |
+
self.cross_dcn = CrossAttention(dim,
|
200 |
+
num_heads=num_heads,
|
201 |
+
qkv_bias=qkv_bias,
|
202 |
+
qk_scale=qk_scale,
|
203 |
+
attn_drop=attn_drop,
|
204 |
+
proj_drop=drop,
|
205 |
+
attn_head_dim=attn_head_dim,
|
206 |
+
out_dim=out_dim)
|
207 |
+
|
208 |
+
self.drop_path = DropPath(
|
209 |
+
drop_path) if drop_path > 0. else nn.Identity()
|
210 |
+
|
211 |
+
def forward(self,
|
212 |
+
x_q,
|
213 |
+
x_kv,
|
214 |
+
pos_q,
|
215 |
+
pos_k,
|
216 |
+
bool_masked_pos,
|
217 |
+
rel_pos_bias=None):
|
218 |
+
x_q = self.norm1_q(x_q + pos_q)
|
219 |
+
x_k = self.norm1_k(x_kv + pos_k)
|
220 |
+
x_v = self.norm1_v(x_kv)
|
221 |
+
|
222 |
+
x = self.cross_dcn(x_q, k=x_k, v=x_v)
|
223 |
+
|
224 |
+
return x
|
225 |
+
|
226 |
+
|
227 |
+
class AttentionPoolingBlock(AttentiveBlock):
|
228 |
+
|
229 |
+
def forward(self, x):
|
230 |
+
x_q = x.mean(1, keepdim=True)
|
231 |
+
x_kv = x
|
232 |
+
pos_q, pos_k = 0, 0
|
233 |
+
x = super().forward(x_q, x_kv, pos_q, pos_k,
|
234 |
+
bool_masked_pos=None,
|
235 |
+
rel_pos_bias=None)
|
236 |
+
x = x.squeeze(1)
|
237 |
+
return x
|
238 |
+
|
239 |
+
|
240 |
+
class StemLayer(nn.Module):
|
241 |
+
r""" Stem layer of InternImage
|
242 |
+
Args:
|
243 |
+
in_chans (int): number of input channels
|
244 |
+
out_chans (int): number of output channels
|
245 |
+
act_layer (str): activation layer
|
246 |
+
norm_layer (str): normalization layer
|
247 |
+
"""
|
248 |
+
|
249 |
+
def __init__(self,
|
250 |
+
in_chans=3,
|
251 |
+
out_chans=96,
|
252 |
+
act_layer='GELU',
|
253 |
+
norm_layer='BN'):
|
254 |
+
super().__init__()
|
255 |
+
self.conv1 = nn.Conv2d(in_chans,
|
256 |
+
out_chans // 2,
|
257 |
+
kernel_size=3,
|
258 |
+
stride=2,
|
259 |
+
padding=1)
|
260 |
+
self.norm1 = build_norm_layer(out_chans // 2, norm_layer,
|
261 |
+
'channels_first', 'channels_first')
|
262 |
+
self.act = build_act_layer(act_layer)
|
263 |
+
self.conv2 = nn.Conv2d(out_chans // 2,
|
264 |
+
out_chans,
|
265 |
+
kernel_size=3,
|
266 |
+
stride=2,
|
267 |
+
padding=1)
|
268 |
+
self.norm2 = build_norm_layer(out_chans, norm_layer, 'channels_first',
|
269 |
+
'channels_last')
|
270 |
+
|
271 |
+
def forward(self, x):
|
272 |
+
x = self.conv1(x)
|
273 |
+
x = self.norm1(x)
|
274 |
+
x = self.act(x)
|
275 |
+
x = self.conv2(x)
|
276 |
+
x = self.norm2(x)
|
277 |
+
return x
|
278 |
+
|
279 |
+
|
280 |
+
class DownsampleLayer(nn.Module):
|
281 |
+
r""" Downsample layer of InternImage
|
282 |
+
Args:
|
283 |
+
channels (int): number of input channels
|
284 |
+
norm_layer (str): normalization layer
|
285 |
+
"""
|
286 |
+
|
287 |
+
def __init__(self, channels, norm_layer='LN'):
|
288 |
+
super().__init__()
|
289 |
+
self.conv = nn.Conv2d(channels,
|
290 |
+
2 * channels,
|
291 |
+
kernel_size=3,
|
292 |
+
stride=2,
|
293 |
+
padding=1,
|
294 |
+
bias=False)
|
295 |
+
self.norm = build_norm_layer(2 * channels, norm_layer,
|
296 |
+
'channels_first', 'channels_last')
|
297 |
+
|
298 |
+
def forward(self, x):
|
299 |
+
x = self.conv(x.permute(0, 3, 1, 2))
|
300 |
+
x = self.norm(x)
|
301 |
+
return x
|
302 |
+
|
303 |
+
|
304 |
+
class MLPLayer(nn.Module):
|
305 |
+
r""" MLP layer of InternImage
|
306 |
+
Args:
|
307 |
+
in_features (int): number of input features
|
308 |
+
hidden_features (int): number of hidden features
|
309 |
+
out_features (int): number of output features
|
310 |
+
act_layer (str): activation layer
|
311 |
+
drop (float): dropout rate
|
312 |
+
"""
|
313 |
+
|
314 |
+
def __init__(self,
|
315 |
+
in_features,
|
316 |
+
hidden_features=None,
|
317 |
+
out_features=None,
|
318 |
+
act_layer='GELU',
|
319 |
+
drop=0.):
|
320 |
+
super().__init__()
|
321 |
+
out_features = out_features or in_features
|
322 |
+
hidden_features = hidden_features or in_features
|
323 |
+
self.fc1 = nn.Linear(in_features, hidden_features)
|
324 |
+
self.act = build_act_layer(act_layer)
|
325 |
+
self.fc2 = nn.Linear(hidden_features, out_features)
|
326 |
+
self.drop = nn.Dropout(drop)
|
327 |
+
|
328 |
+
def forward(self, x):
|
329 |
+
x = self.fc1(x)
|
330 |
+
x = self.act(x)
|
331 |
+
x = self.drop(x)
|
332 |
+
x = self.fc2(x)
|
333 |
+
x = self.drop(x)
|
334 |
+
return x
|
335 |
+
|
336 |
+
|
337 |
+
class InternImageLayer(nn.Module):
|
338 |
+
r""" Basic layer of InternImage
|
339 |
+
Args:
|
340 |
+
core_op (nn.Module): core operation of InternImage
|
341 |
+
channels (int): number of input channels
|
342 |
+
groups (list): Groups of each block.
|
343 |
+
mlp_ratio (float): ratio of mlp hidden features to input channels
|
344 |
+
drop (float): dropout rate
|
345 |
+
drop_path (float): drop path rate
|
346 |
+
act_layer (str): activation layer
|
347 |
+
norm_layer (str): normalization layer
|
348 |
+
post_norm (bool): whether to use post normalization
|
349 |
+
layer_scale (float): layer scale
|
350 |
+
offset_scale (float): offset scale
|
351 |
+
with_cp (bool): whether to use checkpoint
|
352 |
+
"""
|
353 |
+
|
354 |
+
def __init__(self,
|
355 |
+
core_op,
|
356 |
+
channels,
|
357 |
+
groups,
|
358 |
+
mlp_ratio=4.,
|
359 |
+
drop=0.,
|
360 |
+
drop_path=0.,
|
361 |
+
act_layer='GELU',
|
362 |
+
norm_layer='LN',
|
363 |
+
post_norm=False,
|
364 |
+
layer_scale=None,
|
365 |
+
offset_scale=1.0,
|
366 |
+
with_cp=False,
|
367 |
+
dw_kernel_size=None, # for InternImage-H/G
|
368 |
+
res_post_norm=False, # for InternImage-H/G
|
369 |
+
center_feature_scale=False): # for InternImage-H/G
|
370 |
+
super().__init__()
|
371 |
+
self.channels = channels
|
372 |
+
self.groups = groups
|
373 |
+
self.mlp_ratio = mlp_ratio
|
374 |
+
self.with_cp = with_cp
|
375 |
+
|
376 |
+
self.norm1 = build_norm_layer(channels, 'LN')
|
377 |
+
self.post_norm = post_norm
|
378 |
+
self.dcn = core_op(
|
379 |
+
channels=channels,
|
380 |
+
kernel_size=3,
|
381 |
+
stride=1,
|
382 |
+
pad=1,
|
383 |
+
dilation=1,
|
384 |
+
group=groups,
|
385 |
+
offset_scale=offset_scale,
|
386 |
+
act_layer=act_layer,
|
387 |
+
norm_layer=norm_layer,
|
388 |
+
dw_kernel_size=dw_kernel_size, # for InternImage-H/G
|
389 |
+
center_feature_scale=center_feature_scale) # for InternImage-H/G
|
390 |
+
self.drop_path = DropPath(drop_path) if drop_path > 0. \
|
391 |
+
else nn.Identity()
|
392 |
+
self.norm2 = build_norm_layer(channels, 'LN')
|
393 |
+
self.mlp = MLPLayer(in_features=channels,
|
394 |
+
hidden_features=int(channels * mlp_ratio),
|
395 |
+
act_layer=act_layer,
|
396 |
+
drop=drop)
|
397 |
+
self.layer_scale = layer_scale is not None
|
398 |
+
if self.layer_scale:
|
399 |
+
self.gamma1 = nn.Parameter(layer_scale * torch.ones(channels),
|
400 |
+
requires_grad=True)
|
401 |
+
self.gamma2 = nn.Parameter(layer_scale * torch.ones(channels),
|
402 |
+
requires_grad=True)
|
403 |
+
self.res_post_norm = res_post_norm
|
404 |
+
if res_post_norm:
|
405 |
+
self.res_post_norm1 = build_norm_layer(channels, 'LN')
|
406 |
+
self.res_post_norm2 = build_norm_layer(channels, 'LN')
|
407 |
+
|
408 |
+
def forward(self, x):
|
409 |
+
|
410 |
+
def _inner_forward(x):
|
411 |
+
if not self.layer_scale:
|
412 |
+
if self.post_norm:
|
413 |
+
x = x + self.drop_path(self.norm1(self.dcn(x)))
|
414 |
+
x = x + self.drop_path(self.norm2(self.mlp(x)))
|
415 |
+
elif self.res_post_norm: # for InternImage-H/G
|
416 |
+
x = x + self.drop_path(self.res_post_norm1(self.dcn(self.norm1(x))))
|
417 |
+
x = x + self.drop_path(self.res_post_norm2(self.mlp(self.norm2(x))))
|
418 |
+
else:
|
419 |
+
x = x + self.drop_path(self.dcn(self.norm1(x)))
|
420 |
+
x = x + self.drop_path(self.mlp(self.norm2(x)))
|
421 |
+
return x
|
422 |
+
if self.post_norm:
|
423 |
+
x = x + self.drop_path(self.gamma1 * self.norm1(self.dcn(x)))
|
424 |
+
x = x + self.drop_path(self.gamma2 * self.norm2(self.mlp(x)))
|
425 |
+
else:
|
426 |
+
x = x + self.drop_path(self.gamma1 * self.dcn(self.norm1(x)))
|
427 |
+
x = x + self.drop_path(self.gamma2 * self.mlp(self.norm2(x)))
|
428 |
+
return x
|
429 |
+
|
430 |
+
if self.with_cp and x.requires_grad:
|
431 |
+
x = checkpoint.checkpoint(_inner_forward, x)
|
432 |
+
else:
|
433 |
+
x = _inner_forward(x)
|
434 |
+
return x
|
435 |
+
|
436 |
+
|
437 |
+
class InternImageBlock(nn.Module):
|
438 |
+
r""" Block of InternImage
|
439 |
+
Args:
|
440 |
+
core_op (nn.Module): core operation of InternImage
|
441 |
+
channels (int): number of input channels
|
442 |
+
depths (list): Depth of each block.
|
443 |
+
groups (list): Groups of each block.
|
444 |
+
mlp_ratio (float): ratio of mlp hidden features to input channels
|
445 |
+
drop (float): dropout rate
|
446 |
+
drop_path (float): drop path rate
|
447 |
+
act_layer (str): activation layer
|
448 |
+
norm_layer (str): normalization layer
|
449 |
+
post_norm (bool): whether to use post normalization
|
450 |
+
layer_scale (float): layer scale
|
451 |
+
offset_scale (float): offset scale
|
452 |
+
with_cp (bool): whether to use checkpoint
|
453 |
+
"""
|
454 |
+
|
455 |
+
def __init__(self,
|
456 |
+
core_op,
|
457 |
+
channels,
|
458 |
+
depth,
|
459 |
+
groups,
|
460 |
+
downsample=True,
|
461 |
+
mlp_ratio=4.,
|
462 |
+
drop=0.,
|
463 |
+
drop_path=0.,
|
464 |
+
act_layer='GELU',
|
465 |
+
norm_layer='LN',
|
466 |
+
post_norm=False,
|
467 |
+
offset_scale=1.0,
|
468 |
+
layer_scale=None,
|
469 |
+
with_cp=False,
|
470 |
+
dw_kernel_size=None, # for InternImage-H/G
|
471 |
+
post_norm_block_ids=None, # for InternImage-H/G
|
472 |
+
res_post_norm=False, # for InternImage-H/G
|
473 |
+
center_feature_scale=False): # for InternImage-H/G
|
474 |
+
super().__init__()
|
475 |
+
self.channels = channels
|
476 |
+
self.depth = depth
|
477 |
+
self.post_norm = post_norm
|
478 |
+
self.center_feature_scale = center_feature_scale
|
479 |
+
|
480 |
+
self.blocks = nn.ModuleList([
|
481 |
+
InternImageLayer(
|
482 |
+
core_op=core_op,
|
483 |
+
channels=channels,
|
484 |
+
groups=groups,
|
485 |
+
mlp_ratio=mlp_ratio,
|
486 |
+
drop=drop,
|
487 |
+
drop_path=drop_path[i] if isinstance(
|
488 |
+
drop_path, list) else drop_path,
|
489 |
+
act_layer=act_layer,
|
490 |
+
norm_layer=norm_layer,
|
491 |
+
post_norm=post_norm,
|
492 |
+
layer_scale=layer_scale,
|
493 |
+
offset_scale=offset_scale,
|
494 |
+
with_cp=with_cp,
|
495 |
+
dw_kernel_size=dw_kernel_size, # for InternImage-H/G
|
496 |
+
res_post_norm=res_post_norm, # for InternImage-H/G
|
497 |
+
center_feature_scale=center_feature_scale # for InternImage-H/G
|
498 |
+
) for i in range(depth)
|
499 |
+
])
|
500 |
+
if not self.post_norm or center_feature_scale:
|
501 |
+
self.norm = build_norm_layer(channels, 'LN')
|
502 |
+
self.post_norm_block_ids = post_norm_block_ids
|
503 |
+
if post_norm_block_ids is not None: # for InternImage-H/G
|
504 |
+
self.post_norms = nn.ModuleList(
|
505 |
+
[build_norm_layer(channels, 'LN', eps=1e-6) for _ in post_norm_block_ids]
|
506 |
+
)
|
507 |
+
self.downsample = DownsampleLayer(
|
508 |
+
channels=channels, norm_layer=norm_layer) if downsample else None
|
509 |
+
|
510 |
+
def forward(self, x, return_wo_downsample=False):
|
511 |
+
for i, blk in enumerate(self.blocks):
|
512 |
+
x = blk(x)
|
513 |
+
if (self.post_norm_block_ids is not None) and (i in self.post_norm_block_ids):
|
514 |
+
index = self.post_norm_block_ids.index(i)
|
515 |
+
x = self.post_norms[index](x) # for InternImage-H/G
|
516 |
+
if not self.post_norm or self.center_feature_scale:
|
517 |
+
x = self.norm(x)
|
518 |
+
if return_wo_downsample:
|
519 |
+
x_ = x
|
520 |
+
if self.downsample is not None:
|
521 |
+
x = self.downsample(x)
|
522 |
+
|
523 |
+
if return_wo_downsample:
|
524 |
+
return x, x_
|
525 |
+
return x
|
526 |
+
|
class InternImage(Backbone):
    r"""InternImage
        A PyTorch impl of : `InternImage: Exploring Large-Scale Vision Foundation Models with Deformable Convolutions` -
          https://arxiv.org/abs/2211.05778
    Args:
        core_op (str): Core operator. Default: 'DCNv3'
        channels (int): Number of channels in the first stage. Default: 64
        depths (list): Depth of each block. Default: [3, 4, 18, 5]
        groups (list): Groups of each block. Default: [3, 6, 12, 24]
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
        drop_rate (float): Probability of an element to be zeroed. Default: 0.
        drop_path_rate (float): Stochastic depth rate. Default: 0.
        act_layer (str): Activation layer. Default: 'GELU'
        norm_layer (str): Normalization layer. Default: 'LN'
        layer_scale (bool): Whether to use layer scale. Default: False
        cls_scale (bool): Whether to use class scale. Default: False
        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
            memory while slowing down the training speed. Default: False
        dw_kernel_size (int): Size of the dwconv. Default: None
        level2_post_norm (bool): Whether to use level2 post norm. Default: False
        level2_post_norm_block_ids (list): Indexes of post norm blocks. Default: None
        res_post_norm (bool): Whether to use res post norm. Default: False
        center_feature_scale (bool): Whether to use center feature scale. Default: False
    """

    def __init__(self,
                 core_op='DCNv3',
                 channels=64,
                 depths=[3, 4, 18, 5],
                 groups=[3, 6, 12, 24],
                 mlp_ratio=4.,
                 drop_rate=0.,
                 drop_path_rate=0.2,
                 drop_path_type='linear',
                 act_layer='GELU',
                 norm_layer='LN',
                 layer_scale=None,
                 offset_scale=1.0,
                 post_norm=False,
                 with_cp=False,
                 dw_kernel_size=None,  # for InternImage-H/G
                 level2_post_norm=False,  # for InternImage-H/G
                 level2_post_norm_block_ids=None,  # for InternImage-H/G
                 res_post_norm=False,  # for InternImage-H/G
                 center_feature_scale=False,  # for InternImage-H/G
                 out_indices=(0, 1, 2, 3),
                 init_cfg=None,
                 **kwargs):
        super().__init__()
        self.core_op = core_op
        self.num_levels = len(depths)
        self.depths = depths
        self.channels = channels
        self.num_features = int(channels * 2**(self.num_levels - 1))
        self.post_norm = post_norm
        self.mlp_ratio = mlp_ratio
        self.init_cfg = init_cfg
        self.out_indices = out_indices
        self.level2_post_norm_block_ids = level2_post_norm_block_ids
        logger = setup_logger(name="InternImage")
        logger.info(f'using core type: {core_op}')
        logger.info(f'using activation layer: {act_layer}')
        logger.info(f'using main norm layer: {norm_layer}')
        logger.info(f'using dpr: {drop_path_type}, {drop_path_rate}')
        logger.info(f"level2_post_norm: {level2_post_norm}")
        logger.info(f"level2_post_norm_block_ids: {level2_post_norm_block_ids}")
        logger.info(f"res_post_norm: {res_post_norm}")

        in_chans = 3
        self.patch_embed = StemLayer(in_chans=in_chans,
                                     out_chans=channels,
                                     act_layer=act_layer,
                                     norm_layer=norm_layer)
        self.pos_drop = nn.Dropout(p=drop_rate)

        dpr = [
            x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
        ]
        if drop_path_type == 'uniform':
            for i in range(len(dpr)):
                dpr[i] = drop_path_rate

        self.levels = nn.ModuleList()
        for i in range(self.num_levels):
            post_norm_block_ids = level2_post_norm_block_ids if level2_post_norm and (
                i == 2) else None  # for InternImage-H/G
            level = InternImageBlock(
                core_op=getattr(opsm, core_op),
                channels=int(channels * 2**i),
                depth=depths[i],
                groups=groups[i],
                mlp_ratio=self.mlp_ratio,
                drop=drop_rate,
                drop_path=dpr[sum(depths[:i]):sum(depths[:i + 1])],
                act_layer=act_layer,
                norm_layer=norm_layer,
                post_norm=post_norm,
                downsample=(i < self.num_levels - 1),
                layer_scale=layer_scale,
                offset_scale=offset_scale,
                with_cp=with_cp,
                dw_kernel_size=dw_kernel_size,  # for InternImage-H/G
                post_norm_block_ids=post_norm_block_ids,  # for InternImage-H/G
                res_post_norm=res_post_norm,  # for InternImage-H/G
                center_feature_scale=center_feature_scale  # for InternImage-H/G
            )
            self.levels.append(level)

        self.num_layers = len(depths)
        self.apply(self._init_weights)
        self.apply(self._init_deform_weights)

        # add basic info for d2 backbone
        self._out_features = ["res{}".format(i + 2) for i in self.out_indices]
        self._out_feature_channels = {
            "res{}".format(i + 2): self.channels * 2**i for i in self.out_indices
        }
        self._out_feature_strides = {"res{}".format(i + 2): 2 ** (i + 2) for i in self.out_indices}
        self._size_divisibility = 32

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def _init_deform_weights(self, m):
        if isinstance(m, getattr(opsm, self.core_op)):
            m._reset_parameters()

    def forward(self, x):
        x = self.patch_embed(x)
        x = self.pos_drop(x)

        # d2 needs dict output
        seq_out = {}
        for level_idx, level in enumerate(self.levels):
            x, x_ = level(x, return_wo_downsample=True)
            if level_idx in self.out_indices:
                seq_out["res{}".format(level_idx + 2)] = x_.permute(0, 3, 1, 2).contiguous()
        return seq_out


@BACKBONE_REGISTRY.register()
class D2InternImage(InternImage):
    def __init__(self, cfg, input_shape):

        super().__init__(
            core_op=cfg.MODEL.INTERNIMAGE.CORE_OP,
            channels=cfg.MODEL.INTERNIMAGE.CHANNELS,
            depths=cfg.MODEL.INTERNIMAGE.DEPTHS,
            groups=cfg.MODEL.INTERNIMAGE.GROUPS,
            mlp_ratio=cfg.MODEL.INTERNIMAGE.MLP_RATIO,
            drop_path_rate=cfg.MODEL.INTERNIMAGE.DROP_PATH_RATE,
            norm_layer=cfg.MODEL.INTERNIMAGE.NORM_LAYER,
            layer_scale=cfg.MODEL.INTERNIMAGE.LAYER_SCALE,
            offset_scale=cfg.MODEL.INTERNIMAGE.OFFSET_SCALE,
            post_norm=cfg.MODEL.INTERNIMAGE.POST_NORM,
            with_cp=cfg.MODEL.INTERNIMAGE.WITH_CP,
            out_indices=cfg.MODEL.INTERNIMAGE.OUT_IINDICES,
            dw_kernel_size=cfg.MODEL.INTERNIMAGE.DW_KERNEL_SIZE,  # for InternImage-H/G
            res_post_norm=cfg.MODEL.INTERNIMAGE.RES_POST_NORM,  # for InternImage-H/G
            level2_post_norm=cfg.MODEL.INTERNIMAGE.LEVEL2_POST_NORM,  # for InternImage-H/G
            level2_post_norm_block_ids=cfg.MODEL.INTERNIMAGE.LEVEL2_POST_NORM_BLOCK_IDS,  # for InternImage-H/G
            center_feature_scale=cfg.MODEL.INTERNIMAGE.CENTER_FEATURE_SCALE,  # for InternImage-H/G
        )

        pretrained_weight = cfg.MODEL.INTERNIMAGE.PRETRAINED_WEIGHT
        if pretrained_weight:
            checkpoint = torch.load(pretrained_weight, map_location='cpu')
            print(f'\nload pretrain weight from {pretrained_weight} \n')
            self.load_state_dict(checkpoint['model'], strict=False)

    def forward(self, x):
        """
        Args:
            x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.
        Returns:
            dict[str->Tensor]: names and the corresponding features
        """
        assert (
            x.dim() == 4
        ), f"D2InternImage takes an input of shape (N, C, H, W). Got {x.shape} instead!"
        outputs = {}
        y = super().forward(x)
        for k in y.keys():
            if k in self._out_features:
                outputs[k] = y[k]
        return outputs

    def output_shape(self):
        return {
            name: ShapeSpec(
                channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
            )
            for name in self._out_features
        }

    @property
    def size_divisibility(self):
        return 32
GLEE/glee/backbone/registry.py
ADDED
@@ -0,0 +1,14 @@
_model_entrypoints = {}


def register_backbone(fn):
    module_name_split = fn.__module__.split('.')
    model_name = module_name_split[-1]
    _model_entrypoints[model_name] = fn
    return fn

def model_entrypoints(model_name):
    return _model_entrypoints[model_name]

def is_model(model_name):
    return model_name in _model_entrypoints
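A short usage sketch for this registry (illustration only; the import path is hypothetical and depends on how the package is installed). Builders are keyed by the file name of the module that defines them, so the decorated function in resnet.py below is looked up as "resnet":

# Hypothetical driver code, assuming the glee backbone package is importable.
from glee.backbone import resnet            # noqa: F401  importing runs @register_backbone
from glee.backbone.registry import is_model, model_entrypoints

if is_model("resnet"):
    build_fn = model_entrypoints("resnet")  # -> get_resnet_backbone from resnet.py
    # backbone = build_fn(cfg)              # cfg is the nested dict shown after resnet.py
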
GLEE/glee/backbone/resnet.py
ADDED
@@ -0,0 +1,731 @@
# Copyright (c) Facebook, Inc. and its affiliates.
import pickle
import numpy as np
from typing import Any, Dict
import fvcore.nn.weight_init as weight_init
import torch
import torch.nn.functional as F
from torch import nn


from .backbone import Backbone
from .registry import register_backbone

from detectron2.layers import (
    CNNBlockBase,
    Conv2d,
    DeformConv,
    ModulatedDeformConv,
    ShapeSpec,
    get_norm,
)
from detectron2.utils.file_io import PathManager

__all__ = [
    "ResNetBlockBase",
    "BasicBlock",
    "BottleneckBlock",
    "DeformBottleneckBlock",
    "BasicStem",
    "ResNet",
    "make_stage",
    "get_resnet_backbone",
]


class BasicBlock(CNNBlockBase):
    """
    The basic residual block for ResNet-18 and ResNet-34 defined in :paper:`ResNet`,
    with two 3x3 conv layers and a projection shortcut if needed.
    """

    def __init__(self, in_channels, out_channels, *, stride=1, norm="BN"):
        """
        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            stride (int): Stride for the first conv.
            norm (str or callable): normalization for all conv layers.
                See :func:`layers.get_norm` for supported format.
        """
        super().__init__(in_channels, out_channels, stride)

        if in_channels != out_channels:
            self.shortcut = Conv2d(
                in_channels,
                out_channels,
                kernel_size=1,
                stride=stride,
                bias=False,
                norm=get_norm(norm, out_channels),
            )
        else:
            self.shortcut = None

        self.conv1 = Conv2d(
            in_channels,
            out_channels,
            kernel_size=3,
            stride=stride,
            padding=1,
            bias=False,
            norm=get_norm(norm, out_channels),
        )

        self.conv2 = Conv2d(
            out_channels,
            out_channels,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=False,
            norm=get_norm(norm, out_channels),
        )

        for layer in [self.conv1, self.conv2, self.shortcut]:
            if layer is not None:  # shortcut can be None
                weight_init.c2_msra_fill(layer)

    def forward(self, x):
        out = self.conv1(x)
        out = F.relu_(out)
        out = self.conv2(out)

        if self.shortcut is not None:
            shortcut = self.shortcut(x)
        else:
            shortcut = x

        out += shortcut
        out = F.relu_(out)
        return out


class BottleneckBlock(CNNBlockBase):
    """
    The standard bottleneck residual block used by ResNet-50, 101 and 152
    defined in :paper:`ResNet`. It contains 3 conv layers with kernels
    1x1, 3x3, 1x1, and a projection shortcut if needed.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        *,
        bottleneck_channels,
        stride=1,
        num_groups=1,
        norm="BN",
        stride_in_1x1=False,
        dilation=1,
    ):
        """
        Args:
            bottleneck_channels (int): number of output channels for the 3x3
                "bottleneck" conv layers.
            num_groups (int): number of groups for the 3x3 conv layer.
            norm (str or callable): normalization for all conv layers.
                See :func:`layers.get_norm` for supported format.
            stride_in_1x1 (bool): when stride>1, whether to put stride in the
                first 1x1 convolution or the bottleneck 3x3 convolution.
            dilation (int): the dilation rate of the 3x3 conv layer.
        """
        super().__init__(in_channels, out_channels, stride)

        if in_channels != out_channels:
            self.shortcut = Conv2d(
                in_channels,
                out_channels,
                kernel_size=1,
                stride=stride,
                bias=False,
                norm=get_norm(norm, out_channels),
            )
        else:
            self.shortcut = None

        # The original MSRA ResNet models have stride in the first 1x1 conv
        # The subsequent fb.torch.resnet and Caffe2 ResNe[X]t implementations have
        # stride in the 3x3 conv
        stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride)

        self.conv1 = Conv2d(
            in_channels,
            bottleneck_channels,
            kernel_size=1,
            stride=stride_1x1,
            bias=False,
            norm=get_norm(norm, bottleneck_channels),
        )

        self.conv2 = Conv2d(
            bottleneck_channels,
            bottleneck_channels,
            kernel_size=3,
            stride=stride_3x3,
            padding=1 * dilation,
            bias=False,
            groups=num_groups,
            dilation=dilation,
            norm=get_norm(norm, bottleneck_channels),
        )

        self.conv3 = Conv2d(
            bottleneck_channels,
            out_channels,
            kernel_size=1,
            bias=False,
            norm=get_norm(norm, out_channels),
        )

        for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]:
            if layer is not None:  # shortcut can be None
                weight_init.c2_msra_fill(layer)

        # Zero-initialize the last normalization in each residual branch,
        # so that at the beginning, the residual branch starts with zeros,
        # and each residual block behaves like an identity.
        # See Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour":
        # "For BN layers, the learnable scaling coefficient γ is initialized
        # to be 1, except for each residual block's last BN
        # where γ is initialized to be 0."

        # nn.init.constant_(self.conv3.norm.weight, 0)
        # TODO this somehow hurts performance when training GN models from scratch.
        # Add it as an option when we need to use this code to train a backbone.

    def forward(self, x):
        out = self.conv1(x)
        out = F.relu_(out)

        out = self.conv2(out)
        out = F.relu_(out)

        out = self.conv3(out)

        if self.shortcut is not None:
            shortcut = self.shortcut(x)
        else:
            shortcut = x

        out += shortcut
        out = F.relu_(out)
        return out

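A minimal shape-check sketch for the block above (illustration only, not part of the repository): a strided bottleneck block halves the spatial resolution and widens the channels, with the 1x1 projection shortcut absorbing the mismatch.

import torch

block = BottleneckBlock(64, 256, bottleneck_channels=64, stride=2, norm="BN").eval()
with torch.no_grad():
    y = block(torch.randn(1, 64, 56, 56))
print(tuple(y.shape))   # (1, 256, 28, 28): stride 2 in the 3x3 conv, 1x1 shortcut projects 64 -> 256
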
class DeformBottleneckBlock(CNNBlockBase):
    """
    Similar to :class:`BottleneckBlock`, but with :paper:`deformable conv <deformconv>`
    in the 3x3 convolution.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        *,
        bottleneck_channels,
        stride=1,
        num_groups=1,
        norm="BN",
        stride_in_1x1=False,
        dilation=1,
        deform_modulated=False,
        deform_num_groups=1,
    ):
        super().__init__(in_channels, out_channels, stride)
        self.deform_modulated = deform_modulated

        if in_channels != out_channels:
            self.shortcut = Conv2d(
                in_channels,
                out_channels,
                kernel_size=1,
                stride=stride,
                bias=False,
                norm=get_norm(norm, out_channels),
            )
        else:
            self.shortcut = None

        stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride)

        self.conv1 = Conv2d(
            in_channels,
            bottleneck_channels,
            kernel_size=1,
            stride=stride_1x1,
            bias=False,
            norm=get_norm(norm, bottleneck_channels),
        )

        if deform_modulated:
            deform_conv_op = ModulatedDeformConv
            # offset channels are 2 or 3 (if with modulated) * kernel_size * kernel_size
            offset_channels = 27
        else:
            deform_conv_op = DeformConv
            offset_channels = 18

        self.conv2_offset = Conv2d(
            bottleneck_channels,
            offset_channels * deform_num_groups,
            kernel_size=3,
            stride=stride_3x3,
            padding=1 * dilation,
            dilation=dilation,
        )
        self.conv2 = deform_conv_op(
            bottleneck_channels,
            bottleneck_channels,
            kernel_size=3,
            stride=stride_3x3,
            padding=1 * dilation,
            bias=False,
            groups=num_groups,
            dilation=dilation,
            deformable_groups=deform_num_groups,
            norm=get_norm(norm, bottleneck_channels),
        )

        self.conv3 = Conv2d(
            bottleneck_channels,
            out_channels,
            kernel_size=1,
            bias=False,
            norm=get_norm(norm, out_channels),
        )

        for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]:
            if layer is not None:  # shortcut can be None
                weight_init.c2_msra_fill(layer)

        nn.init.constant_(self.conv2_offset.weight, 0)
        nn.init.constant_(self.conv2_offset.bias, 0)

    def forward(self, x):
        out = self.conv1(x)
        out = F.relu_(out)

        if self.deform_modulated:
            offset_mask = self.conv2_offset(out)
            offset_x, offset_y, mask = torch.chunk(offset_mask, 3, dim=1)
            offset = torch.cat((offset_x, offset_y), dim=1)
            mask = mask.sigmoid()
            out = self.conv2(out, offset, mask)
        else:
            offset = self.conv2_offset(out)
            out = self.conv2(out, offset)
        out = F.relu_(out)

        out = self.conv3(out)

        if self.shortcut is not None:
            shortcut = self.shortcut(x)
        else:
            shortcut = x

        out += shortcut
        out = F.relu_(out)
        return out


class BasicStem(CNNBlockBase):
    """
    The standard ResNet stem (layers before the first residual block),
    with a conv, relu and max_pool.
    """

    def __init__(self, in_channels=3, out_channels=64, norm="BN"):
        """
        Args:
            norm (str or callable): norm after the first conv layer.
                See :func:`layers.get_norm` for supported format.
        """
        super().__init__(in_channels, out_channels, 4)
        self.in_channels = in_channels
        self.conv1 = Conv2d(
            in_channels,
            out_channels,
            kernel_size=7,
            stride=2,
            padding=3,
            bias=False,
            norm=get_norm(norm, out_channels),
        )
        weight_init.c2_msra_fill(self.conv1)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu_(x)
        x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1)
        return x


class ResNet(Backbone):
    """
    Implement :paper:`ResNet`.
    """

    def __init__(self, stem, stages, num_classes=None, out_features=None, freeze_at=0):
        """
        Args:
            stem (nn.Module): a stem module
            stages (list[list[CNNBlockBase]]): several (typically 4) stages,
                each contains multiple :class:`CNNBlockBase`.
            num_classes (None or int): if None, will not perform classification.
                Otherwise, will create a linear layer.
            out_features (list[str]): name of the layers whose outputs should
                be returned in forward. Can be anything in "stem", "linear", or "res2" ...
                If None, will return the output of the last layer.
            freeze_at (int): The number of stages at the beginning to freeze.
                see :meth:`freeze` for detailed explanation.
        """
        super().__init__()
        self.stem = stem
        self.num_classes = num_classes

        current_stride = self.stem.stride
        self._out_feature_strides = {"stem": current_stride}
        self._out_feature_channels = {"stem": self.stem.out_channels}

        self.stage_names, self.stages = [], []

        if out_features is not None:
            # Avoid keeping unused layers in this module. They consume extra memory
            # and may cause allreduce to fail
            num_stages = max(
                [{"res2": 1, "res3": 2, "res4": 3, "res5": 4}.get(f, 0) for f in out_features]
            )
            stages = stages[:num_stages]
        for i, blocks in enumerate(stages):
            assert len(blocks) > 0, len(blocks)
            for block in blocks:
                assert isinstance(block, CNNBlockBase), block

            name = "res" + str(i + 2)
            stage = nn.Sequential(*blocks)

            self.add_module(name, stage)
            self.stage_names.append(name)
            self.stages.append(stage)

            self._out_feature_strides[name] = current_stride = int(
                current_stride * np.prod([k.stride for k in blocks])
            )
            self._out_feature_channels[name] = curr_channels = blocks[-1].out_channels
        self.stage_names = tuple(self.stage_names)  # Make it static for scripting

        if num_classes is not None:
            self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
            self.linear = nn.Linear(curr_channels, num_classes)

            # Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour":
            # "The 1000-way fully-connected layer is initialized by
            # drawing weights from a zero-mean Gaussian with standard deviation of 0.01."
            nn.init.normal_(self.linear.weight, std=0.01)
            name = "linear"

        if out_features is None:
            out_features = [name]
        self._out_features = out_features
        assert len(self._out_features)
        children = [x[0] for x in self.named_children()]
        for out_feature in self._out_features:
            assert out_feature in children, "Available children: {}".format(", ".join(children))
        self.freeze(freeze_at)

    def forward(self, x):
        """
        Args:
            x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.

        Returns:
            dict[str->Tensor]: names and the corresponding features
        """
        assert x.dim() == 4, f"ResNet takes an input of shape (N, C, H, W). Got {x.shape} instead!"
        outputs = {}
        x = self.stem(x)
        if "stem" in self._out_features:
            outputs["stem"] = x
        for name, stage in zip(self.stage_names, self.stages):
            x = stage(x)
            if name in self._out_features:
                outputs[name] = x
        if self.num_classes is not None:
            x = self.avgpool(x)
            x = torch.flatten(x, 1)
            x = self.linear(x)
            if "linear" in self._out_features:
                outputs["linear"] = x
        return outputs

    def output_shape(self):
        return {
            name: ShapeSpec(
                channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
            )
            for name in self._out_features
        }

    def freeze(self, freeze_at=0):
        """
        Freeze the first several stages of the ResNet. Commonly used in
        fine-tuning.

        Layers that produce the same feature map spatial size are defined as one
        "stage" by :paper:`FPN`.

        Args:
            freeze_at (int): number of stages to freeze.
                `1` means freezing the stem. `2` means freezing the stem and
                one residual stage, etc.

        Returns:
            nn.Module: this ResNet itself
        """
        if freeze_at >= 1:
            self.stem.freeze()
        for idx, stage in enumerate(self.stages, start=2):
            if freeze_at >= idx:
                for block in stage.children():
                    block.freeze()
        return self

    @staticmethod
    def make_stage(block_class, num_blocks, *, in_channels, out_channels, **kwargs):
        """
        Create a list of blocks of the same type that forms one ResNet stage.

        Args:
            block_class (type): a subclass of CNNBlockBase that's used to create all blocks in this
                stage. A module of this type must not change spatial resolution of inputs unless its
                stride != 1.
            num_blocks (int): number of blocks in this stage
            in_channels (int): input channels of the entire stage.
            out_channels (int): output channels of **every block** in the stage.
            kwargs: other arguments passed to the constructor of
                `block_class`. If the argument name is "xx_per_block", the
                argument is a list of values to be passed to each block in the
                stage. Otherwise, the same argument is passed to every block
                in the stage.

        Returns:
            list[CNNBlockBase]: a list of block module.

        Examples:
        ::
            stage = ResNet.make_stage(
                BottleneckBlock, 3, in_channels=16, out_channels=64,
                bottleneck_channels=16, num_groups=1,
                stride_per_block=[2, 1, 1],
                dilations_per_block=[1, 1, 2]
            )

        Usually, layers that produce the same feature map spatial size are defined as one
        "stage" (in :paper:`FPN`). Under such definition, ``stride_per_block[1:]`` should
        all be 1.
        """
        blocks = []
        for i in range(num_blocks):
            curr_kwargs = {}
            for k, v in kwargs.items():
                if k.endswith("_per_block"):
                    assert len(v) == num_blocks, (
                        f"Argument '{k}' of make_stage should have the "
                        f"same length as num_blocks={num_blocks}."
                    )
                    newk = k[: -len("_per_block")]
                    assert newk not in kwargs, f"Cannot call make_stage with both {k} and {newk}!"
                    curr_kwargs[newk] = v[i]
                else:
                    curr_kwargs[k] = v

            blocks.append(
                block_class(in_channels=in_channels, out_channels=out_channels, **curr_kwargs)
            )
            in_channels = out_channels
        return blocks

    @staticmethod
    def make_default_stages(depth, block_class=None, **kwargs):
        """
        Create a list of ResNet stages from pre-defined depth (one of 18, 34, 50, 101, 152).
        If it doesn't create the ResNet variant you need, please use :meth:`make_stage`
        instead for fine-grained customization.

        Args:
            depth (int): depth of ResNet
            block_class (type): the CNN block class. Has to accept
                `bottleneck_channels` argument for depth > 50.
                By default it is BasicBlock or BottleneckBlock, based on the
                depth.
            kwargs:
                other arguments to pass to `make_stage`. Should not contain
                stride and channels, as they are predefined for each depth.

        Returns:
            list[list[CNNBlockBase]]: modules in all stages; see arguments of
                :class:`ResNet.__init__`.
        """
        num_blocks_per_stage = {
            18: [2, 2, 2, 2],
            34: [3, 4, 6, 3],
            50: [3, 4, 6, 3],
            101: [3, 4, 23, 3],
            152: [3, 8, 36, 3],
        }[depth]
        if block_class is None:
            block_class = BasicBlock if depth < 50 else BottleneckBlock
        if depth < 50:
            in_channels = [64, 64, 128, 256]
            out_channels = [64, 128, 256, 512]
        else:
            in_channels = [64, 256, 512, 1024]
            out_channels = [256, 512, 1024, 2048]
        ret = []
        for (n, s, i, o) in zip(num_blocks_per_stage, [1, 2, 2, 2], in_channels, out_channels):
            if depth >= 50:
                kwargs["bottleneck_channels"] = o // 4
            ret.append(
                ResNet.make_stage(
                    block_class=block_class,
                    num_blocks=n,
                    stride_per_block=[s] + [1] * (n - 1),
                    in_channels=i,
                    out_channels=o,
                    **kwargs,
                )
            )
        return ret


ResNetBlockBase = CNNBlockBase
"""
Alias for backward compatibility.
"""


def make_stage(*args, **kwargs):
    """
    Deprecated alias for backward compatibility.
    """
    return ResNet.make_stage(*args, **kwargs)


def _convert_ndarray_to_tensor(state_dict: Dict[str, Any]) -> None:
    """
    In-place convert all numpy arrays in the state_dict to torch tensor.
    Args:
        state_dict (dict): a state-dict to be loaded to the model.
            Will be modified.
    """
    # model could be an OrderedDict with _metadata attribute
    # (as returned by Pytorch's state_dict()). We should preserve these
    # properties.
    for k in list(state_dict.keys()):
        v = state_dict[k]
        if not isinstance(v, np.ndarray) and not isinstance(v, torch.Tensor):
            raise ValueError(
                "Unsupported type found in checkpoint! {}: {}".format(k, type(v))
            )
        if not isinstance(v, torch.Tensor):
            state_dict[k] = torch.from_numpy(v)


@register_backbone
def get_resnet_backbone(cfg):
    """
    Create a ResNet instance from config.

    Returns:
        ResNet: a :class:`ResNet` instance.
    """
    res_cfg = cfg['MODEL']['BACKBONE']['RESNETS']

    # need registration of new blocks/stems?
    norm = res_cfg['NORM']
    stem = BasicStem(
        in_channels=res_cfg['STEM_IN_CHANNELS'],
        out_channels=res_cfg['STEM_OUT_CHANNELS'],
        norm=norm,
    )

    # fmt: off
    freeze_at = res_cfg['FREEZE_AT']
    out_features = res_cfg['OUT_FEATURES']
    depth = res_cfg['DEPTH']
    num_groups = res_cfg['NUM_GROUPS']
    width_per_group = res_cfg['WIDTH_PER_GROUP']
    bottleneck_channels = num_groups * width_per_group
    in_channels = res_cfg['STEM_OUT_CHANNELS']
    out_channels = res_cfg['RES2_OUT_CHANNELS']
    stride_in_1x1 = res_cfg['STRIDE_IN_1X1']
    res5_dilation = res_cfg['RES5_DILATION']
    deform_on_per_stage = res_cfg['DEFORM_ON_PER_STAGE']
    deform_modulated = res_cfg['DEFORM_MODULATED']
    deform_num_groups = res_cfg['DEFORM_NUM_GROUPS']
    # fmt: on
    assert res5_dilation in {1, 2}, "res5_dilation cannot be {}.".format(res5_dilation)

    num_blocks_per_stage = {
        18: [2, 2, 2, 2],
        34: [3, 4, 6, 3],
        50: [3, 4, 6, 3],
        101: [3, 4, 23, 3],
        152: [3, 8, 36, 3],
    }[depth]

    if depth in [18, 34]:
        assert out_channels == 64, "Must set MODEL.RESNETS.RES2_OUT_CHANNELS = 64 for R18/R34"
        assert not any(
            deform_on_per_stage
        ), "MODEL.RESNETS.DEFORM_ON_PER_STAGE unsupported for R18/R34"
        assert res5_dilation == 1, "Must set MODEL.RESNETS.RES5_DILATION = 1 for R18/R34"
        assert num_groups == 1, "Must set MODEL.RESNETS.NUM_GROUPS = 1 for R18/R34"

    stages = []

    for idx, stage_idx in enumerate(range(2, 6)):
        # res5_dilation is used this way as a convention in R-FCN & Deformable Conv paper
        dilation = res5_dilation if stage_idx == 5 else 1
        first_stride = 1 if idx == 0 or (stage_idx == 5 and dilation == 2) else 2
        stage_kargs = {
            "num_blocks": num_blocks_per_stage[idx],
            "stride_per_block": [first_stride] + [1] * (num_blocks_per_stage[idx] - 1),
            "in_channels": in_channels,
            "out_channels": out_channels,
            "norm": norm,
        }
        # Use BasicBlock for R18 and R34.
        if depth in [18, 34]:
            stage_kargs["block_class"] = BasicBlock
        else:
            stage_kargs["bottleneck_channels"] = bottleneck_channels
            stage_kargs["stride_in_1x1"] = stride_in_1x1
            stage_kargs["dilation"] = dilation
            stage_kargs["num_groups"] = num_groups
            if deform_on_per_stage[idx]:
                stage_kargs["block_class"] = DeformBottleneckBlock
                stage_kargs["deform_modulated"] = deform_modulated
                stage_kargs["deform_num_groups"] = deform_num_groups
            else:
                stage_kargs["block_class"] = BottleneckBlock
        blocks = ResNet.make_stage(**stage_kargs)
        in_channels = out_channels
        out_channels *= 2
        bottleneck_channels *= 2
        stages.append(blocks)
    backbone = ResNet(stem, stages, out_features=out_features, freeze_at=freeze_at)

    if cfg['MODEL']['BACKBONE']['LOAD_PRETRAINED'] is True:
        filename = cfg['MODEL']['BACKBONE']['PRETRAINED']
        with PathManager.open(filename, "rb") as f:
            ckpt = pickle.load(f, encoding="latin1")['model']
        _convert_ndarray_to_tensor(ckpt)
        ckpt.pop('stem.fc.weight')
        ckpt.pop('stem.fc.bias')
        backbone.load_state_dict(ckpt)

    return backbone
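A configuration sketch for the builder above (illustration only, not part of the repository; the values simply mirror a plain R50 without pretrained weights). Note that get_resnet_backbone reads a nested dict rather than a detectron2 CfgNode:

cfg = {"MODEL": {"BACKBONE": {
    "LOAD_PRETRAINED": False,
    "PRETRAINED": "",
    "RESNETS": {
        "NORM": "FrozenBN", "STEM_IN_CHANNELS": 3, "STEM_OUT_CHANNELS": 64,
        "FREEZE_AT": 0, "OUT_FEATURES": ["res2", "res3", "res4", "res5"],
        "DEPTH": 50, "NUM_GROUPS": 1, "WIDTH_PER_GROUP": 64,
        "RES2_OUT_CHANNELS": 256, "STRIDE_IN_1X1": False, "RES5_DILATION": 1,
        "DEFORM_ON_PER_STAGE": [False, False, False, False],
        "DEFORM_MODULATED": False, "DEFORM_NUM_GROUPS": 1,
    },
}}}
backbone = get_resnet_backbone(cfg)
print({k: (v.channels, v.stride) for k, v in backbone.output_shape().items()})
# {'res2': (256, 4), 'res3': (512, 8), 'res4': (1024, 16), 'res5': (2048, 32)}
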
GLEE/glee/backbone/swin.py
ADDED
@@ -0,0 +1,783 @@
# --------------------------------------------------------
# Swin Transformer
# Copyright (c) 2021 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ze Liu, Yutong Lin, Yixuan Wei
# --------------------------------------------------------


import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from timm.models.layers import DropPath, to_2tuple, trunc_normal_

from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec
from .registry import register_backbone


class Mlp(nn.Module):
    """Multilayer perceptron."""

    def __init__(
        self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0
    ):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


def window_partition(x, window_size):
    """
    Args:
        x: (B, H, W, C)
        window_size (int): window size
    Returns:
        windows: (num_windows*B, window_size, window_size, C)
    """
    B, H, W, C = x.shape
    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
    return windows


def window_reverse(windows, window_size, H, W):
    """
    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image
        W (int): Width of image
    Returns:
        x: (B, H, W, C)
    """
    B = int(windows.shape[0] / (H * W / window_size / window_size))
    x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
    return x

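A minimal round-trip sketch for the two helpers above (illustration only, not part of the repository): when H and W are multiples of window_size, window_partition and window_reverse are exact inverses, since both are pure reshape/permute operations.

import torch

B, H, W, C, ws = 2, 8, 8, 3, 4
x = torch.randn(B, H, W, C)
windows = window_partition(x, ws)                      # (B * H/ws * W/ws, ws, ws, C)
assert windows.shape == (B * (H // ws) * (W // ws), ws, ws, C)
assert torch.equal(window_reverse(windows, ws, H, W), x)
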
class WindowAttention(nn.Module):
    """Window based multi-head self attention (W-MSA) module with relative position bias.
    It supports both of shifted and non-shifted window.
    Args:
        dim (int): Number of input channels.
        window_size (tuple[int]): The height and width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
    """

    def __init__(
        self,
        dim,
        window_size,
        num_heads,
        qkv_bias=True,
        qk_scale=None,
        attn_drop=0.0,
        proj_drop=0.0,
    ):

        super().__init__()
        self.dim = dim
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5

        # define a parameter table of relative position bias
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)
        )  # 2*Wh-1 * 2*Ww-1, nH

        # get pair-wise relative position index for each token inside the window
        coords_h = torch.arange(self.window_size[0])
        coords_w = torch.arange(self.window_size[1])
        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += self.window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        self.register_buffer("relative_position_index", relative_position_index)

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        trunc_normal_(self.relative_position_bias_table, std=0.02)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, mask=None):
        """Forward function.
        Args:
            x: input features with shape of (num_windows*B, N, C)
            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
        """
        B_, N, C = x.shape
        qkv = (
            self.qkv(x)
            .reshape(B_, N, 3, self.num_heads, C // self.num_heads)
            .permute(2, 0, 3, 1, 4)
        )
        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)

        q = q * self.scale
        attn = q @ k.transpose(-2, -1)

        relative_position_bias = self.relative_position_bias_table[
            self.relative_position_index.view(-1)
        ].view(
            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1
        )  # Wh*Ww,Wh*Ww,nH
        relative_position_bias = relative_position_bias.permute(
            2, 0, 1
        ).contiguous()  # nH, Wh*Ww, Wh*Ww
        attn = attn + relative_position_bias.unsqueeze(0)

        if mask is not None:
            nW = mask.shape[0]
            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
            attn = attn.view(-1, self.num_heads, N, N)
            attn = self.softmax(attn)
        else:
            attn = self.softmax(attn)

        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class SwinTransformerBlock(nn.Module):
    """Swin Transformer Block.
    Args:
        dim (int): Number of input channels.
        num_heads (int): Number of attention heads.
        window_size (int): Window size.
        shift_size (int): Shift size for SW-MSA.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
    """

    def __init__(
        self,
        dim,
        num_heads,
        window_size=7,
        shift_size=0,
        mlp_ratio=4.0,
        qkv_bias=True,
        qk_scale=None,
        drop=0.0,
        attn_drop=0.0,
        drop_path=0.0,
        act_layer=nn.GELU,
        norm_layer=nn.LayerNorm,
    ):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.window_size = window_size
        self.shift_size = shift_size
        self.mlp_ratio = mlp_ratio
        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"

        self.norm1 = norm_layer(dim)
        self.attn = WindowAttention(
            dim,
            window_size=to_2tuple(self.window_size),
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
        )

        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(
            in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop
        )

        self.H = None
        self.W = None

    def forward(self, x, mask_matrix):
        """Forward function.
        Args:
            x: Input feature, tensor size (B, H*W, C).
            H, W: Spatial resolution of the input feature.
            mask_matrix: Attention mask for cyclic shift.
        """
        B, L, C = x.shape
        H, W = self.H, self.W
        assert L == H * W, "input feature has wrong size"

        shortcut = x
        x = self.norm1(x)
        x = x.view(B, H, W, C)

        # pad feature maps to multiples of window size
        pad_l = pad_t = 0
        pad_r = (self.window_size - W % self.window_size) % self.window_size
        pad_b = (self.window_size - H % self.window_size) % self.window_size
        x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
        _, Hp, Wp, _ = x.shape

        # cyclic shift
        if self.shift_size > 0:
            shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
            attn_mask = mask_matrix
        else:
            shifted_x = x
            attn_mask = None

        # partition windows
        x_windows = window_partition(
            shifted_x, self.window_size
        )  # nW*B, window_size, window_size, C
        x_windows = x_windows.view(
            -1, self.window_size * self.window_size, C
        )  # nW*B, window_size*window_size, C

        # W-MSA/SW-MSA
        attn_windows = self.attn(x_windows, mask=attn_mask)  # nW*B, window_size*window_size, C

        # merge windows
        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
        shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp)  # B H' W' C

        # reverse cyclic shift
        if self.shift_size > 0:
            x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
        else:
            x = shifted_x

        if pad_r > 0 or pad_b > 0:
            x = x[:, :H, :W, :].contiguous()

        x = x.view(B, H * W, C)

        # FFN
        x = shortcut + self.drop_path(x)
        x = x + self.drop_path(self.mlp(self.norm2(x)))

        return x

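A small sketch of the cyclic shift used in the block above (illustration only, not part of the repository): rolling by -shift_size before windowing and by +shift_size afterwards is lossless, which is why the shifted-window path can be undone exactly.

import torch

x = torch.arange(16.).view(1, 4, 4, 1)                 # B, H, W, C
s = 2
shifted = torch.roll(x, shifts=(-s, -s), dims=(1, 2))  # as in the SW-MSA branch
restored = torch.roll(shifted, shifts=(s, s), dims=(1, 2))
assert torch.equal(restored, x)
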
297 |
+
class PatchMerging(nn.Module):
|
298 |
+
"""Patch Merging Layer
|
299 |
+
Args:
|
300 |
+
dim (int): Number of input channels.
|
301 |
+
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
|
302 |
+
"""
|
303 |
+
|
304 |
+
def __init__(self, dim, norm_layer=nn.LayerNorm):
|
305 |
+
super().__init__()
|
306 |
+
self.dim = dim
|
307 |
+
self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
|
308 |
+
self.norm = norm_layer(4 * dim)
|
309 |
+
|
310 |
+
def forward(self, x, H, W):
|
311 |
+
"""Forward function.
|
312 |
+
Args:
|
313 |
+
x: Input feature, tensor size (B, H*W, C).
|
314 |
+
H, W: Spatial resolution of the input feature.
|
315 |
+
"""
|
316 |
+
B, L, C = x.shape
|
317 |
+
assert L == H * W, "input feature has wrong size"
|
318 |
+
|
319 |
+
x = x.view(B, H, W, C)
|
320 |
+
|
321 |
+
# padding
|
322 |
+
pad_input = (H % 2 == 1) or (W % 2 == 1)
|
323 |
+
if pad_input:
|
324 |
+
x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))
|
325 |
+
|
326 |
+
x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C
|
327 |
+
x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C
|
328 |
+
x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C
|
329 |
+
x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C
|
330 |
+
x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C
|
331 |
+
x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C
|
332 |
+
|
333 |
+
x = self.norm(x)
|
334 |
+
x = self.reduction(x)
|
335 |
+
|
336 |
+
return x
|
337 |
+
|
338 |
+
|
339 |
+
class BasicLayer(nn.Module):
|
340 |
+
"""A basic Swin Transformer layer for one stage.
|
341 |
+
Args:
|
342 |
+
dim (int): Number of feature channels
|
343 |
+
depth (int): Depths of this stage.
|
344 |
+
num_heads (int): Number of attention head.
|
345 |
+
window_size (int): Local window size. Default: 7.
|
346 |
+
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
|
347 |
+
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
|
348 |
+
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
|
349 |
+
drop (float, optional): Dropout rate. Default: 0.0
|
350 |
+
attn_drop (float, optional): Attention dropout rate. Default: 0.0
|
351 |
+
drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
|
352 |
+
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
|
353 |
+
downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
|
354 |
+
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
|
355 |
+
"""
|
356 |
+
|
357 |
+
def __init__(
|
358 |
+
self,
|
359 |
+
dim,
|
360 |
+
depth,
|
361 |
+
num_heads,
|
362 |
+
window_size=7,
|
363 |
+
mlp_ratio=4.0,
|
364 |
+
qkv_bias=True,
|
365 |
+
qk_scale=None,
|
366 |
+
drop=0.0,
|
367 |
+
attn_drop=0.0,
|
368 |
+
drop_path=0.0,
|
369 |
+
norm_layer=nn.LayerNorm,
|
370 |
+
downsample=None,
|
371 |
+
use_checkpoint=False,
|
372 |
+
):
|
373 |
+
super().__init__()
|
374 |
+
self.window_size = window_size
|
375 |
+
self.shift_size = window_size // 2
|
376 |
+
self.depth = depth
|
377 |
+
self.use_checkpoint = use_checkpoint
|
378 |
+
|
379 |
+
# build blocks
|
380 |
+
self.blocks = nn.ModuleList(
|
381 |
+
[
|
382 |
+
SwinTransformerBlock(
|
383 |
+
dim=dim,
|
384 |
+
num_heads=num_heads,
|
385 |
+
window_size=window_size,
|
386 |
+
shift_size=0 if (i % 2 == 0) else window_size // 2,
|
387 |
+
mlp_ratio=mlp_ratio,
|
388 |
+
qkv_bias=qkv_bias,
|
389 |
+
qk_scale=qk_scale,
|
390 |
+
drop=drop,
|
391 |
+
attn_drop=attn_drop,
|
392 |
+
drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
|
393 |
+
norm_layer=norm_layer,
|
394 |
+
)
|
395 |
+
for i in range(depth)
|
396 |
+
]
|
397 |
+
)
|
398 |
+
|
399 |
+
# patch merging layer
|
400 |
+
if downsample is not None:
|
401 |
+
self.downsample = downsample(dim=dim, norm_layer=norm_layer)
|
402 |
+
else:
|
403 |
+
self.downsample = None
|
404 |
+
|
405 |
+
def forward(self, x, H, W):
|
406 |
+
"""Forward function.
|
407 |
+
Args:
|
408 |
+
x: Input feature, tensor size (B, H*W, C).
|
409 |
+
H, W: Spatial resolution of the input feature.
|
410 |
+
"""
|
411 |
+
|
412 |
+
# calculate attention mask for SW-MSA
|
413 |
+
Hp = int(np.ceil(H / self.window_size)) * self.window_size
|
414 |
+
Wp = int(np.ceil(W / self.window_size)) * self.window_size
|
415 |
+
img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1
|
416 |
+
h_slices = (
|
417 |
+
slice(0, -self.window_size),
|
418 |
+
slice(-self.window_size, -self.shift_size),
|
419 |
+
slice(-self.shift_size, None),
|
420 |
+
)
|
421 |
+
w_slices = (
|
422 |
+
slice(0, -self.window_size),
|
423 |
+
slice(-self.window_size, -self.shift_size),
|
424 |
+
slice(-self.shift_size, None),
|
425 |
+
)
|
426 |
+
cnt = 0
|
427 |
+
for h in h_slices:
|
428 |
+
for w in w_slices:
|
429 |
+
img_mask[:, h, w, :] = cnt
|
430 |
+
cnt += 1
|
431 |
+
|
432 |
+
mask_windows = window_partition(
|
433 |
+
img_mask, self.window_size
|
434 |
+
) # nW, window_size, window_size, 1
|
435 |
+
mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
|
436 |
+
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
|
437 |
+
attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(
|
438 |
+
attn_mask == 0, float(0.0)
|
439 |
+
)
|
440 |
+
|
441 |
+
for blk in self.blocks:
|
442 |
+
blk.H, blk.W = H, W
|
443 |
+
if self.use_checkpoint:
|
444 |
+
x = checkpoint.checkpoint(blk, x, attn_mask)
|
445 |
+
else:
|
446 |
+
x = blk(x, attn_mask)
|
447 |
+
if self.downsample is not None:
|
448 |
+
x_down = self.downsample(x, H, W)
|
449 |
+
Wh, Ww = (H + 1) // 2, (W + 1) // 2
|
450 |
+
return x, H, W, x_down, Wh, Ww
|
451 |
+
else:
|
452 |
+
return x, H, W, x, H, W
|
453 |
+
|
454 |
+
|
455 |
+
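For reference, the attention mask assembled in BasicLayer.forward above can be reproduced standalone: windows that mix pixels from different regions of the cyclically shifted image get a -100 bias so that cross-region pairs are suppressed after the softmax. A self-contained toy example with an 8x8 map, window_size=4 and shift_size=2:

import torch

window_size, shift_size = 4, 2
Hp = Wp = 8
img_mask = torch.zeros(1, Hp, Wp, 1)
slices = (slice(0, -window_size), slice(-window_size, -shift_size), slice(-shift_size, None))
cnt = 0
for h in slices:
    for w in slices:
        img_mask[:, h, w, :] = cnt
        cnt += 1

# same reshape as window_partition: (num_windows, window_size * window_size)
mask_windows = (
    img_mask.view(1, Hp // window_size, window_size, Wp // window_size, window_size, 1)
    .permute(0, 1, 3, 2, 4, 5)
    .reshape(-1, window_size * window_size)
)
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
attn_mask = attn_mask.masked_fill(attn_mask != 0, -100.0).masked_fill(attn_mask == 0, 0.0)
print(attn_mask.shape)             # torch.Size([4, 16, 16])
print((attn_mask == -100).any())   # tensor(True): only windows that straddle shifted regions are masked
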
class PatchEmbed(nn.Module):
|
456 |
+
"""Image to Patch Embedding
|
457 |
+
Args:
|
458 |
+
patch_size (int): Patch token size. Default: 4.
|
459 |
+
in_chans (int): Number of input image channels. Default: 3.
|
460 |
+
embed_dim (int): Number of linear projection output channels. Default: 96.
|
461 |
+
norm_layer (nn.Module, optional): Normalization layer. Default: None
|
462 |
+
"""
|
463 |
+
|
464 |
+
def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
|
465 |
+
super().__init__()
|
466 |
+
patch_size = to_2tuple(patch_size)
|
467 |
+
self.patch_size = patch_size
|
468 |
+
|
469 |
+
self.in_chans = in_chans
|
470 |
+
self.embed_dim = embed_dim
|
471 |
+
|
472 |
+
self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
|
473 |
+
if norm_layer is not None:
|
474 |
+
self.norm = norm_layer(embed_dim)
|
475 |
+
else:
|
476 |
+
self.norm = None
|
477 |
+
|
478 |
+
def forward(self, x):
|
479 |
+
"""Forward function."""
|
480 |
+
# padding
|
481 |
+
_, _, H, W = x.size()
|
482 |
+
if W % self.patch_size[1] != 0:
|
483 |
+
x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1]))
|
484 |
+
if H % self.patch_size[0] != 0:
|
485 |
+
x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0]))
|
486 |
+
|
487 |
+
x = self.proj(x) # B C Wh Ww
|
488 |
+
if self.norm is not None:
|
489 |
+
Wh, Ww = x.size(2), x.size(3)
|
490 |
+
x = x.flatten(2).transpose(1, 2)
|
491 |
+
x = self.norm(x)
|
492 |
+
x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww)
|
493 |
+
|
494 |
+
return x
|
495 |
+
|
496 |
+
|
497 |
+
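A quick sanity check of the patch-embedding stem above (the import path is an assumption): inputs whose height or width is not a multiple of patch_size are padded on the right/bottom before the strided convolution, so the token grid covers ceil(H/4) by ceil(W/4).

import torch
from glee.backbone.swin import PatchEmbed  # assumed import path

stem = PatchEmbed(patch_size=4, in_chans=3, embed_dim=96)
x = torch.randn(1, 3, 230, 317)    # neither dimension is divisible by 4
print(stem(x).shape)               # torch.Size([1, 96, 58, 80])
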
class SwinTransformer(nn.Module):
|
498 |
+
"""Swin Transformer backbone.
|
499 |
+
A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` -
|
500 |
+
https://arxiv.org/pdf/2103.14030
|
501 |
+
Args:
|
502 |
+
pretrain_img_size (int): Input image size for training the pretrained model,
|
503 |
+
used in absolute position embedding. Default 224.
|
504 |
+
patch_size (int | tuple(int)): Patch size. Default: 4.
|
505 |
+
in_chans (int): Number of input image channels. Default: 3.
|
506 |
+
embed_dim (int): Number of linear projection output channels. Default: 96.
|
507 |
+
depths (tuple[int]): Depths of each Swin Transformer stage.
|
508 |
+
num_heads (tuple[int]): Number of attention head of each stage.
|
509 |
+
window_size (int): Window size. Default: 7.
|
510 |
+
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
|
511 |
+
qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
|
512 |
+
qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
|
513 |
+
drop_rate (float): Dropout rate.
|
514 |
+
attn_drop_rate (float): Attention dropout rate. Default: 0.
|
515 |
+
drop_path_rate (float): Stochastic depth rate. Default: 0.2.
|
516 |
+
norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
|
517 |
+
ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
|
518 |
+
patch_norm (bool): If True, add normalization after patch embedding. Default: True.
|
519 |
+
out_indices (Sequence[int]): Output from which stages.
|
520 |
+
frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
|
521 |
+
-1 means not freezing any parameters.
|
522 |
+
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
|
523 |
+
"""
|
524 |
+
|
525 |
+
def __init__(
|
526 |
+
self,
|
527 |
+
pretrain_img_size=224,
|
528 |
+
patch_size=4,
|
529 |
+
in_chans=3,
|
530 |
+
embed_dim=96,
|
531 |
+
depths=[2, 2, 6, 2],
|
532 |
+
num_heads=[3, 6, 12, 24],
|
533 |
+
window_size=7,
|
534 |
+
mlp_ratio=4.0,
|
535 |
+
qkv_bias=True,
|
536 |
+
qk_scale=None,
|
537 |
+
drop_rate=0.0,
|
538 |
+
attn_drop_rate=0.0,
|
539 |
+
drop_path_rate=0.2,
|
540 |
+
norm_layer=nn.LayerNorm,
|
541 |
+
ape=False,
|
542 |
+
patch_norm=True,
|
543 |
+
out_indices=(0, 1, 2, 3),
|
544 |
+
frozen_stages=-1,
|
545 |
+
use_checkpoint=False,
|
546 |
+
):
|
547 |
+
super().__init__()
|
548 |
+
|
549 |
+
self.pretrain_img_size = pretrain_img_size
|
550 |
+
self.num_layers = len(depths)
|
551 |
+
self.embed_dim = embed_dim
|
552 |
+
self.ape = ape
|
553 |
+
self.patch_norm = patch_norm
|
554 |
+
self.out_indices = out_indices
|
555 |
+
self.frozen_stages = frozen_stages
|
556 |
+
|
557 |
+
# split image into non-overlapping patches
|
558 |
+
self.patch_embed = PatchEmbed(
|
559 |
+
patch_size=patch_size,
|
560 |
+
in_chans=in_chans,
|
561 |
+
embed_dim=embed_dim,
|
562 |
+
norm_layer=norm_layer if self.patch_norm else None,
|
563 |
+
)
|
564 |
+
|
565 |
+
# absolute position embedding
|
566 |
+
if self.ape:
|
567 |
+
pretrain_img_size = to_2tuple(pretrain_img_size)
|
568 |
+
patch_size = to_2tuple(patch_size)
|
569 |
+
patches_resolution = [
|
570 |
+
pretrain_img_size[0] // patch_size[0],
|
571 |
+
pretrain_img_size[1] // patch_size[1],
|
572 |
+
]
|
573 |
+
|
574 |
+
self.absolute_pos_embed = nn.Parameter(
|
575 |
+
torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1])
|
576 |
+
)
|
577 |
+
trunc_normal_(self.absolute_pos_embed, std=0.02)
|
578 |
+
|
579 |
+
self.pos_drop = nn.Dropout(p=drop_rate)
|
580 |
+
|
581 |
+
# stochastic depth
|
582 |
+
dpr = [
|
583 |
+
x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
|
584 |
+
] # stochastic depth decay rule
|
585 |
+
|
586 |
+
# build layers
|
587 |
+
self.layers = nn.ModuleList()
|
588 |
+
for i_layer in range(self.num_layers):
|
589 |
+
layer = BasicLayer(
|
590 |
+
dim=int(embed_dim * 2 ** i_layer),
|
591 |
+
depth=depths[i_layer],
|
592 |
+
num_heads=num_heads[i_layer],
|
593 |
+
window_size=window_size,
|
594 |
+
mlp_ratio=mlp_ratio,
|
595 |
+
qkv_bias=qkv_bias,
|
596 |
+
qk_scale=qk_scale,
|
597 |
+
drop=drop_rate,
|
598 |
+
attn_drop=attn_drop_rate,
|
599 |
+
drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])],
|
600 |
+
norm_layer=norm_layer,
|
601 |
+
downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
|
602 |
+
use_checkpoint=use_checkpoint,
|
603 |
+
)
|
604 |
+
self.layers.append(layer)
|
605 |
+
|
606 |
+
num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)]
|
607 |
+
self.num_features = num_features
|
608 |
+
|
609 |
+
# add a norm layer for each output
|
610 |
+
for i_layer in out_indices:
|
611 |
+
layer = norm_layer(num_features[i_layer])
|
612 |
+
layer_name = f"norm{i_layer}"
|
613 |
+
self.add_module(layer_name, layer)
|
614 |
+
|
615 |
+
self._freeze_stages()
|
616 |
+
|
617 |
+
def _freeze_stages(self):
|
618 |
+
if self.frozen_stages >= 0:
|
619 |
+
self.patch_embed.eval()
|
620 |
+
for param in self.patch_embed.parameters():
|
621 |
+
param.requires_grad = False
|
622 |
+
|
623 |
+
if self.frozen_stages >= 1 and self.ape:
|
624 |
+
self.absolute_pos_embed.requires_grad = False
|
625 |
+
|
626 |
+
if self.frozen_stages >= 2:
|
627 |
+
self.pos_drop.eval()
|
628 |
+
for i in range(0, self.frozen_stages - 1):
|
629 |
+
m = self.layers[i]
|
630 |
+
m.eval()
|
631 |
+
for param in m.parameters():
|
632 |
+
param.requires_grad = False
|
633 |
+
|
634 |
+
def init_weights(self, pretrained=None):
|
635 |
+
"""Initialize the weights in backbone.
|
636 |
+
Args:
|
637 |
+
pretrained (str, optional): Path to pre-trained weights.
|
638 |
+
Defaults to None.
|
639 |
+
"""
|
640 |
+
|
641 |
+
def _init_weights(m):
|
642 |
+
if isinstance(m, nn.Linear):
|
643 |
+
trunc_normal_(m.weight, std=0.02)
|
644 |
+
if isinstance(m, nn.Linear) and m.bias is not None:
|
645 |
+
nn.init.constant_(m.bias, 0)
|
646 |
+
elif isinstance(m, nn.LayerNorm):
|
647 |
+
nn.init.constant_(m.bias, 0)
|
648 |
+
nn.init.constant_(m.weight, 1.0)
|
649 |
+
|
650 |
+
if isinstance(pretrained, str):
|
651 |
+
self.apply(_init_weights)
|
652 |
+
checkpoint = torch.load(pretrained, map_location='cpu')
|
653 |
+
print(f'\nload pretrain weight from {pretrained} \n')
|
654 |
+
self.load_state_dict(checkpoint['model'], strict=False)
|
655 |
+
elif pretrained is None:
|
656 |
+
self.apply(_init_weights)
|
657 |
+
else:
|
658 |
+
raise TypeError('pretrained must be a str or None')
|
659 |
+
|
660 |
+
def forward(self, x):
|
661 |
+
"""Forward function."""
|
662 |
+
x = self.patch_embed(x)
|
663 |
+
|
664 |
+
Wh, Ww = x.size(2), x.size(3)
|
665 |
+
if self.ape:
|
666 |
+
# interpolate the position embedding to the corresponding size
|
667 |
+
absolute_pos_embed = F.interpolate(
|
668 |
+
self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic"
|
669 |
+
)
|
670 |
+
x = (x + absolute_pos_embed).flatten(2).transpose(1, 2) # B Wh*Ww C
|
671 |
+
else:
|
672 |
+
x = x.flatten(2).transpose(1, 2)
|
673 |
+
x = self.pos_drop(x)
|
674 |
+
|
675 |
+
outs = {}
|
676 |
+
for i in range(self.num_layers):
|
677 |
+
layer = self.layers[i]
|
678 |
+
x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
|
679 |
+
|
680 |
+
if i in self.out_indices:
|
681 |
+
norm_layer = getattr(self, f"norm{i}")
|
682 |
+
x_out = norm_layer(x_out)
|
683 |
+
|
684 |
+
out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous()
|
685 |
+
outs["res{}".format(i + 2)] = out
|
686 |
+
|
687 |
+
return outs
|
688 |
+
|
689 |
+
def train(self, mode=True):
|
690 |
+
"""Convert the model into training mode while keep layers freezed."""
|
691 |
+
super(SwinTransformer, self).train(mode)
|
692 |
+
self._freeze_stages()
|
693 |
+
|
694 |
+
|
695 |
+
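Before the detectron2 wrapper below, a short sketch of the per-stage bookkeeping done in SwinTransformer.__init__ above for the default Swin-T-style settings: the channel width doubles per stage, and the stochastic-depth rate grows linearly over all blocks and is sliced per stage.

import torch

embed_dim, depths, drop_path_rate = 96, [2, 2, 6, 2], 0.2
num_features = [int(embed_dim * 2 ** i) for i in range(len(depths))]
print(num_features)                 # [96, 192, 384, 768] -> channels of res2..res5
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
per_stage = [dpr[sum(depths[:i]):sum(depths[:i + 1])] for i in range(len(depths))]
print([len(p) for p in per_stage])  # [2, 2, 6, 2] drop-path rates, one per block in each stage
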
@BACKBONE_REGISTRY.register()
|
696 |
+
class D2SwinTransformer(SwinTransformer, Backbone):
|
697 |
+
def __init__(self, cfg, input_shape):
|
698 |
+
|
699 |
+
pretrain_img_size = cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE
|
700 |
+
patch_size = cfg.MODEL.SWIN.PATCH_SIZE
|
701 |
+
in_chans = 3
|
702 |
+
embed_dim = cfg.MODEL.SWIN.EMBED_DIM
|
703 |
+
depths = cfg.MODEL.SWIN.DEPTHS
|
704 |
+
num_heads = cfg.MODEL.SWIN.NUM_HEADS
|
705 |
+
window_size = cfg.MODEL.SWIN.WINDOW_SIZE
|
706 |
+
mlp_ratio = cfg.MODEL.SWIN.MLP_RATIO
|
707 |
+
qkv_bias = cfg.MODEL.SWIN.QKV_BIAS
|
708 |
+
qk_scale = cfg.MODEL.SWIN.QK_SCALE
|
709 |
+
drop_rate = cfg.MODEL.SWIN.DROP_RATE
|
710 |
+
attn_drop_rate = cfg.MODEL.SWIN.ATTN_DROP_RATE
|
711 |
+
drop_path_rate = cfg.MODEL.SWIN.DROP_PATH_RATE
|
712 |
+
norm_layer = nn.LayerNorm
|
713 |
+
ape = cfg.MODEL.SWIN.APE
|
714 |
+
patch_norm = cfg.MODEL.SWIN.PATCH_NORM
|
715 |
+
use_checkpoint = cfg.MODEL.SWIN.USE_CHECKPOINT
|
716 |
+
pretrained_weight = cfg.MODEL.SWIN.PRETRAINED_WEIGHT
|
717 |
+
|
718 |
+
|
719 |
+
super().__init__(
|
720 |
+
pretrain_img_size,
|
721 |
+
patch_size,
|
722 |
+
in_chans,
|
723 |
+
embed_dim,
|
724 |
+
depths,
|
725 |
+
num_heads,
|
726 |
+
window_size,
|
727 |
+
mlp_ratio,
|
728 |
+
qkv_bias,
|
729 |
+
qk_scale,
|
730 |
+
drop_rate,
|
731 |
+
attn_drop_rate,
|
732 |
+
drop_path_rate,
|
733 |
+
norm_layer,
|
734 |
+
ape,
|
735 |
+
patch_norm,
|
736 |
+
use_checkpoint=use_checkpoint,
|
737 |
+
)
|
738 |
+
self.init_weights(pretrained_weight)
|
739 |
+
self._out_features = cfg.MODEL.SWIN.OUT_FEATURES
|
740 |
+
|
741 |
+
self._out_feature_strides = {
|
742 |
+
"res2": 4,
|
743 |
+
"res3": 8,
|
744 |
+
"res4": 16,
|
745 |
+
"res5": 32,
|
746 |
+
}
|
747 |
+
self._out_feature_channels = {
|
748 |
+
"res2": self.num_features[0],
|
749 |
+
"res3": self.num_features[1],
|
750 |
+
"res4": self.num_features[2],
|
751 |
+
"res5": self.num_features[3],
|
752 |
+
}
|
753 |
+
|
754 |
+
def forward(self, x):
|
755 |
+
"""
|
756 |
+
Args:
|
757 |
+
x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.
|
758 |
+
Returns:
|
759 |
+
dict[str->Tensor]: names and the corresponding features
|
760 |
+
"""
|
761 |
+
assert (
|
762 |
+
x.dim() == 4
|
763 |
+
), f"SwinTransformer takes an input of shape (N, C, H, W). Got {x.shape} instead!"
|
764 |
+
outputs = {}
|
765 |
+
y = super().forward(x)
|
766 |
+
for k in y.keys():
|
767 |
+
if k in self._out_features:
|
768 |
+
outputs[k] = y[k]
|
769 |
+
return outputs
|
770 |
+
|
771 |
+
def output_shape(self):
|
772 |
+
return {
|
773 |
+
name: ShapeSpec(
|
774 |
+
channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
|
775 |
+
)
|
776 |
+
for name in self._out_features
|
777 |
+
}
|
778 |
+
|
779 |
+
@property
|
780 |
+
def size_divisibility(self):
|
781 |
+
return 32
|
782 |
+
|
783 |
+
|
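A minimal usage sketch for the registered D2SwinTransformer above; the import paths, and the assumption that no pretrained weight is configured (MODEL.SWIN.PRETRAINED_WEIGHT left at the None default added in GLEE/glee/config.py), are mine:

import torch
from detectron2.config import get_cfg
from detectron2.layers import ShapeSpec
from glee.config import add_glee_config            # assumed import path
from glee.backbone.swin import D2SwinTransformer   # assumed import path

cfg = get_cfg()
add_glee_config(cfg)
backbone = D2SwinTransformer(cfg, ShapeSpec(channels=3))
feats = backbone(torch.randn(1, 3, 512, 512))
print({k: tuple(v.shape) for k, v in feats.items()})
# expected: {'res2': (1, 96, 128, 128), 'res3': (1, 192, 64, 64), 'res4': (1, 384, 32, 32), 'res5': (1, 768, 16, 16)}
print(backbone.output_shape()["res5"].stride)      # 32
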
GLEE/glee/backbone/vit.py
ADDED
@@ -0,0 +1,472 @@
1 |
+
import logging
|
2 |
+
import math
|
3 |
+
import fvcore.nn.weight_init as weight_init
|
4 |
+
import torch
|
5 |
+
import torch.nn as nn
|
6 |
+
|
7 |
+
from detectron2.layers import CNNBlockBase, Conv2d, get_norm
|
8 |
+
from detectron2.modeling.backbone.fpn import _assert_strides_are_log2_contiguous
|
9 |
+
import torch.nn.functional as F
|
10 |
+
|
11 |
+
from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec
|
12 |
+
from .utils import (
|
13 |
+
PatchEmbed,
|
14 |
+
add_decomposed_rel_pos,
|
15 |
+
get_abs_pos,
|
16 |
+
window_partition,
|
17 |
+
window_unpartition,
|
18 |
+
)
|
19 |
+
from functools import partial
|
20 |
+
import torch.utils.checkpoint as checkpoint
|
21 |
+
|
22 |
+
logger = logging.getLogger(__name__)
|
23 |
+
|
24 |
+
|
25 |
+
__all__ = ["ViT", "SimpleFeaturePyramid", "get_vit_lr_decay_rate"]
|
26 |
+
|
27 |
+
|
28 |
+
class Attention(nn.Module):
|
29 |
+
"""Multi-head Attention block with relative position embeddings."""
|
30 |
+
|
31 |
+
def __init__(
|
32 |
+
self,
|
33 |
+
dim,
|
34 |
+
num_heads=8,
|
35 |
+
qkv_bias=True,
|
36 |
+
use_rel_pos=False,
|
37 |
+
rel_pos_zero_init=True,
|
38 |
+
input_size=None,
|
39 |
+
):
|
40 |
+
"""
|
41 |
+
Args:
|
42 |
+
dim (int): Number of input channels.
|
43 |
+
num_heads (int): Number of attention heads.
|
44 |
+
qkv_bias (bool): If True, add a learnable bias to query, key, value.
|
45 |
+
rel_pos (bool): If True, add relative positional embeddings to the attention map.
|
46 |
+
rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
|
47 |
+
input_size (int or None): Input resolution for calculating the relative positional
|
48 |
+
parameter size.
|
49 |
+
"""
|
50 |
+
super().__init__()
|
51 |
+
self.num_heads = num_heads
|
52 |
+
head_dim = dim // num_heads
|
53 |
+
self.scale = head_dim**-0.5
|
54 |
+
|
55 |
+
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
|
56 |
+
self.proj = nn.Linear(dim, dim)
|
57 |
+
|
58 |
+
self.use_rel_pos = use_rel_pos
|
59 |
+
if self.use_rel_pos:
|
60 |
+
# initialize relative positional embeddings
|
61 |
+
self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
|
62 |
+
self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))
|
63 |
+
|
64 |
+
if not rel_pos_zero_init:
|
65 |
+
nn.init.trunc_normal_(self.rel_pos_h, std=0.02)
|
66 |
+
nn.init.trunc_normal_(self.rel_pos_w, std=0.02)
|
67 |
+
|
68 |
+
def forward(self, x):
|
69 |
+
B, H, W, _ = x.shape
|
70 |
+
# qkv with shape (3, B, nHead, H * W, C)
|
71 |
+
qkv = self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
|
72 |
+
# q, k, v with shape (B * nHead, H * W, C)
|
73 |
+
q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(0)
|
74 |
+
|
75 |
+
with torch.backends.cuda.sdp_kernel(
|
76 |
+
enable_flash=True, enable_math=False, enable_mem_efficient=True
|
77 |
+
):
|
78 |
+
x = F.scaled_dot_product_attention(q, k, v)
|
79 |
+
attn = (q * self.scale) @ k.transpose(-2, -1)
|
80 |
+
|
81 |
+
if self.use_rel_pos:
|
82 |
+
attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W))
|
83 |
+
|
84 |
+
attn = attn.softmax(dim=-1)
|
85 |
+
x = (attn @ v).view(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1)
|
86 |
+
x = self.proj(x)
|
87 |
+
|
88 |
+
return x
|
89 |
+
|
90 |
+
|
91 |
+
class ResBottleneckBlock(CNNBlockBase):
|
92 |
+
"""
|
93 |
+
The standard bottleneck residual block without the last activation layer.
|
94 |
+
It contains 3 conv layers with kernels 1x1, 3x3, 1x1.
|
95 |
+
"""
|
96 |
+
|
97 |
+
def __init__(
|
98 |
+
self,
|
99 |
+
in_channels,
|
100 |
+
out_channels,
|
101 |
+
bottleneck_channels,
|
102 |
+
norm="LN",
|
103 |
+
act_layer=nn.GELU,
|
104 |
+
):
|
105 |
+
"""
|
106 |
+
Args:
|
107 |
+
in_channels (int): Number of input channels.
|
108 |
+
out_channels (int): Number of output channels.
|
109 |
+
bottleneck_channels (int): number of output channels for the 3x3
|
110 |
+
"bottleneck" conv layers.
|
111 |
+
norm (str or callable): normalization for all conv layers.
|
112 |
+
See :func:`layers.get_norm` for supported format.
|
113 |
+
act_layer (callable): activation for all conv layers.
|
114 |
+
"""
|
115 |
+
super().__init__(in_channels, out_channels, 1)
|
116 |
+
|
117 |
+
self.conv1 = Conv2d(in_channels, bottleneck_channels, 1, bias=False)
|
118 |
+
self.norm1 = get_norm(norm, bottleneck_channels)
|
119 |
+
self.act1 = act_layer()
|
120 |
+
|
121 |
+
self.conv2 = Conv2d(
|
122 |
+
bottleneck_channels,
|
123 |
+
bottleneck_channels,
|
124 |
+
3,
|
125 |
+
padding=1,
|
126 |
+
bias=False,
|
127 |
+
)
|
128 |
+
self.norm2 = get_norm(norm, bottleneck_channels)
|
129 |
+
self.act2 = act_layer()
|
130 |
+
|
131 |
+
self.conv3 = Conv2d(bottleneck_channels, out_channels, 1, bias=False)
|
132 |
+
self.norm3 = get_norm(norm, out_channels)
|
133 |
+
|
134 |
+
for layer in [self.conv1, self.conv2, self.conv3]:
|
135 |
+
weight_init.c2_msra_fill(layer)
|
136 |
+
for layer in [self.norm1, self.norm2]:
|
137 |
+
layer.weight.data.fill_(1.0)
|
138 |
+
layer.bias.data.zero_()
|
139 |
+
# zero init last norm layer.
|
140 |
+
self.norm3.weight.data.zero_()
|
141 |
+
self.norm3.bias.data.zero_()
|
142 |
+
|
143 |
+
def forward(self, x):
|
144 |
+
out = x
|
145 |
+
for layer in self.children():
|
146 |
+
out = layer(out)
|
147 |
+
|
148 |
+
out = x + out
|
149 |
+
return out
|
150 |
+
|
151 |
+
|
152 |
+
class Block(nn.Module):
|
153 |
+
"""Transformer blocks with support of window attention and residual propagation blocks"""
|
154 |
+
|
155 |
+
def __init__(
|
156 |
+
self,
|
157 |
+
dim,
|
158 |
+
num_heads,
|
159 |
+
mlp_ratio=4.0,
|
160 |
+
qkv_bias=True,
|
161 |
+
drop_path=0.0,
|
162 |
+
norm_layer=nn.LayerNorm,
|
163 |
+
act_layer=nn.GELU,
|
164 |
+
use_rel_pos=False,
|
165 |
+
rel_pos_zero_init=True,
|
166 |
+
window_size=0,
|
167 |
+
use_residual_block=False,
|
168 |
+
input_size=None,
|
169 |
+
):
|
170 |
+
"""
|
171 |
+
Args:
|
172 |
+
dim (int): Number of input channels.
|
173 |
+
num_heads (int): Number of attention heads in each ViT block.
|
174 |
+
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
|
175 |
+
qkv_bias (bool): If True, add a learnable bias to query, key, value.
|
176 |
+
drop_path (float): Stochastic depth rate.
|
177 |
+
norm_layer (nn.Module): Normalization layer.
|
178 |
+
act_layer (nn.Module): Activation layer.
|
179 |
+
use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
|
180 |
+
rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
|
181 |
+
window_size (int): Window size for window attention blocks. If 0, do not
|
182 |
+
use window attention.
|
183 |
+
use_residual_block (bool): If True, use a residual block after the MLP block.
|
184 |
+
input_size (int or None): Input resolution for calculating the relative positional
|
185 |
+
parameter size.
|
186 |
+
"""
|
187 |
+
super().__init__()
|
188 |
+
self.norm1 = norm_layer(dim)
|
189 |
+
self.attn = Attention(
|
190 |
+
dim,
|
191 |
+
num_heads=num_heads,
|
192 |
+
qkv_bias=qkv_bias,
|
193 |
+
use_rel_pos=use_rel_pos,
|
194 |
+
rel_pos_zero_init=rel_pos_zero_init,
|
195 |
+
input_size=input_size if window_size == 0 else (window_size, window_size),
|
196 |
+
)
|
197 |
+
|
198 |
+
from timm.models.layers import DropPath, Mlp
|
199 |
+
|
200 |
+
self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
|
201 |
+
self.norm2 = norm_layer(dim)
|
202 |
+
self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer)
|
203 |
+
|
204 |
+
self.window_size = window_size
|
205 |
+
|
206 |
+
self.use_residual_block = use_residual_block
|
207 |
+
if use_residual_block:
|
208 |
+
# Use a residual block with bottleneck channel as dim // 2
|
209 |
+
self.residual = ResBottleneckBlock(
|
210 |
+
in_channels=dim,
|
211 |
+
out_channels=dim,
|
212 |
+
bottleneck_channels=dim // 2,
|
213 |
+
norm="LN",
|
214 |
+
act_layer=act_layer,
|
215 |
+
)
|
216 |
+
|
217 |
+
def forward(self, x):
|
218 |
+
shortcut = x
|
219 |
+
x = self.norm1(x)
|
220 |
+
# Window partition
|
221 |
+
if self.window_size > 0:
|
222 |
+
H, W = x.shape[1], x.shape[2]
|
223 |
+
x, pad_hw = window_partition(x, self.window_size)
|
224 |
+
x = self.attn(x)
|
225 |
+
# Reverse window partition
|
226 |
+
if self.window_size > 0:
|
227 |
+
x = window_unpartition(x, self.window_size, pad_hw, (H, W))
|
228 |
+
|
229 |
+
x = shortcut + self.drop_path(x)
|
230 |
+
x = x + self.drop_path(self.mlp(self.norm2(x)))
|
231 |
+
|
232 |
+
if self.use_residual_block:
|
233 |
+
x = self.residual(x.permute(0, 3, 1, 2)).permute(0, 2, 3, 1)
|
234 |
+
|
235 |
+
return x
|
236 |
+
|
237 |
+
|
238 |
+
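The window_size handling in Block above is what lets the ViT below interleave windowed and global attention: blocks listed in window_block_indexes get window_size, the rest get 0 and attend globally. A tiny illustration for the ViT-Base layout used by D2ViT further down (blocks 2, 5, 8, 11 global):

depth, window_size = 12, 14
window_block_indexes = [0, 1, 3, 4, 6, 7, 9, 10]
per_block = [window_size if i in window_block_indexes else 0 for i in range(depth)]
print(per_block)  # [14, 14, 0, 14, 14, 0, 14, 14, 0, 14, 14, 0]
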
class ViT(Backbone):
|
239 |
+
"""
|
240 |
+
This module implements Vision Transformer (ViT) backbone in :paper:`vitdet`.
|
241 |
+
"Exploring Plain Vision Transformer Backbones for Object Detection",
|
242 |
+
https://arxiv.org/abs/2203.16527
|
243 |
+
"""
|
244 |
+
|
245 |
+
def __init__(
|
246 |
+
self,
|
247 |
+
img_size=1024,
|
248 |
+
patch_size=16,
|
249 |
+
in_chans=3,
|
250 |
+
embed_dim=768,
|
251 |
+
depth=12,
|
252 |
+
num_heads=12,
|
253 |
+
mlp_ratio=4.0,
|
254 |
+
qkv_bias=True,
|
255 |
+
drop_path_rate=0.0,
|
256 |
+
norm_layer=nn.LayerNorm,
|
257 |
+
act_layer=nn.GELU,
|
258 |
+
use_abs_pos=True,
|
259 |
+
use_rel_pos=False,
|
260 |
+
rel_pos_zero_init=True,
|
261 |
+
window_size=0,
|
262 |
+
window_block_indexes=(),
|
263 |
+
residual_block_indexes=(),
|
264 |
+
use_act_checkpoint=False,
|
265 |
+
pretrain_img_size=224,
|
266 |
+
pretrain_use_cls_token=True,
|
267 |
+
out_feature="last_feat",
|
268 |
+
):
|
269 |
+
"""
|
270 |
+
Args:
|
271 |
+
img_size (int): Input image size.
|
272 |
+
patch_size (int): Patch size.
|
273 |
+
in_chans (int): Number of input image channels.
|
274 |
+
embed_dim (int): Patch embedding dimension.
|
275 |
+
depth (int): Depth of ViT.
|
276 |
+
num_heads (int): Number of attention heads in each ViT block.
|
277 |
+
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
|
278 |
+
qkv_bias (bool): If True, add a learnable bias to query, key, value.
|
279 |
+
drop_path_rate (float): Stochastic depth rate.
|
280 |
+
norm_layer (nn.Module): Normalization layer.
|
281 |
+
act_layer (nn.Module): Activation layer.
|
282 |
+
use_abs_pos (bool): If True, use absolute positional embeddings.
|
283 |
+
use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
|
284 |
+
rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
|
285 |
+
window_size (int): Window size for window attention blocks.
|
286 |
+
window_block_indexes (list): Indexes for blocks using window attention.
|
287 |
+
residual_block_indexes (list): Indexes for blocks using conv propagation.
|
288 |
+
use_act_checkpoint (bool): If True, use activation checkpointing.
|
289 |
+
pretrain_img_size (int): input image size for pretraining models.
|
290 |
+
pretrain_use_cls_token (bool): If True, pretraining models use class token.
|
291 |
+
out_feature (str): name of the feature from the last block.
|
292 |
+
"""
|
293 |
+
super().__init__()
|
294 |
+
self.pretrain_use_cls_token = pretrain_use_cls_token
|
295 |
+
|
296 |
+
self.patch_embed = PatchEmbed(
|
297 |
+
kernel_size=(patch_size, patch_size),
|
298 |
+
stride=(patch_size, patch_size),
|
299 |
+
in_chans=in_chans,
|
300 |
+
embed_dim=embed_dim,
|
301 |
+
)
|
302 |
+
|
303 |
+
if use_abs_pos:
|
304 |
+
# Initialize absolute positional embedding with pretrain image size.
|
305 |
+
num_patches = (pretrain_img_size // patch_size) * (pretrain_img_size // patch_size)
|
306 |
+
num_positions = (num_patches + 1) if pretrain_use_cls_token else num_patches
|
307 |
+
self.pos_embed = nn.Parameter(torch.zeros(1, num_positions, embed_dim))
|
308 |
+
else:
|
309 |
+
self.pos_embed = None
|
310 |
+
|
311 |
+
# stochastic depth decay rule
|
312 |
+
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
|
313 |
+
|
314 |
+
self.blocks = nn.ModuleList()
|
315 |
+
for i in range(depth):
|
316 |
+
block = Block(
|
317 |
+
dim=embed_dim,
|
318 |
+
num_heads=num_heads,
|
319 |
+
mlp_ratio=mlp_ratio,
|
320 |
+
qkv_bias=qkv_bias,
|
321 |
+
drop_path=dpr[i],
|
322 |
+
norm_layer=norm_layer,
|
323 |
+
act_layer=act_layer,
|
324 |
+
use_rel_pos=use_rel_pos,
|
325 |
+
rel_pos_zero_init=rel_pos_zero_init,
|
326 |
+
window_size=window_size if i in window_block_indexes else 0,
|
327 |
+
use_residual_block=i in residual_block_indexes,
|
328 |
+
input_size=(img_size // patch_size, img_size // patch_size),
|
329 |
+
)
|
330 |
+
if use_act_checkpoint:
|
331 |
+
# TODO: use torch.utils.checkpoint
|
332 |
+
from fairscale.nn.checkpoint import checkpoint_wrapper
|
333 |
+
|
334 |
+
block = checkpoint_wrapper(block)
|
335 |
+
self.blocks.append(block)
|
336 |
+
|
337 |
+
self._out_feature_channels = {out_feature: embed_dim}
|
338 |
+
self._out_feature_strides = {out_feature: patch_size}
|
339 |
+
self._out_features = [out_feature]
|
340 |
+
|
341 |
+
if self.pos_embed is not None:
|
342 |
+
nn.init.trunc_normal_(self.pos_embed, std=0.02)
|
343 |
+
|
344 |
+
# In our method, we don't use backbone feature with stride 4
|
345 |
+
self.fpn1 = nn.Sequential(
|
346 |
+
nn.ConvTranspose2d(embed_dim, embed_dim // 2, kernel_size=2, stride=2),
|
347 |
+
)
|
348 |
+
self.fpn2 = nn.Identity()
|
349 |
+
self.fpn3 = nn.MaxPool2d(kernel_size=2, stride=2)
|
350 |
+
|
351 |
+
self.apply(self._init_weights)
|
352 |
+
|
353 |
+
def _init_weights(self, m):
|
354 |
+
if isinstance(m, nn.Linear):
|
355 |
+
nn.init.trunc_normal_(m.weight, std=0.02)
|
356 |
+
if isinstance(m, nn.Linear) and m.bias is not None:
|
357 |
+
nn.init.constant_(m.bias, 0)
|
358 |
+
elif isinstance(m, nn.LayerNorm):
|
359 |
+
nn.init.constant_(m.bias, 0)
|
360 |
+
nn.init.constant_(m.weight, 1.0)
|
361 |
+
|
362 |
+
def forward(self, x):
|
363 |
+
x = self.patch_embed(x)
|
364 |
+
if self.pos_embed is not None:
|
365 |
+
x = x + get_abs_pos(
|
366 |
+
self.pos_embed, self.pretrain_use_cls_token, (x.shape[1], x.shape[2])
|
367 |
+
)
|
368 |
+
|
369 |
+
for blk in self.blocks:
|
370 |
+
x = blk(x)
|
371 |
+
xp = x.permute(0, 3, 1, 2) # (b, h, w, c) --> (b, c, h, w)
|
372 |
+
|
373 |
+
features = []
|
374 |
+
ops = [self.fpn1, self.fpn2, self.fpn3]
|
375 |
+
for i in range(len(ops)):
|
376 |
+
features.append(ops[i](xp))
|
377 |
+
rets = {"res{}".format(u + 3): v for (u,v) in enumerate(features)}
|
378 |
+
|
379 |
+
return rets
|
380 |
+
|
381 |
+
|
382 |
+
|
383 |
+
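The fpn1/fpn2/fpn3 heads built in ViT.__init__ turn the single stride-16 ViT feature map into the res3/res4/res5 pyramid returned by forward. A standalone shape sketch with a toy tensor and the same module definitions as above:

import torch
import torch.nn as nn

embed_dim = 768
xp = torch.randn(1, embed_dim, 64, 64)   # stride-16 map for a 1024x1024 input
fpn1 = nn.Sequential(nn.ConvTranspose2d(embed_dim, embed_dim // 2, kernel_size=2, stride=2))
fpn2 = nn.Identity()
fpn3 = nn.MaxPool2d(kernel_size=2, stride=2)
rets = {"res{}".format(u + 3): op(xp) for u, op in enumerate([fpn1, fpn2, fpn3])}
print({k: tuple(v.shape) for k, v in rets.items()})
# {'res3': (1, 384, 128, 128), 'res4': (1, 768, 64, 64), 'res5': (1, 768, 32, 32)}
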
@BACKBONE_REGISTRY.register()
|
384 |
+
class D2ViT(ViT, Backbone):
|
385 |
+
def __init__(self, cfg, input_shape):
|
386 |
+
use_checkpoint = cfg.MODEL.VIT.USE_CHECKPOINT
|
387 |
+
if cfg.MODEL.VIT.NAME == "ViT-Base":
|
388 |
+
embed_dim=768
|
389 |
+
depth=12
|
390 |
+
drop_path_rate=0.1
|
391 |
+
num_heads=12
|
392 |
+
elif cfg.MODEL.VIT.NAME == "ViT-Large":
|
393 |
+
embed_dim=1024
|
394 |
+
depth=24
|
395 |
+
drop_path_rate=0.4
|
396 |
+
num_heads=16
|
397 |
+
elif cfg.MODEL.VIT.NAME == "ViT-huge":
|
398 |
+
embed_dim=1280
|
399 |
+
depth=32
|
400 |
+
drop_path_rate=0.5
|
401 |
+
num_heads=16
|
402 |
+
else:
|
403 |
+
raise ValueError("Unsupported ViT name")
|
404 |
+
super().__init__(
|
405 |
+
img_size=1024,
|
406 |
+
patch_size=16,
|
407 |
+
in_chans=input_shape.channels,
|
408 |
+
embed_dim=embed_dim,
|
409 |
+
depth=depth,
|
410 |
+
num_heads=num_heads,
|
411 |
+
drop_path_rate=drop_path_rate,
|
412 |
+
window_size=14,
|
413 |
+
mlp_ratio=4,
|
414 |
+
qkv_bias=True,
|
415 |
+
norm_layer=partial(nn.LayerNorm, eps=1e-6),
|
416 |
+
window_block_indexes=[
|
417 |
+
# 2, 5, 8, 11 for global attention
|
418 |
+
0,
|
419 |
+
1,
|
420 |
+
3,
|
421 |
+
4,
|
422 |
+
6,
|
423 |
+
7,
|
424 |
+
9,
|
425 |
+
10,
|
426 |
+
],
|
427 |
+
residual_block_indexes=[],
|
428 |
+
use_rel_pos=True,
|
429 |
+
out_feature="last_feat",
|
430 |
+
use_act_checkpoint=use_checkpoint)
|
431 |
+
|
432 |
+
self._out_features = cfg.MODEL.VIT.OUT_FEATURES
|
433 |
+
|
434 |
+
self._out_feature_strides = {
|
435 |
+
"res3": 8,
|
436 |
+
"res4": 16,
|
437 |
+
"res5": 32,
|
438 |
+
}
|
439 |
+
self._out_feature_channels = {
|
440 |
+
"res3": embed_dim // 2,
|
441 |
+
"res4": embed_dim,
|
442 |
+
"res5": embed_dim,
|
443 |
+
}
|
444 |
+
|
445 |
+
def forward(self, x):
|
446 |
+
"""
|
447 |
+
Args:
|
448 |
+
x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.
|
449 |
+
Returns:
|
450 |
+
dict[str->Tensor]: names and the corresponding features
|
451 |
+
"""
|
452 |
+
assert (
|
453 |
+
x.dim() == 4
|
454 |
+
), f"SwinTransformer takes an input of shape (N, C, H, W). Got {x.shape} instead!"
|
455 |
+
outputs = {}
|
456 |
+
y = super().forward(x)
|
457 |
+
for k in y.keys():
|
458 |
+
if k in self._out_features:
|
459 |
+
outputs[k] = y[k]
|
460 |
+
return outputs
|
461 |
+
|
462 |
+
def output_shape(self):
|
463 |
+
return {
|
464 |
+
name: ShapeSpec(
|
465 |
+
channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
|
466 |
+
)
|
467 |
+
for name in self._out_features
|
468 |
+
}
|
469 |
+
|
470 |
+
@property
|
471 |
+
def size_divisibility(self):
|
472 |
+
return 32
|
GLEE/glee/backbone/vit_utils.py
ADDED
@@ -0,0 +1,222 @@
1 |
+
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
|
2 |
+
import math
|
3 |
+
import numpy as np
|
4 |
+
from scipy import interpolate
|
5 |
+
import torch
|
6 |
+
import torch.nn as nn
|
7 |
+
import torch.nn.functional as F
|
8 |
+
|
9 |
+
__all__ = [
|
10 |
+
"window_partition",
|
11 |
+
"window_unpartition",
|
12 |
+
"add_decomposed_rel_pos",
|
13 |
+
"get_abs_pos",
|
14 |
+
"PatchEmbed",
|
15 |
+
]
|
16 |
+
|
17 |
+
|
18 |
+
def window_partition(x, window_size):
|
19 |
+
"""
|
20 |
+
Partition into non-overlapping windows with padding if needed.
|
21 |
+
Args:
|
22 |
+
x (tensor): input tokens with [B, H, W, C].
|
23 |
+
window_size (int): window size.
|
24 |
+
|
25 |
+
Returns:
|
26 |
+
windows: windows after partition with [B * num_windows, window_size, window_size, C].
|
27 |
+
(Hp, Wp): padded height and width before partition
|
28 |
+
"""
|
29 |
+
B, H, W, C = x.shape
|
30 |
+
|
31 |
+
pad_h = (window_size - H % window_size) % window_size
|
32 |
+
pad_w = (window_size - W % window_size) % window_size
|
33 |
+
if pad_h > 0 or pad_w > 0:
|
34 |
+
x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
|
35 |
+
Hp, Wp = H + pad_h, W + pad_w
|
36 |
+
|
37 |
+
x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C)
|
38 |
+
windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
|
39 |
+
return windows, (Hp, Wp)
|
40 |
+
|
41 |
+
|
42 |
+
def window_unpartition(windows, window_size, pad_hw, hw):
|
43 |
+
"""
|
44 |
+
Window unpartition into original sequences and removing padding.
|
45 |
+
Args:
|
46 |
+
x (tensor): input tokens with [B * num_windows, window_size, window_size, C].
|
47 |
+
window_size (int): window size.
|
48 |
+
pad_hw (Tuple): padded height and width (Hp, Wp).
|
49 |
+
hw (Tuple): original height and width (H, W) before padding.
|
50 |
+
|
51 |
+
Returns:
|
52 |
+
x: unpartitioned sequences with [B, H, W, C].
|
53 |
+
"""
|
54 |
+
Hp, Wp = pad_hw
|
55 |
+
H, W = hw
|
56 |
+
B = windows.shape[0] // (Hp * Wp // window_size // window_size)
|
57 |
+
x = windows.view(B, Hp // window_size, Wp // window_size, window_size, window_size, -1)
|
58 |
+
x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1)
|
59 |
+
|
60 |
+
if Hp > H or Wp > W:
|
61 |
+
x = x[:, :H, :W, :].contiguous()
|
62 |
+
return x
|
63 |
+
|
64 |
+
|
65 |
+
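A round-trip check for the two helpers above (the import path is an assumption and depends on how the GLEE/ directory is put on PYTHONPATH): partitioning pads H and W up to multiples of the window size, and unpartitioning strips the padding again.

import torch
from glee.backbone.vit_utils import window_partition, window_unpartition  # assumed import path

x = torch.randn(2, 50, 70, 96)                 # (B, H, W, C), H is not a multiple of 14
windows, (Hp, Wp) = window_partition(x, 14)    # padded to 56 x 70 -> 4 * 5 windows per image
print(windows.shape)                           # torch.Size([40, 14, 14, 96])
y = window_unpartition(windows, 14, (Hp, Wp), (50, 70))
assert torch.equal(x, y)                       # padding is removed, content unchanged
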
def get_rel_pos(q_size, k_size, rel_pos, interp_type):
|
66 |
+
"""
|
67 |
+
Get relative positional embeddings according to the relative positions of
|
68 |
+
query and key sizes.
|
69 |
+
Args:
|
70 |
+
q_size (int): size of query q.
|
71 |
+
k_size (int): size of key k.
|
72 |
+
rel_pos (Tensor): relative position embeddings (L, C).
|
73 |
+
|
74 |
+
Returns:
|
75 |
+
Extracted positional embeddings according to relative positions.
|
76 |
+
"""
|
77 |
+
max_rel_dist = int(2 * max(q_size, k_size) - 1)
|
78 |
+
# Interpolate rel pos if needed.
|
79 |
+
if rel_pos.shape[0] != max_rel_dist:
|
80 |
+
if interp_type == "vitdet":
|
81 |
+
# the vitdet impl:
|
82 |
+
# https://github.com/facebookresearch/detectron2/blob/96c752ce821a3340e27edd51c28a00665dd32a30/detectron2/modeling/backbone/utils.py#L77.
|
83 |
+
|
84 |
+
rel_pos_resized = F.interpolate(
|
85 |
+
rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
|
86 |
+
size=max_rel_dist,
|
87 |
+
mode="linear",
|
88 |
+
)
|
89 |
+
rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)
|
90 |
+
elif interp_type == "beit":
|
91 |
+
# steal from beit https://github.com/microsoft/unilm/tree/master/beit
|
92 |
+
# modified by Yuxin Fang
|
93 |
+
|
94 |
+
src_size = rel_pos.shape[0]
|
95 |
+
dst_size = max_rel_dist
|
96 |
+
|
97 |
+
q = 1.0903078
|
98 |
+
dis = []
|
99 |
+
|
100 |
+
cur = 1
|
101 |
+
for i in range(src_size // 2):
|
102 |
+
dis.append(cur)
|
103 |
+
cur += q ** (i + 1)
|
104 |
+
|
105 |
+
r_ids = [-_ for _ in reversed(dis)]
|
106 |
+
x = r_ids + [0] + dis
|
107 |
+
t = dst_size // 2.0
|
108 |
+
dx = np.arange(-t, t + 0.1, 1.0)
|
109 |
+
|
110 |
+
all_rel_pos_bias = []
|
111 |
+
for i in range(rel_pos.shape[1]):
|
112 |
+
# a hack from https://github.com/baaivision/EVA/issues/8,
|
113 |
+
# could also be used in fine-tuning, but the performance hasn't been tested.
|
114 |
+
z = rel_pos[:, i].view(src_size).cpu().float().detach().numpy()
|
115 |
+
f = interpolate.interp1d(x, z, kind='cubic', fill_value="extrapolate")
|
116 |
+
all_rel_pos_bias.append(
|
117 |
+
torch.Tensor(f(dx)).contiguous().view(-1, 1).to(rel_pos.device))
|
118 |
+
rel_pos_resized = torch.cat(all_rel_pos_bias, dim=-1)
|
119 |
+
else:
|
120 |
+
raise NotImplementedError()
|
121 |
+
else:
|
122 |
+
rel_pos_resized = rel_pos
|
123 |
+
|
124 |
+
# Scale the coords with short length if shapes for q and k are different.
|
125 |
+
q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
|
126 |
+
k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
|
127 |
+
relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)
|
128 |
+
|
129 |
+
return rel_pos_resized[relative_coords.long()]
|
130 |
+
|
131 |
+
|
132 |
+
def add_decomposed_rel_pos(attn, q, rel_pos_h, rel_pos_w, q_size, k_size, interp_type):
|
133 |
+
"""
|
134 |
+
Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
|
135 |
+
https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py # noqa B950
|
136 |
+
Args:
|
137 |
+
attn (Tensor): attention map.
|
138 |
+
q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
|
139 |
+
rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
|
140 |
+
rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
|
141 |
+
q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
|
142 |
+
k_size (Tuple): spatial sequence size of key k with (k_h, k_w).
|
143 |
+
|
144 |
+
Returns:
|
145 |
+
attn (Tensor): attention map with added relative positional embeddings.
|
146 |
+
"""
|
147 |
+
q_h, q_w = q_size
|
148 |
+
k_h, k_w = k_size
|
149 |
+
Rh = get_rel_pos(q_h, k_h, rel_pos_h, interp_type)
|
150 |
+
Rw = get_rel_pos(q_w, k_w, rel_pos_w, interp_type)
|
151 |
+
|
152 |
+
B, _, dim = q.shape
|
153 |
+
r_q = q.reshape(B, q_h, q_w, dim)
|
154 |
+
rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh)
|
155 |
+
rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw)
|
156 |
+
|
157 |
+
attn = (
|
158 |
+
attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]
|
159 |
+
).view(B, q_h * q_w, k_h * k_w)
|
160 |
+
|
161 |
+
return attn
|
162 |
+
|
163 |
+
|
164 |
+
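The einsum-based bias above decomposes the relative-position term into a height part and a width part that are broadcast onto the (q_h*q_w, k_h*k_w) attention map. A standalone shape check with toy tensors (this reproduces only the broadcasting, not the repo's lookup of Rh/Rw via get_rel_pos):

import torch

B, q_h, q_w, C = 2, 7, 7, 32
r_q = torch.randn(B, q_h, q_w, C)
Rh = torch.randn(q_h, q_h, C)   # toy stand-in for get_rel_pos along the height axis
Rw = torch.randn(q_w, q_w, C)   # toy stand-in for get_rel_pos along the width axis

rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh)
rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw)
attn = torch.zeros(B, q_h * q_w, q_h * q_w)
attn = (
    attn.view(B, q_h, q_w, q_h, q_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]
).view(B, q_h * q_w, q_h * q_w)
print(attn.shape)               # torch.Size([2, 49, 49])
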
def get_abs_pos(abs_pos, has_cls_token, hw):
|
165 |
+
"""
|
166 |
+
Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token
|
167 |
+
dimension for the original embeddings.
|
168 |
+
Args:
|
169 |
+
abs_pos (Tensor): absolute positional embeddings with (1, num_position, C).
|
170 |
+
has_cls_token (bool): If true, has 1 embedding in abs_pos for cls token.
|
171 |
+
hw (Tuple): size of input image tokens.
|
172 |
+
|
173 |
+
Returns:
|
174 |
+
Absolute positional embeddings after processing with shape (1, H, W, C)
|
175 |
+
"""
|
176 |
+
h, w = hw
|
177 |
+
if has_cls_token:
|
178 |
+
abs_pos = abs_pos[:, 1:]
|
179 |
+
xy_num = abs_pos.shape[1]
|
180 |
+
size = int(math.sqrt(xy_num))
|
181 |
+
assert size * size == xy_num
|
182 |
+
|
183 |
+
if size != h or size != w:
|
184 |
+
new_abs_pos = F.interpolate(
|
185 |
+
abs_pos.reshape(1, size, size, -1).permute(0, 3, 1, 2),
|
186 |
+
size=(h, w),
|
187 |
+
mode="bicubic",
|
188 |
+
align_corners=False,
|
189 |
+
)
|
190 |
+
|
191 |
+
return new_abs_pos.permute(0, 2, 3, 1)
|
192 |
+
else:
|
193 |
+
return abs_pos.reshape(1, h, w, -1)
|
194 |
+
|
195 |
+
|
196 |
+
class PatchEmbed(nn.Module):
|
197 |
+
"""
|
198 |
+
Image to Patch Embedding.
|
199 |
+
"""
|
200 |
+
|
201 |
+
def __init__(
|
202 |
+
self, kernel_size=(16, 16), stride=(16, 16), padding=(0, 0), in_chans=3, embed_dim=768
|
203 |
+
):
|
204 |
+
"""
|
205 |
+
Args:
|
206 |
+
kernel_size (Tuple): kernel size of the projection layer.
|
207 |
+
stride (Tuple): stride of the projection layer.
|
208 |
+
padding (Tuple): padding size of the projection layer.
|
209 |
+
in_chans (int): Number of input image channels.
|
210 |
+
embed_dim (int): Patch embedding dimension.
|
211 |
+
"""
|
212 |
+
super().__init__()
|
213 |
+
|
214 |
+
self.proj = nn.Conv2d(
|
215 |
+
in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding
|
216 |
+
)
|
217 |
+
|
218 |
+
def forward(self, x):
|
219 |
+
x = self.proj(x)
|
220 |
+
# B C H W -> B H W C
|
221 |
+
x = x.permute(0, 2, 3, 1)
|
222 |
+
return x
|
GLEE/glee/config.py
ADDED
@@ -0,0 +1,387 @@
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
from detectron2.config import CfgNode as CN
|
3 |
+
|
4 |
+
|
5 |
+
def add_glee_config(cfg):
|
6 |
+
"""
|
7 |
+
Add config for GLEE.
|
8 |
+
"""
|
9 |
+
|
10 |
+
cfg.FIND_UNUSED_PARAMETERS = True
|
11 |
+
cfg.MODEL.MAX_CATEGORY_LEN = 100
|
12 |
+
cfg.MODEL.PSEUDO_VIDEO = False
|
13 |
+
cfg.MODEL.FREEZE_WHOLE = False
|
14 |
+
cfg.MODEL.CONTRAS_MEAN = False
|
15 |
+
cfg.MODEL.CROSS_TRACK = False
|
16 |
+
cfg.MODEL.TRACK_VERSION = 'v3'
|
17 |
+
|
18 |
+
cfg.INPUT.SAMPLING_FRAME_NUM = 1
|
19 |
+
cfg.INPUT.SAMPLING_FRAME_RANGE = 10
|
20 |
+
cfg.INPUT.SAMPLING_INTERVAL = 1
|
21 |
+
cfg.INPUT.SAMPLING_FRAME_SHUFFLE = False
|
22 |
+
cfg.INPUT.AUGMENTATIONS = [] # "brightness", "contrast", "saturation", "rotation"
|
23 |
+
cfg.INPUT.DATASET_MAPPER_NAME = None
|
24 |
+
|
25 |
+
cfg.DATALOADER.DATASET_RATIO = [1, 1]
|
26 |
+
cfg.DATALOADER.USE_DIFF_BS_SIZE = True
|
27 |
+
cfg.DATALOADER.DATASET_BS = [2, 2]
|
28 |
+
cfg.DATALOADER.DATASET_FILTERS = [True, True]
|
29 |
+
cfg.DATALOADER.USE_RFS = [False, False]
|
30 |
+
cfg.DATALOADER.MULTI_DATASET_GROUPING = True
|
31 |
+
cfg.DATALOADER.DATASET_ANN = ['image']
|
32 |
+
|
33 |
+
|
34 |
+
cfg.INPUT.SIZE_DIVISIBILITY = -1
|
35 |
+
|
36 |
+
cfg.DATALOADER.DATASET_RATIO = [1, 1]
|
37 |
+
cfg.DATALOADER.USE_DIFF_BS_SIZE = True
|
38 |
+
cfg.DATALOADER.DATASET_BS = [2, 2]
|
39 |
+
cfg.DATALOADER.USE_RFS = [False, False]
|
40 |
+
cfg.DATALOADER.MULTI_DATASET_GROUPING = True
|
41 |
+
cfg.DATALOADER.DATASET_ANN = ['box', 'box']
|
42 |
+
|
43 |
+
# Allow different datasets to use different input resolutions
|
44 |
+
cfg.INPUT.MIN_SIZE_TRAIN_MULTI = [(480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800), (320, 352, 392, 416, 448, 480, 512, 544, 576, 608, 640)]
|
45 |
+
cfg.INPUT.MAX_SIZE_TRAIN_MULTI = [1333, 768]
|
46 |
+
|
47 |
+
|
48 |
+
# MaskDINO model config
|
49 |
+
cfg.MODEL.MaskDINO = CN()
|
50 |
+
cfg.MODEL.MaskDINO.LEARN_TGT = False
|
51 |
+
|
52 |
+
# loss
|
53 |
+
cfg.MODEL.MaskDINO.PANO_BOX_LOSS = False
|
54 |
+
cfg.MODEL.MaskDINO.SEMANTIC_CE_LOSS = False
|
55 |
+
cfg.MODEL.MaskDINO.DEEP_SUPERVISION = True
|
56 |
+
cfg.MODEL.MaskDINO.NO_OBJECT_WEIGHT = 0.1
|
57 |
+
cfg.MODEL.MaskDINO.CLASS_WEIGHT = 4.0
|
58 |
+
cfg.MODEL.MaskDINO.DICE_WEIGHT = 5.0
|
59 |
+
cfg.MODEL.MaskDINO.MASK_WEIGHT = 5.0
|
60 |
+
cfg.MODEL.MaskDINO.BOX_WEIGHT = 5.
|
61 |
+
cfg.MODEL.MaskDINO.GIOU_WEIGHT = 2.
|
62 |
+
|
63 |
+
# cost weight
|
64 |
+
cfg.MODEL.MaskDINO.COST_CLASS_WEIGHT = 4.0
|
65 |
+
cfg.MODEL.MaskDINO.COST_DICE_WEIGHT = 5.0
|
66 |
+
cfg.MODEL.MaskDINO.COST_MASK_WEIGHT = 5.0
|
67 |
+
cfg.MODEL.MaskDINO.COST_BOX_WEIGHT = 5.
|
68 |
+
cfg.MODEL.MaskDINO.COST_GIOU_WEIGHT = 2.
|
69 |
+
|
70 |
+
# transformer config
|
71 |
+
cfg.MODEL.MaskDINO.NHEADS = 8
|
72 |
+
cfg.MODEL.MaskDINO.DROPOUT = 0.1
|
73 |
+
cfg.MODEL.MaskDINO.DIM_FEEDFORWARD = 2048
|
74 |
+
cfg.MODEL.MaskDINO.ENC_LAYERS = 0
|
75 |
+
cfg.MODEL.MaskDINO.DEC_LAYERS = 6
|
76 |
+
cfg.MODEL.MaskDINO.INITIAL_PRED = True
|
77 |
+
cfg.MODEL.MaskDINO.PRE_NORM = False
|
78 |
+
cfg.MODEL.MaskDINO.BOX_LOSS = True
|
79 |
+
cfg.MODEL.MaskDINO.HIDDEN_DIM = 256
|
80 |
+
cfg.MODEL.MaskDINO.NUM_OBJECT_QUERIES = 100
|
81 |
+
|
82 |
+
cfg.MODEL.MaskDINO.ENFORCE_INPUT_PROJ = False
|
83 |
+
cfg.MODEL.MaskDINO.TWO_STAGE = True
|
84 |
+
cfg.MODEL.MaskDINO.INITIALIZE_BOX_TYPE = 'no' # ['no', 'bitmask', 'mask2box']
|
85 |
+
cfg.MODEL.MaskDINO.DN="seg"
|
86 |
+
cfg.MODEL.MaskDINO.DN_NOISE_SCALE=0.4
|
87 |
+
cfg.MODEL.MaskDINO.DN_NUM=100
|
88 |
+
cfg.MODEL.MaskDINO.PRED_CONV=False
|
89 |
+
|
90 |
+
cfg.MODEL.MaskDINO.EVAL_FLAG = 1
|
91 |
+
|
92 |
+
# MSDeformAttn encoder configs
|
93 |
+
cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES = ["res3", "res4", "res5"]
|
94 |
+
cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_POINTS = 4
|
95 |
+
cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_HEADS = 8
|
96 |
+
cfg.MODEL.SEM_SEG_HEAD.DIM_FEEDFORWARD = 2048
|
97 |
+
cfg.MODEL.SEM_SEG_HEAD.NUM_FEATURE_LEVELS = 3
|
98 |
+
cfg.MODEL.SEM_SEG_HEAD.TOTAL_NUM_FEATURE_LEVELS = 4
|
99 |
+
cfg.MODEL.SEM_SEG_HEAD.FEATURE_ORDER = 'high2low' # ['low2high', 'high2low'] high2low: from high level to low level
|
100 |
+
|
101 |
+
#####################
|
102 |
+
|
103 |
+
# MaskDINO inference config
|
104 |
+
cfg.MODEL.MaskDINO.TEST = CN()
|
105 |
+
cfg.MODEL.MaskDINO.TEST.TEST_FOUCUS_ON_BOX = False
|
106 |
+
cfg.MODEL.MaskDINO.TEST.SEMANTIC_ON = True
|
107 |
+
cfg.MODEL.MaskDINO.TEST.INSTANCE_ON = False
|
108 |
+
cfg.MODEL.MaskDINO.TEST.PANOPTIC_ON = False
|
109 |
+
cfg.MODEL.MaskDINO.TEST.OBJECT_MASK_THRESHOLD = 0.0
|
110 |
+
cfg.MODEL.MaskDINO.TEST.OVERLAP_THRESHOLD = 0.0
|
111 |
+
cfg.MODEL.MaskDINO.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False
|
112 |
+
cfg.MODEL.MaskDINO.TEST.PANO_TRANSFORM_EVAL = True
|
113 |
+
cfg.MODEL.MaskDINO.TEST.PANO_TEMPERATURE = 0.06
|
114 |
+
# cfg.MODEL.MaskDINO.TEST.EVAL_FLAG = 1
|
115 |
+
|
116 |
+
# Sometimes `backbone.size_divisibility` is set to 0 for some backbone (e.g. ResNet)
|
117 |
+
# you can use this config to override
|
118 |
+
cfg.MODEL.MaskDINO.SIZE_DIVISIBILITY = 32
|
119 |
+
|
120 |
+
# pixel decoder config
|
121 |
+
cfg.MODEL.SEM_SEG_HEAD.MASK_DIM = 256
|
122 |
+
# adding transformer in pixel decoder
|
123 |
+
cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS = 0
|
124 |
+
# pixel decoder
|
125 |
+
cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME = "MaskDINOEncoder"
|
126 |
+
|
127 |
+
# transformer module
|
128 |
+
cfg.MODEL.MaskDINO.TRANSFORMER_DECODER_NAME = "MaskDINODecoder"
|
129 |
+
|
130 |
+
# LSJ aug
|
131 |
+
cfg.INPUT.IMAGE_SIZE = 1024
|
132 |
+
cfg.INPUT.MIN_SCALE = 0.1
|
133 |
+
cfg.INPUT.MAX_SCALE = 2.0
|
134 |
+
|
135 |
+
# point loss configs
|
136 |
+
# Number of points sampled during training for a mask point head.
|
137 |
+
cfg.MODEL.MaskDINO.TRAIN_NUM_POINTS = 112 * 112
|
138 |
+
# Oversampling parameter for PointRend point sampling during training. Parameter `k` in the
|
139 |
+
# original paper.
|
140 |
+
cfg.MODEL.MaskDINO.OVERSAMPLE_RATIO = 3.0
|
141 |
+
# Importance sampling parameter for PointRend point sampling during training. Parametr `beta` in
|
142 |
+
# the original paper.
|
143 |
+
cfg.MODEL.MaskDINO.IMPORTANCE_SAMPLE_RATIO = 0.75
|
144 |
+
|
145 |
+
|
146 |
+
|
147 |
+
|
148 |
+
cfg.MODEL.DIM_PROJ = 256
|
149 |
+
cfg.MODEL.VISUAL_PROMPT = False
|
150 |
+
cfg.MODEL.TEXT = CN()
|
151 |
+
cfg.MODEL.TEXT.ARCH = 'vlpencoder'
|
152 |
+
cfg.MODEL.TEXT.NAME= 'transformer'
|
153 |
+
cfg.MODEL.TEXT.TOKENIZER= 'clip'
|
154 |
+
cfg.MODEL.TEXT.CONTEXT_LENGTH= 77 # 77
|
155 |
+
cfg.MODEL.TEXT.WIDTH= 512
|
156 |
+
cfg.MODEL.TEXT.HEADS= 8
|
157 |
+
cfg.MODEL.TEXT.LAYERS= 12 # 6
|
158 |
+
cfg.MODEL.TEXT.AUTOGRESSIVE= True
|
159 |
+
|
160 |
+
|
161 |
+
|
162 |
+
cfg.MODEL.LANGUAGE_BACKBONE = CN()
|
163 |
+
cfg.MODEL.LANGUAGE_BACKBONE.USE_CHECKPOINT = False
|
164 |
+
cfg.MODEL.LANGUAGE_BACKBONE.TOKENIZER_TYPE = "bert-base-uncased"
|
165 |
+
cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE = "bert-base-uncased"
|
166 |
+
cfg.MODEL.LANGUAGE_BACKBONE.LANG_DIM = 768
|
167 |
+
cfg.MODEL.LANGUAGE_BACKBONE.MAX_QUERY_LEN = 77 # max length of the tokenized captions.
|
168 |
+
cfg.MODEL.LANGUAGE_BACKBONE.N_LAYERS = 1
|
169 |
+
# cfg.MODEL.LANGUAGE_BACKBONE.UNUSED_TOKEN = 106
|
170 |
+
# cfg.MODEL.LANGUAGE_BACKBONE.MASK_SPECIAL = False
|
171 |
+
cfg.MODEL.LANGUAGE_BACKBONE.PAD_MAX = True
|
172 |
+
|
173 |
+
|
174 |
+
|
175 |
+
|
176 |
+
|
177 |
+
cfg.MODEL.ENCODER = CN()
|
178 |
+
cfg.MODEL.ENCODER.NAME= 'transformer_encoder_fpn'
|
179 |
+
cfg.MODEL.ENCODER.IGNORE_VALUE= 255
|
180 |
+
cfg.MODEL.ENCODER.NUM_CLASSES= 133
|
181 |
+
cfg.MODEL.ENCODER.LOSS_WEIGHT= 1.0
|
182 |
+
cfg.MODEL.ENCODER.CONVS_DIM= 512
|
183 |
+
cfg.MODEL.ENCODER.MASK_DIM= 512
|
184 |
+
cfg.MODEL.ENCODER.NORM= "GN"
|
185 |
+
cfg.MODEL.ENCODER.IN_FEATURES= ["res2", "res3", "res4", "res5"]
|
186 |
+
cfg.MODEL.ENCODER.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES= ["res3", "res4", "res5"]
|
187 |
+
cfg.MODEL.ENCODER.COMMON_STRIDE= 4
|
188 |
+
cfg.MODEL.ENCODER.TRANSFORMER_ENC_LAYERS= 6
|
189 |
+
|
190 |
+
cfg.MODEL.DECODER = CN()
|
191 |
+
cfg.MODEL.DECODER.TRANSFORMER_IN_FEATURE= "multi_scale_pixel_decoder"
|
192 |
+
cfg.MODEL.DECODER.MASK = True
|
193 |
+
# DETECTION= False
|
194 |
+
# SPATIAL=
|
195 |
+
# ENABLED= True
|
196 |
+
# GROUNDING=
|
197 |
+
# ENABLED= False
|
198 |
+
# MAX_LEN= 5
|
199 |
+
# TEXT_WEIGHT= 2.0
|
200 |
+
# CLASS_WEIGHT= 0.5
|
201 |
+
# VISUAL=
|
202 |
+
# ENABLED= False
|
203 |
+
# AUDIO=
|
204 |
+
# ENABLED= False
|
205 |
+
# OPENIMAGE=
|
206 |
+
# ENABLED= False
|
207 |
+
# NEGATIVE_SAMPLES= 5
|
208 |
+
# GROUNDING=
|
209 |
+
# ENABLED= False
|
210 |
+
# MAX_LEN= 5
|
211 |
+
# CAPTION=
|
212 |
+
# ENABLED= False
|
213 |
+
# PHRASE_PROB= 0.5
|
214 |
+
# SIM_THRES= 0.95
|
215 |
+
cfg.MODEL.DECODER.HIDDEN_DIM= 512
|
216 |
+
cfg.MODEL.DECODER.NUM_OBJECT_QUERIES= 101
|
217 |
+
cfg.MODEL.DECODER.NHEADS= 8
|
218 |
+
cfg.MODEL.DECODER.DROPOUT= 0.0
|
219 |
+
cfg.MODEL.DECODER.DIM_FEEDFORWARD= 2048
|
220 |
+
cfg.MODEL.DECODER.MAX_SPATIAL_LEN= [512, 512, 512, 512]
|
221 |
+
cfg.MODEL.DECODER.PRE_NORM= False
|
222 |
+
cfg.MODEL.DECODER.ENFORCE_INPUT_PROJ= False
|
223 |
+
cfg.MODEL.DECODER.SIZE_DIVISIBILITY= 32
|
224 |
+
cfg.MODEL.DECODER.TRAIN_NUM_POINTS= 12544
|
225 |
+
cfg.MODEL.DECODER.OVERSAMPLE_RATIO= 3.0
|
226 |
+
cfg.MODEL.DECODER.IMPORTANCE_SAMPLE_RATIO= 0.75
|
227 |
+
cfg.MODEL.DECODER.DEC_LAYERS= 10 # 9 decoder layers, add one for the loss on learnable query
|
228 |
+
cfg.MODEL.DECODER.TOP_GROUNDING_LAYERS= 10
|
229 |
+
cfg.MODEL.DECODER.TOP_CAPTION_LAYERS= 10
|
230 |
+
cfg.MODEL.DECODER.TOP_SPATIAL_LAYERS= 10
|
231 |
+
cfg.MODEL.DECODER.TOP_OPENIMAGE_LAYERS= 10
|
232 |
+
# TEST=
|
233 |
+
# SEMANTIC_ON= True
|
234 |
+
# INSTANCE_ON= True
|
235 |
+
# PANOPTIC_ON= True
|
236 |
+
# OVERLAP_THRESHOLD= 0.8
|
237 |
+
# OBJECT_MASK_THRESHOLD= 0.4
|
238 |
+
# SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE= false
|
239 |
+
# DETECTIONS_PER_IMAGE= 100
|
240 |
+
|
241 |
+
cfg.ATTENTION_ARCH = CN()
|
242 |
+
# cfg.ATTENTION_ARCH.VARIABLE={
|
243 |
+
# 'queries': ['object'],
|
244 |
+
# 'tokens': ['grounding', 'spatial', 'visual', 'audio']}
|
245 |
+
|
246 |
+
# SELF_ATTENTION:
|
247 |
+
# queries:
|
248 |
+
# object: ['queries_object', 'tokens_grounding', 'tokens_spatial', 'tokens_visual', 'tokens_audio']
|
249 |
+
# tokens:
|
250 |
+
# grounding: ['queries_object', 'tokens_grounding']
|
251 |
+
# spatial: ['tokens_spatial']
|
252 |
+
# visual: ['tokens_visual']
|
253 |
+
# audio: ['queries_object', 'tokens_audio']
|
254 |
+
# CROSS_ATTENTION:
|
255 |
+
# queries:
|
256 |
+
# object: True
|
257 |
+
# tokens:
|
258 |
+
# grounding: False
|
259 |
+
# spatial: False
|
260 |
+
# visual: False
|
261 |
+
# audio: False
|
262 |
+
# MASKING: ['tokens_spatial', 'tokens_grounding', 'tokens_visual', 'tokens_audio']
|
263 |
+
# DUPLICATION:
|
264 |
+
# queries:
|
265 |
+
# grounding: 'queries_object'
|
266 |
+
# spatial: 'queries_object'
|
267 |
+
# SPATIAL_MEMORIES: 32
|
268 |
+
|
269 |
+
|
270 |
+
|
271 |
+
|
272 |
+
|
273 |
+
|
274 |
+
cfg.SOLVER.OPTIMIZER = "ADAMW"
|
275 |
+
cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1
|
276 |
+
cfg.SOLVER.TEXTENCODER_MULTIPLIER = 1.0
|
277 |
+
cfg.SOLVER.LR_DECAY_RATE = None
|
278 |
+
cfg.SOLVER.LR_DECAY_RATE_NUM_LAYERS = None
|
279 |
+
|
280 |
+
|
281 |
+
## support Swin backbone
|
282 |
+
cfg.MODEL.SWIN = CN()
|
283 |
+
cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE = 224
|
284 |
+
cfg.MODEL.SWIN.PATCH_SIZE = 4
|
285 |
+
cfg.MODEL.SWIN.EMBED_DIM = 96
|
286 |
+
cfg.MODEL.SWIN.DEPTHS = [2, 2, 6, 2]
|
287 |
+
cfg.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24]
|
288 |
+
cfg.MODEL.SWIN.WINDOW_SIZE = 7
|
289 |
+
cfg.MODEL.SWIN.MLP_RATIO = 4.0
|
290 |
+
cfg.MODEL.SWIN.QKV_BIAS = True
|
291 |
+
cfg.MODEL.SWIN.QK_SCALE = None
|
292 |
+
cfg.MODEL.SWIN.DROP_RATE = 0.0
|
293 |
+
cfg.MODEL.SWIN.ATTN_DROP_RATE = 0.0
|
294 |
+
cfg.MODEL.SWIN.DROP_PATH_RATE = 0.3
|
295 |
+
cfg.MODEL.SWIN.APE = False
|
296 |
+
cfg.MODEL.SWIN.PATCH_NORM = True
|
297 |
+
cfg.MODEL.SWIN.OUT_FEATURES = ["res2", "res3", "res4", "res5"]
|
298 |
+
cfg.MODEL.SWIN.USE_CHECKPOINT = False
|
299 |
+
cfg.MODEL.SWIN.PRETRAINED_WEIGHT = None
|
300 |
+
|
301 |
+
|
302 |
+
# support InternImage backbone
|
303 |
+
cfg.MODEL.INTERNIMAGE = CN() # large as base
|
304 |
+
|
305 |
+
#### large
|
306 |
+
cfg.MODEL.INTERNIMAGE.PRETRAINED_WEIGHT = None
|
307 |
+
cfg.MODEL.INTERNIMAGE.CORE_OP = "DCNv3"
|
308 |
+
cfg.MODEL.INTERNIMAGE.CHANNELS = 160
|
309 |
+
cfg.MODEL.INTERNIMAGE.DEPTHS = [5, 5, 22, 5]
|
310 |
+
cfg.MODEL.INTERNIMAGE.GROUPS =[10, 20, 40, 80]
|
311 |
+
cfg.MODEL.INTERNIMAGE.MLP_RATIO =4.
|
312 |
+
cfg.MODEL.INTERNIMAGE.DROP_PATH_RATE =0.0
|
313 |
+
cfg.MODEL.INTERNIMAGE.NORM_LAYER = "LN"
|
314 |
+
cfg.MODEL.INTERNIMAGE.LAYER_SCALE = 1.0
|
315 |
+
cfg.MODEL.INTERNIMAGE.OFFSET_SCALE = 2.0
|
316 |
+
cfg.MODEL.INTERNIMAGE.POST_NORM = True
|
317 |
+
cfg.MODEL.INTERNIMAGE.WITH_CP = False
|
318 |
+
cfg.MODEL.INTERNIMAGE.OUT_IINDICES = (0, 1, 2, 3)
|
319 |
+
cfg.MODEL.INTERNIMAGE.DW_KERNEL_SIZE = None
|
320 |
+
cfg.MODEL.INTERNIMAGE.RES_POST_NORM = False
|
321 |
+
cfg.MODEL.INTERNIMAGE.LEVEL2_POST_NORM = False
|
322 |
+
cfg.MODEL.INTERNIMAGE.LEVEL2_POST_NORM_BLOCK_IDS = None
|
323 |
+
cfg.MODEL.INTERNIMAGE.CENTER_FEATURE_SCALE = False
|
324 |
+
|
325 |
+
### huge
|
326 |
+
# cfg.MODEL.INTERNIMAGE.PRETRAINED_WEIGHT = None
|
327 |
+
# cfg.MODEL.INTERNIMAGE.CORE_OP = "DCNv3"
|
328 |
+
# cfg.MODEL.INTERNIMAGE.CHANNELS = 320
|
329 |
+
# cfg.MODEL.INTERNIMAGE.DEPTHS = [6, 6, 32, 6]
|
330 |
+
# cfg.MODEL.INTERNIMAGE.GROUPS = [10, 20, 40, 80]
|
331 |
+
# cfg.MODEL.INTERNIMAGE.MLP_RATIO =4.
|
332 |
+
# cfg.MODEL.INTERNIMAGE.DROP_PATH_RATE = 0.5
|
333 |
+
# cfg.MODEL.INTERNIMAGE.NORM_LAYER = "LN"
|
334 |
+
# cfg.MODEL.INTERNIMAGE.LAYER_SCALE = None
|
335 |
+
# cfg.MODEL.INTERNIMAGE.OFFSET_SCALE = 1.0
|
336 |
+
# cfg.MODEL.INTERNIMAGE.POST_NORM = False
|
337 |
+
# cfg.MODEL.INTERNIMAGE.WITH_CP = False
|
338 |
+
# cfg.MODEL.INTERNIMAGE.OUT_IINDICES = (0, 1, 2, 3)
|
339 |
+
|
340 |
+
# cfg.MODEL.INTERNIMAGE.DW_KERNEL_SIZE = 5
|
341 |
+
# cfg.MODEL.INTERNIMAGE.RES_POST_NORM = True
|
342 |
+
# cfg.MODEL.INTERNIMAGE.LEVEL2_POST_NORM = True
|
343 |
+
# cfg.MODEL.INTERNIMAGE.LEVEL2_POST_NORM_BLOCK_IDS = [5, 11, 17, 23, 29]
|
344 |
+
# cfg.MODEL.INTERNIMAGE.CENTER_FEATURE_SCALE = True
|
345 |
+
|
346 |
+
|
347 |
+
# support EVA02 backbone
|
348 |
+
cfg.MODEL.EVA02 = CN() # large as base
|
349 |
+
|
350 |
+
#### large
|
351 |
+
cfg.MODEL.EVA02.PRETRAINED_WEIGHT = None
|
352 |
+
cfg.MODEL.EVA02.IMAGE_SIZE = 1536
|
353 |
+
cfg.MODEL.EVA02.PATCH_SIZE = 16
|
354 |
+
cfg.MODEL.EVA02.WINDOW_SIZE = 16
|
355 |
+
cfg.MODEL.EVA02.DMBED_DIM =1024
|
356 |
+
cfg.MODEL.EVA02.DEPTH = 24
|
357 |
+
cfg.MODEL.EVA02.NUM_HEADS = 16
|
358 |
+
cfg.MODEL.EVA02.MLP_RATIO = 4*2/3
|
359 |
+
cfg.MODEL.EVA02.DROP_PATH_RATE = 0.3
|
360 |
+
cfg.MODEL.EVA02.CHECKPOINT = True
|
361 |
+
cfg.MODEL.EVA02.WINDOW_BLOCK_INDEXES = [0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15, 16, 18, 19, 21, 22]
|
362 |
+
|
363 |
+
|
364 |
+
|
365 |
+
# support EVA01 backbone
|
366 |
+
cfg.MODEL.EVA01 = CN() # large as base
|
367 |
+
|
368 |
+
#### large
|
369 |
+
cfg.MODEL.EVA01.PRETRAINED_WEIGHT = None
|
370 |
+
|
371 |
+
cfg.MODEL.EVA01.BEIT_LIKE_QKV_BIAS = True
|
372 |
+
cfg.MODEL.EVA01.BEIT_LIKE_GAMMA = False
|
373 |
+
cfg.MODEL.EVA01.FREEZE_PATH_EMBED = True
|
374 |
+
|
375 |
+
cfg.MODEL.EVA01.IMAGE_SIZE = 1280 # only for correct dim in pos embed
|
376 |
+
cfg.MODEL.EVA01.PATCH_SIZE = 16
|
377 |
+
cfg.MODEL.EVA01.WINDOW_SIZE = 16
|
378 |
+
cfg.MODEL.EVA01.DMBED_DIM = 1408
|
379 |
+
cfg.MODEL.EVA01.DEPTH = 40
|
380 |
+
cfg.MODEL.EVA01.NUM_HEADS = 16
|
381 |
+
cfg.MODEL.EVA01.MLP_RATIO = 6144 / 1408
|
382 |
+
cfg.MODEL.EVA01.DROP_PATH_RATE = 0.6
|
383 |
+
cfg.MODEL.EVA01.WINDOW_BLOCK_INDEXES = [0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20, 21, 22, 24, 25, 26, 28, 29, 30, 32, 33, 34, 36, 37, 38]
|
384 |
+
|
385 |
+
|
386 |
+
|
387 |
+
|
GLEE/glee/config_deeplab.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
3 |
+
|
4 |
+
|
5 |
+
def add_deeplab_config(cfg):
|
6 |
+
"""
|
7 |
+
Add config for DeepLab.
|
8 |
+
"""
|
9 |
+
# We retry random cropping until no single category in semantic segmentation GT occupies more
|
10 |
+
# than `SINGLE_CATEGORY_MAX_AREA` part of the crop.
|
11 |
+
cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0
|
12 |
+
# Used for `poly` learning rate schedule.
|
13 |
+
cfg.SOLVER.POLY_LR_POWER = 0.9
|
14 |
+
cfg.SOLVER.POLY_LR_CONSTANT_ENDING = 0.0
|
15 |
+
# Loss type, choose from `cross_entropy`, `hard_pixel_mining`.
|
16 |
+
cfg.MODEL.SEM_SEG_HEAD.LOSS_TYPE = "hard_pixel_mining"
|
17 |
+
# DeepLab settings
|
18 |
+
cfg.MODEL.SEM_SEG_HEAD.PROJECT_FEATURES = ["res2"]
|
19 |
+
cfg.MODEL.SEM_SEG_HEAD.PROJECT_CHANNELS = [48]
|
20 |
+
cfg.MODEL.SEM_SEG_HEAD.ASPP_CHANNELS = 256
|
21 |
+
cfg.MODEL.SEM_SEG_HEAD.ASPP_DILATIONS = [6, 12, 18]
|
22 |
+
cfg.MODEL.SEM_SEG_HEAD.ASPP_DROPOUT = 0.1
|
23 |
+
cfg.MODEL.SEM_SEG_HEAD.USE_DEPTHWISE_SEPARABLE_CONV = False
|
24 |
+
# Backbone new configs
|
25 |
+
cfg.MODEL.RESNETS.RES4_DILATION = 1
|
26 |
+
cfg.MODEL.RESNETS.RES5_MULTI_GRID = [1, 2, 4]
|
27 |
+
# ResNet stem type from: `basic`, `deeplab`
|
28 |
+
cfg.MODEL.RESNETS.STEM_TYPE = "deeplab"
|
GLEE/glee/models/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
GLEE/glee/models/glee_model.py
ADDED
@@ -0,0 +1,296 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
|
2 |
+
"""
|
3 |
+
"""
|
4 |
+
import torch
|
5 |
+
import torch.nn.functional as F
|
6 |
+
from torch import nn
|
7 |
+
# from ..backbone import build_backbone, Backbone
|
8 |
+
# from ..body.encoder import build_encoder
|
9 |
+
# from ..body.decoder import build_decoder
|
10 |
+
|
11 |
+
from detectron2.modeling import build_backbone
|
12 |
+
|
13 |
+
from .pixel_decoder.maskdino_encoder import build_pixel_decoder
|
14 |
+
from .transformer_decoder.maskdino_decoder import build_transformer_decoder
|
15 |
+
|
16 |
+
import random
|
17 |
+
from transformers import AutoTokenizer
|
18 |
+
from collections import OrderedDict
|
19 |
+
from ..modules.point_features import point_sample
|
20 |
+
from timm.models.layers import trunc_normal_
|
21 |
+
from transformers import CLIPTokenizer,CLIPTextModel
|
22 |
+
from .vos_utils import masks_to_boxes, FeatureFuser
|
23 |
+
import numpy as np
|
24 |
+
import math
|
25 |
+
|
26 |
+
|
27 |
+
def rand_sample(x, max_len):
|
28 |
+
if x.shape[1] <= max_len:
|
29 |
+
return x
|
30 |
+
else:
|
31 |
+
rand_idx = torch.randperm(x.shape[1])[:max_len]
|
32 |
+
return x[:,rand_idx]
|
33 |
+
|
34 |
+
|
35 |
+
def agg_lang_feat(features, mask, pool_type="average"):
|
36 |
+
"""average pooling of language features"""
|
37 |
+
# feat: (bs, seq_len, C)
|
38 |
+
# mask: (bs, seq_len)
|
39 |
+
if pool_type == "average":
|
40 |
+
embedded = features * mask.unsqueeze(-1).float() # use mask to zero out invalid token features
|
41 |
+
aggregate = embedded.sum(1) / (mask.sum(-1).unsqueeze(-1).float())
|
42 |
+
elif pool_type == "max":
|
43 |
+
out = []
|
44 |
+
for i in range(len(features)):
|
45 |
+
pool_feat, _ = torch.max(features[i][mask[i]], 0) # (L, C) -> (C, )
|
46 |
+
out.append(pool_feat)
|
47 |
+
aggregate = torch.stack(out, dim=0) # (bs, C)
|
48 |
+
else:
|
49 |
+
raise ValueError("pool_type should be average or max")
|
50 |
+
return aggregate
|
51 |
+
|
52 |
+
class GLEE_Model(nn.Module):
|
53 |
+
"""
|
54 |
+
Main class for mask classification semantic segmentation architectures.
|
55 |
+
"""
|
56 |
+
def __init__(self, cfg, matcher, device, video_info, contras_mean):
|
57 |
+
super().__init__()
|
58 |
+
self.cfg = cfg
|
59 |
+
self.matcher = matcher
|
60 |
+
self.backbone = build_backbone(cfg)
|
61 |
+
output_channels = [v for k,v in self.backbone._out_feature_channels.items()]
|
62 |
+
self.sot_fuser = FeatureFuser(output_channels[-3:], 256)
|
63 |
+
|
64 |
+
|
65 |
+
self.tokenizer = CLIPTokenizer.from_pretrained('GLEE/clip_vit_base_patch32')
|
66 |
+
self.tokenizer.add_special_tokens({'cls_token': self.tokenizer.eos_token})
|
67 |
+
self.text_encoder = CLIPTextModel.from_pretrained('GLEE/clip_vit_base_patch32')
|
68 |
+
# self.text_encoder_teacher = CLIPTextModel.from_pretrained('GLEE/clip_vit_base_patch32')
|
69 |
+
self.lang_encoder = None
|
70 |
+
# for p in self.text_encoder_teacher.parameters():
|
71 |
+
# p.requires_grad = False
|
72 |
+
self.lang_projection = nn.Parameter(torch.rand(cfg.MODEL.LANGUAGE_BACKBONE.LANG_DIM, cfg.MODEL.DIM_PROJ))
|
73 |
+
self.text_encode_type = 'clip_teacher'
|
74 |
+
|
75 |
+
# self.lang_encoder = None
|
76 |
+
self.pixel_decoder = build_pixel_decoder(cfg, self.backbone.output_shape())
|
77 |
+
transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
|
78 |
+
self.predictor = build_transformer_decoder(cfg, transformer_predictor_in_channels, lang_encoder = self.lang_encoder, mask_classification=True,)
|
79 |
+
self.to(device)
|
80 |
+
|
81 |
+
self.video_info = video_info
|
82 |
+
self.contras_mean = contras_mean
|
83 |
+
|
84 |
+
self.track_loss_version = cfg.MODEL.TRACK_VERSION
|
85 |
+
|
86 |
+
self.no_mask_tasks = ['obj365', 'obj365_clip','openimage', 'openimage_clip', 'vg', 'grit', 'bdd_det', 'bdd_track_box']
|
87 |
+
|
88 |
+
|
89 |
+
# for visual prompt
|
90 |
+
hidden_dim = 256
|
91 |
+
self.max_spatial_len = [512,512,512,512]
|
92 |
+
self.mask_sptial_embed = nn.ParameterList([nn.Parameter(torch.empty(hidden_dim, hidden_dim)) for x in range(4)])
|
93 |
+
trunc_normal_(self.mask_sptial_embed[0], std=.02)
|
94 |
+
trunc_normal_(self.mask_sptial_embed[1], std=.02)
|
95 |
+
trunc_normal_(self.mask_sptial_embed[2], std=.02)
|
96 |
+
trunc_normal_(self.mask_sptial_embed[3], std=.02)
|
97 |
+
# learnable positive negative indicator
|
98 |
+
self.pn_indicator = nn.Embedding(2, hidden_dim)
|
99 |
+
|
100 |
+
@property
|
101 |
+
def device(self):
|
102 |
+
return self.pixel_mean.device
|
103 |
+
|
104 |
+
def forward(self, images, prompts, task, targets=None, batch_name_list=None, is_train = True, visual_prompt_type='scribble'):
|
105 |
+
extra = {}
|
106 |
+
# dist_loss = None
|
107 |
+
early_semantic = None
|
108 |
+
|
109 |
+
if self.text_encode_type == "clip_teacher":
|
110 |
+
if task not in ['grounding','rvos']:
|
111 |
+
assert batch_name_list
|
112 |
+
calsses_name_list = batch_name_list
|
113 |
+
tokenized = self.tokenizer.batch_encode_plus(calsses_name_list,
|
114 |
+
max_length=self.cfg.MODEL.LANGUAGE_BACKBONE.MAX_QUERY_LEN, # 256
|
115 |
+
padding='max_length' if self.cfg.MODEL.LANGUAGE_BACKBONE.PAD_MAX else "longest", # max_length
|
116 |
+
return_special_tokens_mask=True,
|
117 |
+
return_tensors='pt',
|
118 |
+
truncation=True).to(images.device)
|
119 |
+
texts = (tokenized['input_ids'], tokenized['attention_mask'])
|
120 |
+
token_x = self.text_encoder(*texts)['last_hidden_state']
|
121 |
+
|
122 |
+
valid_mask = tokenized['attention_mask'].bool()
|
123 |
+
# token_x_teacher = self.text_encoder_teacher(*texts)['last_hidden_state']
|
124 |
+
# if is_train:
|
125 |
+
# dist_loss = F.mse_loss(token_x[valid_mask], token_x_teacher[valid_mask] )
|
126 |
+
# F.l2_loss(token_x[valid_mask], token_x_teacher[valid_mask] )
|
127 |
+
token_x = token_x @ self.lang_projection
|
128 |
+
lang_feat_pool = agg_lang_feat(token_x, tokenized['attention_mask'], pool_type="average") # (bs, 768)
|
129 |
+
extra['class_embeddings'] = lang_feat_pool
|
130 |
+
if True: # early_fusion
|
131 |
+
gather_all_classtoken = token_x.flatten(0,1)[tokenized['attention_mask'].flatten(0,1)>0]
|
132 |
+
gather_all_classtoken = gather_all_classtoken.unsqueeze(0).repeat(len(images),1,1) #[bs,L,C]
|
133 |
+
gather_all_classtoken_mask = torch.ones_like(gather_all_classtoken[:,:,0])>0 #[bs,L]
|
134 |
+
early_semantic = {"hidden":gather_all_classtoken.float(),"masks":gather_all_classtoken_mask}
|
135 |
+
|
136 |
+
|
137 |
+
if 'grounding' in prompts:
|
138 |
+
|
139 |
+
if self.text_encode_type == 'clip_frozen' or self.text_encode_type == 'clip_teacher':
|
140 |
+
|
141 |
+
tokens = self.tokenizer(
|
142 |
+
prompts['grounding'], padding='max_length', truncation=True, max_length=self.cfg.MODEL.LANGUAGE_BACKBONE.MAX_QUERY_LEN, return_tensors='pt'
|
143 |
+
)
|
144 |
+
tokens = {key: value.to(images.device) for key, value in tokens.items()}
|
145 |
+
|
146 |
+
texts = (tokens['input_ids'], tokens['attention_mask'])
|
147 |
+
x = self.text_encoder(*texts)
|
148 |
+
token_x = x['last_hidden_state']
|
149 |
+
token_x = token_x @ self.lang_projection
|
150 |
+
|
151 |
+
extra['grounding_tokens'] = token_x.permute(1,0,2) #[len,bz,C]
|
152 |
+
|
153 |
+
non_zero_query_mask = tokens['attention_mask']
|
154 |
+
lang_feat_pool = agg_lang_feat(token_x, non_zero_query_mask, pool_type="average").unsqueeze(1) # (bs, 1, 768)
|
155 |
+
|
156 |
+
dist_loss = (lang_feat_pool*0).sum()
|
157 |
+
|
158 |
+
extra['grounding_nonzero_mask'] = ~non_zero_query_mask.bool() # [bz,len]
|
159 |
+
extra['grounding_class'] = lang_feat_pool.squeeze(1) #[bz,C
|
160 |
+
# gather_all_classtoken = token_x.flatten(0,1)[tokenized['attention_mask'].flatten(0,1)>0]
|
161 |
+
# gather_all_classtoken = gather_all_classtoken.unsqueeze(0).repeat(len(images),1,1) #[bs,L,C]
|
162 |
+
# gather_all_classtoken_mask = torch.ones_like(gather_all_classtoken[:,:,0])>0 #[bs,L]
|
163 |
+
# early_semantic = {"hidden":gather_all_classtoken.float(),"masks":gather_all_classtoken_mask}
|
164 |
+
early_semantic = {"hidden":token_x.float(),"masks":tokens['attention_mask']>0}
|
165 |
+
|
166 |
+
|
167 |
+
if isinstance(images,torch.Tensor):
|
168 |
+
features = self.backbone(images)
|
169 |
+
else:
|
170 |
+
features = self.backbone(images.tensor)
|
171 |
+
|
172 |
+
|
173 |
+
|
174 |
+
|
175 |
+
if 'spatial' in prompts:
|
176 |
+
## setp 1,2,3
|
177 |
+
key_images = [ images ] #bz*[1,3,H,W]
|
178 |
+
key_promptmasks = [m.unsqueeze(0) for m in prompts['spatial']] #bz*[1,1,H,W]
|
179 |
+
|
180 |
+
prompt_mode = visual_prompt_type
|
181 |
+
ref_feats, ref_masks = self.get_template(key_images, key_promptmasks, prompt_mode)
|
182 |
+
early_fusion = {"hidden":ref_feats,"masks":ref_masks}
|
183 |
+
if early_semantic is None:
|
184 |
+
early_semantic = early_fusion
|
185 |
+
else:
|
186 |
+
early_semantic["hidden"] = torch.cat([early_semantic["hidden"],early_fusion["hidden"]],dim=1)
|
187 |
+
early_semantic["masks"] = torch.cat([early_semantic["masks"],early_fusion["masks"]],dim=1)
|
188 |
+
|
189 |
+
|
190 |
+
# bz = len(images)//2
|
191 |
+
mask_features, _, multi_scale_features, zero_loss = self.pixel_decoder.forward_features(features, masks=None, early_fusion = early_semantic)
|
192 |
+
if 'spatial' in prompts:
|
193 |
+
pos_masks = prompts['spatial']
|
194 |
+
# neg_masks = [~p for p in prompts['spatial']]
|
195 |
+
neg_masks = [p&False for p in prompts['spatial']]
|
196 |
+
|
197 |
+
extra.update({'spatial_query_pos_mask': pos_masks, 'spatial_query_neg_mask': neg_masks})
|
198 |
+
|
199 |
+
|
200 |
+
_,h,w = extra['spatial_query_pos_mask'][0].shape
|
201 |
+
divisor = torch.tensor([h,w], device=mask_features.device)[None,]
|
202 |
+
# Get mean pos spatial query
|
203 |
+
non_zero_pos_point = [rand_sample((m.nonzero()[:,1:]/divisor).t(), self.max_spatial_len[-1]).t() for m in extra['spatial_query_pos_mask']]
|
204 |
+
non_zero_pos_point = nn.utils.rnn.pad_sequence(non_zero_pos_point, padding_value=-1).permute(1,0,2)
|
205 |
+
non_zero_pos_mask = (non_zero_pos_point.sum(dim=-1) < 0)
|
206 |
+
spatial_query_pos = point_sample(mask_features, non_zero_pos_point.flip(dims=(2,)).type(mask_features.dtype), align_corners=True) #[(N, C, P)
|
207 |
+
spatial_query_pos = torch.stack([x[m].mean(dim=0, keepdim=True) for x, m in zip(spatial_query_pos.transpose(1,2), ~non_zero_pos_mask)]).transpose(0,1).nan_to_num() # [1,bz,C]
|
208 |
+
# Get mean neg spatial query
|
209 |
+
non_zero_neg_point = [rand_sample((m.nonzero()[:,1:]/divisor).t(), self.max_spatial_len[-1]).t() for m in extra['spatial_query_neg_mask']]
|
210 |
+
non_zero_neg_point = nn.utils.rnn.pad_sequence(non_zero_neg_point, padding_value=-1).permute(1,0,2)
|
211 |
+
non_zero_neg_mask = (non_zero_neg_point.sum(dim=-1) < 0)
|
212 |
+
spatial_query_neg = point_sample(mask_features, non_zero_neg_point.flip(dims=(2,)).type(mask_features.dtype), align_corners=True)
|
213 |
+
spatial_query_neg = torch.stack([x[m].mean(dim=0, keepdim=True) for x, m in zip(spatial_query_neg.transpose(1,2), ~non_zero_neg_mask)]).transpose(0,1).nan_to_num()
|
214 |
+
|
215 |
+
# Get layerwise spatial query
|
216 |
+
src_spatial_queries = []
|
217 |
+
src_spatial_maskings = []
|
218 |
+
for i in range(len(multi_scale_features)):
|
219 |
+
bs,dc,h,w = multi_scale_features[i].shape
|
220 |
+
# src_mask_features = multi_scale_features[i].view(h,w,bs,dc)
|
221 |
+
src_mask_features = multi_scale_features[i].permute(2,3,0,1)
|
222 |
+
src_mask_features = src_mask_features @ self.mask_sptial_embed[i]
|
223 |
+
|
224 |
+
non_zero_query_point_pos = [rand_sample((m.nonzero()[:,1:]/divisor).t(), self.max_spatial_len[i]).t() for m in extra['spatial_query_pos_mask']]
|
225 |
+
non_zero_query_point_neg = [rand_sample((m.nonzero()[:,1:]/divisor).t(), self.max_spatial_len[i]).t() for m in extra['spatial_query_neg_mask']]
|
226 |
+
non_zero_query_point = [torch.cat([x,y], dim=0) for x,y in zip(non_zero_query_point_pos, non_zero_query_point_neg)]
|
227 |
+
pos_neg_indicator = [torch.cat([torch.ones(x.shape[0], device=x.device), -torch.ones(y.shape[0], device=y.device)]) for x,y in zip(non_zero_query_point_pos, non_zero_query_point_neg)]
|
228 |
+
pos_neg_indicator = nn.utils.rnn.pad_sequence(pos_neg_indicator, padding_value=0)
|
229 |
+
non_zero_query_point = nn.utils.rnn.pad_sequence(non_zero_query_point, padding_value=-1).permute(1,0,2)
|
230 |
+
non_zero_query_mask = (non_zero_query_point.sum(dim=-1) < 0)
|
231 |
+
non_zero_query_point[non_zero_query_mask] = 0
|
232 |
+
|
233 |
+
spatial_tokens = point_sample(src_mask_features.permute(2,3,0,1), non_zero_query_point.flip(dims=(2,)).type(src_mask_features.dtype), align_corners=True).permute(2,0,1)
|
234 |
+
spatial_tokens[pos_neg_indicator==1] += self.pn_indicator.weight[0:1]
|
235 |
+
spatial_tokens[pos_neg_indicator==-1] += self.pn_indicator.weight[1:2]
|
236 |
+
|
237 |
+
src_spatial_queries += [spatial_tokens]
|
238 |
+
src_spatial_maskings += [non_zero_query_mask]
|
239 |
+
|
240 |
+
extra['visual_prompt_tokens'] = src_spatial_queries #[len,bz,C]
|
241 |
+
extra['visual_prompt_nonzero_mask'] = src_spatial_maskings # [bz,len]
|
242 |
+
|
243 |
+
|
244 |
+
outputs = self.predictor(multi_scale_features, mask_features, extra=extra, task=task, masks=None, targets=targets)
|
245 |
+
return outputs
|
246 |
+
|
247 |
+
|
248 |
+
|
249 |
+
|
250 |
+
|
251 |
+
|
252 |
+
def get_template(self, imgs, pad_masks, prompt_mode='scribble'):
|
253 |
+
"""img: (N, 3, H, W), mask: (N, 1, H, W), bbox: (1, 4)"""
|
254 |
+
"""get 4-channel template"""
|
255 |
+
|
256 |
+
croped_img_with_mask = []
|
257 |
+
|
258 |
+
for image_i, mask_i in zip( imgs, pad_masks):
|
259 |
+
|
260 |
+
if prompt_mode in ['scribble','point']:
|
261 |
+
image_with_mask = image_i + mask_i.to(image_i)
|
262 |
+
else:
|
263 |
+
image_with_mask = image_i
|
264 |
+
|
265 |
+
# image_with_mask = torch.cat([image_i,mask_i.to(image_i)],dim=1) #[1,3,H,W]
|
266 |
+
box_i = masks_to_boxes(mask_i[0]) #[xyxy]
|
267 |
+
box_i[:, 2:] = box_i[:, 2:] - box_i[:, :2] #xywh
|
268 |
+
|
269 |
+
|
270 |
+
x, y, w, h = box_i[0].long().tolist()
|
271 |
+
|
272 |
+
self.search_area_factor=2
|
273 |
+
|
274 |
+
crop_sz = math.ceil(math.sqrt(w * h) * self.search_area_factor)
|
275 |
+
x1 = max(0,round(x + 0.5 * w - crop_sz * 0.5))
|
276 |
+
x2 = x1 + crop_sz
|
277 |
+
y1 = max(0,round(y + 0.5 * h - crop_sz * 0.5))
|
278 |
+
y2 = y1 + crop_sz
|
279 |
+
|
280 |
+
im_crop = image_with_mask[:, :, y1:y2, x1:x2]
|
281 |
+
# resize
|
282 |
+
if im_crop.shape[-1] ==0 or im_crop.shape[-2] ==0 :
|
283 |
+
im_crop = image_with_mask
|
284 |
+
im_crop = F.interpolate(im_crop, (256,256), mode='bilinear', align_corners=False)
|
285 |
+
croped_img_with_mask.append(im_crop)
|
286 |
+
croped_img_with_mask = torch.cat(croped_img_with_mask,dim=0) #[bz,3,256,256]
|
287 |
+
with torch.no_grad():
|
288 |
+
ref_srcs = self.backbone(croped_img_with_mask.contiguous())
|
289 |
+
ref_srcs = [v for k,v in ref_srcs.items()]
|
290 |
+
ref_feats = self.sot_fuser(ref_srcs[1:]).float() #[bz,256,32,32]
|
291 |
+
|
292 |
+
ref_feats = ref_feats.flatten(-2).permute(0, 2, 1) # (bs, L, C)
|
293 |
+
ref_masks = torch.ones_like(ref_feats[:,:,0])>0 #[bs,L]
|
294 |
+
|
295 |
+
return ref_feats, ref_masks
|
296 |
+
|
GLEE/glee/models/pixel_decoder/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
# Copyright (c) IDEA, Inc. and its affiliates.
|
GLEE/glee/models/pixel_decoder/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (184 Bytes). View file
|
|
GLEE/glee/models/pixel_decoder/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (154 Bytes). View file
|
|
GLEE/glee/models/pixel_decoder/__pycache__/early_fusion.cpython-38.pyc
ADDED
Binary file (6.36 kB). View file
|
|
GLEE/glee/models/pixel_decoder/__pycache__/early_fusion.cpython-39.pyc
ADDED
Binary file (6.3 kB). View file
|
|
GLEE/glee/models/pixel_decoder/__pycache__/maskdino_encoder.cpython-38.pyc
ADDED
Binary file (15.4 kB). View file
|
|
GLEE/glee/models/pixel_decoder/__pycache__/maskdino_encoder.cpython-39.pyc
ADDED
Binary file (15.3 kB). View file
|
|
GLEE/glee/models/pixel_decoder/__pycache__/position_encoding.cpython-38.pyc
ADDED
Binary file (2.64 kB). View file
|
|
GLEE/glee/models/pixel_decoder/__pycache__/position_encoding.cpython-39.pyc
ADDED
Binary file (2.6 kB). View file
|
|
GLEE/glee/models/pixel_decoder/early_fusion.py
ADDED
@@ -0,0 +1,230 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn.functional as F
|
3 |
+
from torch import nn
|
4 |
+
from timm.models.layers import DropPath
|
5 |
+
|
6 |
+
|
7 |
+
|
8 |
+
|
9 |
+
class VLFuse(torch.nn.Module):
|
10 |
+
"""
|
11 |
+
Early Fusion Module
|
12 |
+
"""
|
13 |
+
|
14 |
+
def __init__(self, ):
|
15 |
+
super(VLFuse, self).__init__()
|
16 |
+
self.init_configs()
|
17 |
+
|
18 |
+
# early fusion module
|
19 |
+
# bi-direction (text->image, image->text)
|
20 |
+
self.b_attn = BiAttentionBlockForCheckpoint(v_dim=self.img_dim, # 256
|
21 |
+
l_dim=self.lang_dim, # 768
|
22 |
+
embed_dim=self.embed_dim, # 2048
|
23 |
+
num_heads=self.n_head, # 8
|
24 |
+
dropout=0.1,
|
25 |
+
drop_path=.0,
|
26 |
+
init_values=1.0 / 6,
|
27 |
+
)
|
28 |
+
def init_configs(self, ):
|
29 |
+
# common params
|
30 |
+
self.img_dim = 256
|
31 |
+
|
32 |
+
self.max_query_len = 256
|
33 |
+
self.n_layers =1
|
34 |
+
|
35 |
+
# mha params
|
36 |
+
self.n_head = 8
|
37 |
+
self.embed_dim = 2048 # 2048 by default
|
38 |
+
|
39 |
+
self.lang_dim = 256
|
40 |
+
|
41 |
+
def forward(self, x, task=None):
|
42 |
+
visual_features = x["visual"]
|
43 |
+
language_dict_features = x["lang"]
|
44 |
+
|
45 |
+
fused_visual_features, language_features = self.b_attn(
|
46 |
+
visual_features, language_dict_features['hidden'], language_dict_features['masks'], task)
|
47 |
+
|
48 |
+
language_dict_features['hidden'] = language_features
|
49 |
+
fused_language_dict_features = language_dict_features
|
50 |
+
|
51 |
+
features_dict = {"visual": fused_visual_features,
|
52 |
+
"lang": fused_language_dict_features}
|
53 |
+
|
54 |
+
return features_dict
|
55 |
+
|
56 |
+
|
57 |
+
class BiMultiHeadAttention(nn.Module):
|
58 |
+
def __init__(self, v_dim, l_dim, embed_dim, num_heads, dropout=0.1):
|
59 |
+
super(BiMultiHeadAttention, self).__init__()
|
60 |
+
|
61 |
+
self.embed_dim = embed_dim
|
62 |
+
self.num_heads = num_heads
|
63 |
+
self.head_dim = embed_dim // num_heads
|
64 |
+
self.v_dim = v_dim
|
65 |
+
self.l_dim = l_dim
|
66 |
+
|
67 |
+
assert (
|
68 |
+
self.head_dim * self.num_heads == self.embed_dim
|
69 |
+
), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})."
|
70 |
+
self.scale = self.head_dim ** (-0.5)
|
71 |
+
self.dropout = dropout
|
72 |
+
|
73 |
+
self.v_proj = nn.Linear(self.v_dim, self.embed_dim)
|
74 |
+
self.l_proj = nn.Linear(self.l_dim, self.embed_dim)
|
75 |
+
self.values_v_proj = nn.Linear(self.v_dim, self.embed_dim)
|
76 |
+
self.values_l_proj = nn.Linear(self.l_dim, self.embed_dim)
|
77 |
+
|
78 |
+
self.out_v_proj = nn.Linear(self.embed_dim, self.v_dim)
|
79 |
+
self.out_l_proj = nn.Linear(self.embed_dim, self.l_dim)
|
80 |
+
|
81 |
+
self.stable_softmax_2d = False
|
82 |
+
self.clamp_min_for_underflow = True
|
83 |
+
self.clamp_max_for_overflow = True
|
84 |
+
|
85 |
+
self._reset_parameters()
|
86 |
+
|
87 |
+
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
|
88 |
+
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
|
89 |
+
|
90 |
+
def _reset_parameters(self):
|
91 |
+
nn.init.xavier_uniform_(self.v_proj.weight)
|
92 |
+
self.v_proj.bias.data.fill_(0)
|
93 |
+
nn.init.xavier_uniform_(self.l_proj.weight)
|
94 |
+
self.l_proj.bias.data.fill_(0)
|
95 |
+
nn.init.xavier_uniform_(self.values_v_proj.weight)
|
96 |
+
self.values_v_proj.bias.data.fill_(0)
|
97 |
+
nn.init.xavier_uniform_(self.values_l_proj.weight)
|
98 |
+
self.values_l_proj.bias.data.fill_(0)
|
99 |
+
nn.init.xavier_uniform_(self.out_v_proj.weight)
|
100 |
+
self.out_v_proj.bias.data.fill_(0)
|
101 |
+
nn.init.xavier_uniform_(self.out_l_proj.weight)
|
102 |
+
self.out_l_proj.bias.data.fill_(0)
|
103 |
+
|
104 |
+
def forward(self, v, l, attention_mask_l=None):
|
105 |
+
bsz, tgt_len, embed_dim = v.size()
|
106 |
+
|
107 |
+
query_states = self.v_proj(v) * self.scale
|
108 |
+
key_states = self._shape(self.l_proj(l), -1, bsz)
|
109 |
+
value_v_states = self._shape(self.values_v_proj(v), -1, bsz)
|
110 |
+
value_l_states = self._shape(self.values_l_proj(l), -1, bsz)
|
111 |
+
|
112 |
+
proj_shape = (bsz * self.num_heads, -1, self.head_dim) # (bs * 8, -1, embed_dim//8)
|
113 |
+
query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) # (bs * 8, seq_len_img, embed_dim//8)
|
114 |
+
key_states = key_states.view(*proj_shape) # (bs * 8, seq_len_text, embed_dim//8)
|
115 |
+
value_v_states = value_v_states.view(*proj_shape)
|
116 |
+
value_l_states = value_l_states.view(*proj_shape)
|
117 |
+
|
118 |
+
src_len = key_states.size(1)
|
119 |
+
attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) # (bs * 8, seq_len_img, seq_len_text)
|
120 |
+
|
121 |
+
if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
|
122 |
+
raise ValueError(
|
123 |
+
f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}"
|
124 |
+
)
|
125 |
+
|
126 |
+
# attn_weights_l = nn.functional.softmax(attn_weights.transpose(1, 2), dim=-1)
|
127 |
+
|
128 |
+
if self.stable_softmax_2d:
|
129 |
+
attn_weights = attn_weights - attn_weights.max()
|
130 |
+
|
131 |
+
if self.clamp_min_for_underflow:
|
132 |
+
attn_weights = torch.clamp(attn_weights, min=-50000) # Do not increase -50000, data type half has quite limited range
|
133 |
+
if self.clamp_max_for_overflow:
|
134 |
+
attn_weights = torch.clamp(attn_weights, max=50000) # Do not increase 50000, data type half has quite limited range
|
135 |
+
|
136 |
+
attn_weights_T = attn_weights.transpose(1, 2)
|
137 |
+
attn_weights_l = (attn_weights_T - torch.max(attn_weights_T, dim=-1, keepdim=True)[
|
138 |
+
0])
|
139 |
+
if self.clamp_min_for_underflow:
|
140 |
+
attn_weights_l = torch.clamp(attn_weights_l, min=-50000) # Do not increase -50000, data type half has quite limited range
|
141 |
+
if self.clamp_max_for_overflow:
|
142 |
+
attn_weights_l = torch.clamp(attn_weights_l, max=50000) # Do not increase 50000, data type half has quite limited range
|
143 |
+
|
144 |
+
attn_weights_l = attn_weights_l.softmax(dim=-1)
|
145 |
+
# assert attention_mask_l.dtype == torch.int64
|
146 |
+
if attention_mask_l is not None:
|
147 |
+
assert (attention_mask_l.dim() == 2) # (bs, seq_len)
|
148 |
+
attention_mask = attention_mask_l.unsqueeze(1).unsqueeze(1) # (bs, 1, 1, seq_len)
|
149 |
+
attention_mask = attention_mask.expand(bsz, 1, tgt_len, src_len)
|
150 |
+
attention_mask = attention_mask.masked_fill(attention_mask == 0, -9e15)
|
151 |
+
|
152 |
+
if attention_mask.size() != (bsz, 1, tgt_len, src_len):
|
153 |
+
raise ValueError(
|
154 |
+
f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}"
|
155 |
+
)
|
156 |
+
attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
|
157 |
+
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
|
158 |
+
|
159 |
+
attn_weights_v = nn.functional.softmax(attn_weights, dim=-1)
|
160 |
+
|
161 |
+
attn_probs_v = F.dropout(attn_weights_v, p=self.dropout, training=self.training)
|
162 |
+
attn_probs_l = F.dropout(attn_weights_l, p=self.dropout, training=self.training)
|
163 |
+
|
164 |
+
attn_output_v = torch.bmm(attn_probs_v, value_l_states)
|
165 |
+
attn_output_l = torch.bmm(attn_probs_l, value_v_states)
|
166 |
+
|
167 |
+
|
168 |
+
if attn_output_v.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
|
169 |
+
raise ValueError(
|
170 |
+
f"`attn_output_v` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output_v.size()}"
|
171 |
+
)
|
172 |
+
|
173 |
+
if attn_output_l.size() != (bsz * self.num_heads, src_len, self.head_dim):
|
174 |
+
raise ValueError(
|
175 |
+
f"`attn_output_l` should be of size {(bsz, self.num_heads, src_len, self.head_dim)}, but is {attn_output_l.size()}"
|
176 |
+
)
|
177 |
+
|
178 |
+
attn_output_v = attn_output_v.view(bsz, self.num_heads, tgt_len, self.head_dim)
|
179 |
+
attn_output_v = attn_output_v.transpose(1, 2)
|
180 |
+
attn_output_v = attn_output_v.reshape(bsz, tgt_len, self.embed_dim)
|
181 |
+
|
182 |
+
attn_output_l = attn_output_l.view(bsz, self.num_heads, src_len, self.head_dim)
|
183 |
+
attn_output_l = attn_output_l.transpose(1, 2)
|
184 |
+
attn_output_l = attn_output_l.reshape(bsz, src_len, self.embed_dim)
|
185 |
+
|
186 |
+
attn_output_v = self.out_v_proj(attn_output_v)
|
187 |
+
attn_output_l = self.out_l_proj(attn_output_l)
|
188 |
+
|
189 |
+
return attn_output_v, attn_output_l
|
190 |
+
|
191 |
+
|
192 |
+
class BiAttentionBlockForCheckpoint(nn.Module):
|
193 |
+
def __init__(self, v_dim, l_dim, embed_dim, num_heads, dropout=0.1,
|
194 |
+
drop_path=.0, init_values=1e-4, ):
|
195 |
+
"""
|
196 |
+
Inputs:
|
197 |
+
embed_dim - Dimensionality of input and attention feature vectors
|
198 |
+
num_heads - Number of heads to use in the Multi-Head Attention block
|
199 |
+
dropout - Amount of dropout to apply in the feed-forward network
|
200 |
+
"""
|
201 |
+
super(BiAttentionBlockForCheckpoint, self).__init__()
|
202 |
+
|
203 |
+
# pre layer norm
|
204 |
+
self.layer_norm_v = nn.LayerNorm(v_dim)
|
205 |
+
self.layer_norm_l = nn.LayerNorm(l_dim)
|
206 |
+
self.attn = BiMultiHeadAttention(v_dim=v_dim,
|
207 |
+
l_dim=l_dim,
|
208 |
+
embed_dim=embed_dim,
|
209 |
+
num_heads=num_heads,
|
210 |
+
dropout=dropout,
|
211 |
+
)
|
212 |
+
|
213 |
+
# add layer scale for training stability
|
214 |
+
self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
|
215 |
+
self.gamma_v = nn.Parameter(init_values * torch.ones((v_dim)), requires_grad=True)
|
216 |
+
self.gamma_l = nn.Parameter(init_values * torch.ones((l_dim)), requires_grad=True)
|
217 |
+
|
218 |
+
|
219 |
+
def forward(self, v, l, attention_mask_l=None, task=None):
|
220 |
+
# v: visual features, (bs, sigma(HW), 256)
|
221 |
+
# l: language features, (bs, seq_len, 768)
|
222 |
+
v = self.layer_norm_v(v)
|
223 |
+
l = self.layer_norm_l(l)
|
224 |
+
delta_v, delta_l = self.attn(v, l, attention_mask_l=attention_mask_l)
|
225 |
+
# v, l = v + delta_v, l + delta_l
|
226 |
+
v = v + self.drop_path(self.gamma_v * delta_v)
|
227 |
+
l = l + self.drop_path(self.gamma_l * delta_l)
|
228 |
+
return v, l
|
229 |
+
|
230 |
+
|
GLEE/glee/models/pixel_decoder/maskdino_encoder.py
ADDED
@@ -0,0 +1,463 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# ------------------------------------------------------------------------
|
2 |
+
# DINO
|
3 |
+
# Copyright (c) 2022 IDEA. All Rights Reserved.
|
4 |
+
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
|
5 |
+
# ------------------------------------------------------------------------
|
6 |
+
# Modified by Feng Li and Hao Zhang.
|
7 |
+
import logging
|
8 |
+
import numpy as np
|
9 |
+
from typing import Callable, Dict, List, Optional, Tuple, Union
|
10 |
+
import fvcore.nn.weight_init as weight_init
|
11 |
+
|
12 |
+
import torch
|
13 |
+
from torch import nn
|
14 |
+
from torch.nn import functional as F
|
15 |
+
from torch.nn.init import xavier_uniform_, constant_, uniform_, normal_
|
16 |
+
from torch.cuda.amp import autocast
|
17 |
+
|
18 |
+
from detectron2.config import configurable
|
19 |
+
from detectron2.layers import Conv2d, ShapeSpec, get_norm
|
20 |
+
from detectron2.modeling import SEM_SEG_HEADS_REGISTRY
|
21 |
+
|
22 |
+
from .position_encoding import PositionEmbeddingSine
|
23 |
+
from ...utils.utils import _get_clones, _get_clones_advanced, _get_activation_fn
|
24 |
+
from .ops.modules import MSDeformAttn
|
25 |
+
from .early_fusion import VLFuse
|
26 |
+
|
27 |
+
def build_pixel_decoder(cfg, input_shape):
|
28 |
+
"""
|
29 |
+
Build a pixel decoder from `cfg.MODEL.MaskDINO.PIXEL_DECODER_NAME`.
|
30 |
+
"""
|
31 |
+
name = cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME
|
32 |
+
model = SEM_SEG_HEADS_REGISTRY.get(name)(cfg, input_shape)
|
33 |
+
forward_features = getattr(model, "forward_features", None)
|
34 |
+
if not callable(forward_features):
|
35 |
+
raise ValueError(
|
36 |
+
"Only SEM_SEG_HEADS with forward_features method can be used as pixel decoder. "
|
37 |
+
f"Please implement forward_features for {name} to only return mask features."
|
38 |
+
)
|
39 |
+
return model
|
40 |
+
|
41 |
+
|
42 |
+
# MSDeformAttn Transformer encoder in deformable detr
|
43 |
+
class MSDeformAttnTransformerEncoderOnly(nn.Module):
|
44 |
+
def __init__(self, d_model=256, nhead=8,
|
45 |
+
num_encoder_layers=6, dim_feedforward=1024, dropout=0.1,
|
46 |
+
activation="relu",
|
47 |
+
num_feature_levels=4, enc_n_points=4,):
|
48 |
+
super().__init__()
|
49 |
+
|
50 |
+
self.d_model = d_model
|
51 |
+
self.nhead = nhead
|
52 |
+
|
53 |
+
vl_fusion_layer = VLFuse()
|
54 |
+
|
55 |
+
encoder_layer = MSDeformAttnTransformerEncoderLayer(d_model, dim_feedforward,
|
56 |
+
dropout, activation,
|
57 |
+
num_feature_levels, nhead, enc_n_points)
|
58 |
+
self.encoder = MSDeformAttnTransformerEncoder(vl_fusion_layer, encoder_layer, num_encoder_layers)
|
59 |
+
|
60 |
+
self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model))
|
61 |
+
|
62 |
+
self._reset_parameters()
|
63 |
+
|
64 |
+
def _reset_parameters(self):
|
65 |
+
for p in self.parameters():
|
66 |
+
if p.dim() > 1:
|
67 |
+
nn.init.xavier_uniform_(p)
|
68 |
+
for m in self.modules():
|
69 |
+
if isinstance(m, MSDeformAttn):
|
70 |
+
m._reset_parameters()
|
71 |
+
normal_(self.level_embed)
|
72 |
+
|
73 |
+
def get_valid_ratio(self, mask):
|
74 |
+
_, H, W = mask.shape
|
75 |
+
valid_H = torch.sum(~mask[:, :, 0], 1)
|
76 |
+
valid_W = torch.sum(~mask[:, 0, :], 1)
|
77 |
+
valid_ratio_h = valid_H.float() / H
|
78 |
+
valid_ratio_w = valid_W.float() / W
|
79 |
+
valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1)
|
80 |
+
return valid_ratio
|
81 |
+
|
82 |
+
def forward(self, srcs, masks, pos_embeds, early_fusion=None):
|
83 |
+
|
84 |
+
enable_mask=0
|
85 |
+
if masks is not None:
|
86 |
+
for src in srcs:
|
87 |
+
if src.size(2)%32 or src.size(3)%32:
|
88 |
+
enable_mask = 1
|
89 |
+
if enable_mask==0:
|
90 |
+
masks = [torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) for x in srcs]
|
91 |
+
# prepare input for encoder
|
92 |
+
src_flatten = []
|
93 |
+
mask_flatten = []
|
94 |
+
lvl_pos_embed_flatten = []
|
95 |
+
spatial_shapes = []
|
96 |
+
for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)):
|
97 |
+
bs, c, h, w = src.shape
|
98 |
+
spatial_shape = (h, w)
|
99 |
+
spatial_shapes.append(spatial_shape)
|
100 |
+
src = src.flatten(2).transpose(1, 2)
|
101 |
+
mask = mask.flatten(1)
|
102 |
+
pos_embed = pos_embed.flatten(2).transpose(1, 2)
|
103 |
+
lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1)
|
104 |
+
lvl_pos_embed_flatten.append(lvl_pos_embed)
|
105 |
+
src_flatten.append(src)
|
106 |
+
mask_flatten.append(mask)
|
107 |
+
src_flatten = torch.cat(src_flatten, 1)
|
108 |
+
mask_flatten = torch.cat(mask_flatten, 1)
|
109 |
+
lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)
|
110 |
+
spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=src_flatten.device)
|
111 |
+
level_start_index = torch.cat((spatial_shapes.new_zeros((1, )), spatial_shapes.prod(1).cumsum(0)[:-1]))
|
112 |
+
valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1)
|
113 |
+
# encoder
|
114 |
+
memory, zero_loss = self.encoder(src_flatten, spatial_shapes, level_start_index, valid_ratios, lvl_pos_embed_flatten, mask_flatten, early_fusion)
|
115 |
+
|
116 |
+
return memory, spatial_shapes, level_start_index, zero_loss
|
117 |
+
|
118 |
+
|
119 |
+
class MSDeformAttnTransformerEncoderLayer(nn.Module):
|
120 |
+
def __init__(self,
|
121 |
+
d_model=256, d_ffn=1024,
|
122 |
+
dropout=0.1, activation="relu",
|
123 |
+
n_levels=4, n_heads=8, n_points=4):
|
124 |
+
super().__init__()
|
125 |
+
|
126 |
+
# self attention
|
127 |
+
self.self_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
|
128 |
+
self.dropout1 = nn.Dropout(dropout)
|
129 |
+
self.norm1 = nn.LayerNorm(d_model)
|
130 |
+
|
131 |
+
# ffn
|
132 |
+
self.linear1 = nn.Linear(d_model, d_ffn)
|
133 |
+
self.activation = _get_activation_fn(activation)
|
134 |
+
self.dropout2 = nn.Dropout(dropout)
|
135 |
+
self.linear2 = nn.Linear(d_ffn, d_model)
|
136 |
+
self.dropout3 = nn.Dropout(dropout)
|
137 |
+
self.norm2 = nn.LayerNorm(d_model)
|
138 |
+
|
139 |
+
@staticmethod
|
140 |
+
def with_pos_embed(tensor, pos):
|
141 |
+
return tensor if pos is None else tensor + pos
|
142 |
+
|
143 |
+
def forward_ffn(self, src):
|
144 |
+
src2 = self.linear2(self.dropout2(self.activation(self.linear1(src))))
|
145 |
+
src = src + self.dropout3(src2)
|
146 |
+
src = self.norm2(src)
|
147 |
+
return src
|
148 |
+
|
149 |
+
def forward(self, src, pos, reference_points, spatial_shapes, level_start_index, padding_mask=None):
|
150 |
+
# self attention
|
151 |
+
src2 = self.self_attn(self.with_pos_embed(src, pos), reference_points, src, spatial_shapes, level_start_index, padding_mask)
|
152 |
+
src = src + self.dropout1(src2)
|
153 |
+
src = self.norm1(src)
|
154 |
+
|
155 |
+
# ffn
|
156 |
+
src = self.forward_ffn(src)
|
157 |
+
|
158 |
+
return src
|
159 |
+
|
160 |
+
|
161 |
+
class MSDeformAttnTransformerEncoder(nn.Module):
|
162 |
+
def __init__(self, vl_fusion_layer, encoder_layer, num_layers):
|
163 |
+
super().__init__()
|
164 |
+
self.layers = _get_clones(encoder_layer, num_layers)
|
165 |
+
self.num_layers = num_layers
|
166 |
+
|
167 |
+
self.vl_layers = _get_clones_advanced(vl_fusion_layer, num_layers, 1)
|
168 |
+
|
169 |
+
|
170 |
+
@staticmethod
|
171 |
+
def get_reference_points(spatial_shapes, valid_ratios, device):
|
172 |
+
reference_points_list = []
|
173 |
+
for lvl, (H_, W_) in enumerate(spatial_shapes):
|
174 |
+
|
175 |
+
ref_y, ref_x = torch.meshgrid(torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device),
|
176 |
+
torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device))
|
177 |
+
ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H_)
|
178 |
+
ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W_)
|
179 |
+
ref = torch.stack((ref_x, ref_y), -1)
|
180 |
+
reference_points_list.append(ref)
|
181 |
+
reference_points = torch.cat(reference_points_list, 1)
|
182 |
+
reference_points = reference_points[:, :, None] * valid_ratios[:, None]
|
183 |
+
return reference_points
|
184 |
+
|
185 |
+
def forward(self, src, spatial_shapes, level_start_index, valid_ratios, pos=None, padding_mask=None, early_fusion=None):
|
186 |
+
|
187 |
+
if early_fusion:
|
188 |
+
output = {"visual": src, "lang": early_fusion}
|
189 |
+
else:
|
190 |
+
output = src
|
191 |
+
|
192 |
+
reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=src.device)
|
193 |
+
for _, (layer,vl_layer) in enumerate(zip(self.layers, self.vl_layers)):
|
194 |
+
if early_fusion:
|
195 |
+
output = vl_layer(output)
|
196 |
+
output["visual"] = layer(output["visual"], pos, reference_points, spatial_shapes, level_start_index, padding_mask)
|
197 |
+
else:
|
198 |
+
output = layer(output, pos, reference_points, spatial_shapes, level_start_index, padding_mask)
|
199 |
+
|
200 |
+
|
201 |
+
if early_fusion:
|
202 |
+
return output["visual"] , (output['lang']['hidden']*0).sum()
|
203 |
+
else:
|
204 |
+
return output, None
|
205 |
+
|
206 |
+
|
207 |
+
@SEM_SEG_HEADS_REGISTRY.register()
|
208 |
+
class MaskDINOEncoder(nn.Module):
|
209 |
+
"""
|
210 |
+
This is the multi-scale encoder in detection models, also named as pixel decoder in segmentation models.
|
211 |
+
"""
|
212 |
+
@configurable
|
213 |
+
def __init__(
|
214 |
+
self,
|
215 |
+
input_shape: Dict[str, ShapeSpec],
|
216 |
+
*,
|
217 |
+
transformer_dropout: float,
|
218 |
+
transformer_nheads: int,
|
219 |
+
transformer_dim_feedforward: int,
|
220 |
+
transformer_enc_layers: int,
|
221 |
+
conv_dim: int,
|
222 |
+
mask_dim: int,
|
223 |
+
norm: Optional[Union[str, Callable]] = None,
|
224 |
+
# deformable transformer encoder args
|
225 |
+
transformer_in_features: List[str],
|
226 |
+
common_stride: int,
|
227 |
+
num_feature_levels: int,
|
228 |
+
total_num_feature_levels: int,
|
229 |
+
feature_order: str,
|
230 |
+
ViTBackbone: bool,
|
231 |
+
):
|
232 |
+
"""
|
233 |
+
NOTE: this interface is experimental.
|
234 |
+
Args:
|
235 |
+
input_shape: shapes (channels and stride) of the input features
|
236 |
+
transformer_dropout: dropout probability in transformer
|
237 |
+
transformer_nheads: number of heads in transformer
|
238 |
+
transformer_dim_feedforward: dimension of feedforward network
|
239 |
+
transformer_enc_layers: number of transformer encoder layers
|
240 |
+
conv_dims: number of output channels for the intermediate conv layers.
|
241 |
+
mask_dim: number of output channels for the final conv layer.
|
242 |
+
norm (str or callable): normalization for all conv layers
|
243 |
+
num_feature_levels: feature scales used
|
244 |
+
total_num_feature_levels: total feautre scales used (include the downsampled features)
|
245 |
+
feature_order: 'low2high' or 'high2low', i.e., 'low2high' means low-resolution features are put in the first.
|
246 |
+
"""
|
247 |
+
super().__init__()
|
248 |
+
transformer_input_shape = {
|
249 |
+
k: v for k, v in input_shape.items() if k in transformer_in_features
|
250 |
+
}
|
251 |
+
# this is the input shape of pixel decoder
|
252 |
+
input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride)
|
253 |
+
self.in_features = [k for k, v in input_shape] # starting from "res2" to "res5"
|
254 |
+
self.feature_strides = [v.stride for k, v in input_shape]
|
255 |
+
self.feature_channels = [v.channels for k, v in input_shape]
|
256 |
+
self.feature_order = feature_order
|
257 |
+
|
258 |
+
if feature_order == "low2high":
|
259 |
+
transformer_input_shape = sorted(transformer_input_shape.items(), key=lambda x: -x[1].stride)
|
260 |
+
else:
|
261 |
+
transformer_input_shape = sorted(transformer_input_shape.items(), key=lambda x: x[1].stride)
|
262 |
+
self.transformer_in_features = [k for k, v in transformer_input_shape] # starting from "res2" to "res5"
|
263 |
+
transformer_in_channels = [v.channels for k, v in transformer_input_shape]
|
264 |
+
self.transformer_feature_strides = [v.stride for k, v in transformer_input_shape] # to decide extra FPN layers
|
265 |
+
self.maskdino_num_feature_levels = num_feature_levels # always use 3 scales
|
266 |
+
self.total_num_feature_levels = total_num_feature_levels
|
267 |
+
self.common_stride = common_stride
|
268 |
+
|
269 |
+
self.transformer_num_feature_levels = len(self.transformer_in_features)
|
270 |
+
self.low_resolution_index = transformer_in_channels.index(max(transformer_in_channels))
|
271 |
+
self.high_resolution_index = 0 if self.feature_order == 'low2high' else -1
|
272 |
+
|
273 |
+
self.isViTBackbone = ViTBackbone
|
274 |
+
if not ViTBackbone:
|
275 |
+
if self.transformer_num_feature_levels > 1:
|
276 |
+
input_proj_list = []
|
277 |
+
for in_channels in transformer_in_channels[::-1]:
|
278 |
+
input_proj_list.append(nn.Sequential(
|
279 |
+
nn.Conv2d(in_channels, conv_dim, kernel_size=1),
|
280 |
+
nn.GroupNorm(32, conv_dim),
|
281 |
+
))
|
282 |
+
# input projectino for downsample
|
283 |
+
in_channels = max(transformer_in_channels)
|
284 |
+
for _ in range(self.total_num_feature_levels - self.transformer_num_feature_levels): # exclude the res2
|
285 |
+
input_proj_list.append(nn.Sequential(
|
286 |
+
nn.Conv2d(in_channels, conv_dim, kernel_size=3, stride=2, padding=1),
|
287 |
+
nn.GroupNorm(32, conv_dim),
|
288 |
+
))
|
289 |
+
in_channels = conv_dim
|
290 |
+
self.input_proj = nn.ModuleList(input_proj_list)
|
291 |
+
else:
|
292 |
+
self.input_proj = nn.ModuleList([
|
293 |
+
nn.Sequential(
|
294 |
+
nn.Conv2d(transformer_in_channels[-1], conv_dim, kernel_size=1),
|
295 |
+
nn.GroupNorm(32, conv_dim),
|
296 |
+
)])
|
297 |
+
|
298 |
+
for proj in self.input_proj:
|
299 |
+
nn.init.xavier_uniform_(proj[0].weight, gain=1)
|
300 |
+
nn.init.constant_(proj[0].bias, 0)
|
301 |
+
|
302 |
+
self.transformer = MSDeformAttnTransformerEncoderOnly(
|
303 |
+
d_model=conv_dim,
|
304 |
+
dropout=transformer_dropout,
|
305 |
+
nhead=transformer_nheads,
|
306 |
+
dim_feedforward=transformer_dim_feedforward,
|
307 |
+
num_encoder_layers=transformer_enc_layers,
|
308 |
+
num_feature_levels=self.total_num_feature_levels,
|
309 |
+
)
|
310 |
+
N_steps = conv_dim // 2
|
311 |
+
self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True)
|
312 |
+
|
313 |
+
self.mask_dim = mask_dim
|
314 |
+
# use 1x1 conv instead
|
315 |
+
self.mask_features = Conv2d(
|
316 |
+
conv_dim,
|
317 |
+
mask_dim,
|
318 |
+
kernel_size=1,
|
319 |
+
stride=1,
|
320 |
+
padding=0,
|
321 |
+
)
|
322 |
+
weight_init.c2_xavier_fill(self.mask_features)
|
323 |
+
# extra fpn levels
|
324 |
+
stride = min(self.transformer_feature_strides)
|
325 |
+
self.num_fpn_levels = max(int(np.log2(stride) - np.log2(self.common_stride)), 1)
|
326 |
+
|
327 |
+
lateral_convs = []
|
328 |
+
output_convs = []
|
329 |
+
|
330 |
+
use_bias = norm == ""
|
331 |
+
for idx, in_channels in enumerate(self.feature_channels[:self.num_fpn_levels]):
|
332 |
+
lateral_norm = get_norm(norm, conv_dim)
|
333 |
+
output_norm = get_norm(norm, conv_dim)
|
334 |
+
|
335 |
+
lateral_conv = Conv2d(
|
336 |
+
in_channels, conv_dim, kernel_size=1, bias=use_bias, norm=lateral_norm
|
337 |
+
)
|
338 |
+
output_conv = Conv2d(
|
339 |
+
conv_dim,
|
340 |
+
conv_dim,
|
341 |
+
kernel_size=3,
|
342 |
+
stride=1,
|
343 |
+
padding=1,
|
344 |
+
bias=use_bias,
|
345 |
+
norm=output_norm,
|
346 |
+
activation=F.relu,
|
347 |
+
)
|
348 |
+
weight_init.c2_xavier_fill(lateral_conv)
|
349 |
+
weight_init.c2_xavier_fill(output_conv)
|
350 |
+
self.add_module("adapter_{}".format(idx + 1), lateral_conv)
|
351 |
+
self.add_module("layer_{}".format(idx + 1), output_conv)
|
352 |
+
|
353 |
+
lateral_convs.append(lateral_conv)
|
354 |
+
output_convs.append(output_conv)
|
355 |
+
# Place convs into top-down order (from low to high resolution)
|
356 |
+
# to make the top-down computation in forward clearer.
|
357 |
+
self.lateral_convs = lateral_convs[::-1]
|
358 |
+
self.output_convs = output_convs[::-1]
|
359 |
+
|
360 |
+
@classmethod
|
361 |
+
def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
|
362 |
+
ret = {}
|
363 |
+
ret["input_shape"] = {
|
364 |
+
k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES
|
365 |
+
}
|
366 |
+
ret["conv_dim"] = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
|
367 |
+
ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM
|
368 |
+
ret["norm"] = cfg.MODEL.SEM_SEG_HEAD.NORM
|
369 |
+
ret["transformer_dropout"] = cfg.MODEL.MaskDINO.DROPOUT
|
370 |
+
ret["transformer_nheads"] = cfg.MODEL.MaskDINO.NHEADS
|
371 |
+
ret["transformer_dim_feedforward"] = cfg.MODEL.SEM_SEG_HEAD.DIM_FEEDFORWARD # deformable transformer encoder
|
372 |
+
ret[
|
373 |
+
"transformer_enc_layers"
|
374 |
+
] = cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS # a separate config
|
375 |
+
ret["transformer_in_features"] = cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES # ['res3', 'res4', 'res5']
|
376 |
+
ret["common_stride"] = cfg.MODEL.SEM_SEG_HEAD.COMMON_STRIDE
|
377 |
+
ret["total_num_feature_levels"] = cfg.MODEL.SEM_SEG_HEAD.TOTAL_NUM_FEATURE_LEVELS
|
378 |
+
ret["num_feature_levels"] = cfg.MODEL.SEM_SEG_HEAD.NUM_FEATURE_LEVELS
|
379 |
+
ret["feature_order"] = cfg.MODEL.SEM_SEG_HEAD.FEATURE_ORDER
|
380 |
+
ret["ViTBackbone"] = cfg.MODEL.BACKBONE.NAME in ['D2_EVA02', 'D2_EVA01' , 'D2_ViT']
|
381 |
+
return ret
|
382 |
+
|
383 |
+
@autocast(enabled=False)
|
384 |
+
def forward_features(self, features, masks, early_fusion=None):
|
385 |
+
"""
|
386 |
+
:param features: multi-scale features from the backbone
|
387 |
+
:param masks: image mask
|
388 |
+
:return: enhanced multi-scale features and mask feature (1/4 resolution) for the decoder to produce binary mask
|
389 |
+
"""
|
390 |
+
# backbone features
|
391 |
+
srcs = []
|
392 |
+
pos = []
|
393 |
+
# additional downsampled features
|
394 |
+
srcsl = []
|
395 |
+
posl = []
|
396 |
+
|
397 |
+
if self.isViTBackbone:
|
398 |
+
for idx, f in enumerate(self.transformer_in_features[::-1]):
|
399 |
+
x = features[f].float() # deformable detr does not support half precision
|
400 |
+
srcs.append(x)
|
401 |
+
pos.append(self.pe_layer(x))
|
402 |
+
if self.feature_order != 'low2high':
|
403 |
+
srcs = srcs[::-1]
|
404 |
+
pos = pos[::-1]
|
405 |
+
else:
|
406 |
+
if self.total_num_feature_levels > self.transformer_num_feature_levels:
|
407 |
+
smallest_feat = features[self.transformer_in_features[self.low_resolution_index]].float()
|
408 |
+
_len_srcs = self.transformer_num_feature_levels
|
409 |
+
for l in range(_len_srcs, self.total_num_feature_levels):
|
410 |
+
if l == _len_srcs:
|
411 |
+
src = self.input_proj[l](smallest_feat)
|
412 |
+
else:
|
413 |
+
src = self.input_proj[l](srcsl[-1])
|
414 |
+
srcsl.append(src)
|
415 |
+
posl.append(self.pe_layer(src))
|
416 |
+
srcsl = srcsl[::-1]
|
417 |
+
# Reverse feature maps
|
418 |
+
|
419 |
+
|
420 |
+
for idx, f in enumerate(self.transformer_in_features[::-1]):
|
421 |
+
x = features[f].float() # deformable detr does not support half precision
|
422 |
+
srcs.append(self.input_proj[idx](x))
|
423 |
+
pos.append(self.pe_layer(x))
|
424 |
+
srcs.extend(srcsl) if self.feature_order == 'low2high' else srcsl.extend(srcs)
|
425 |
+
pos.extend(posl) if self.feature_order == 'low2high' else posl.extend(pos)
|
426 |
+
if self.feature_order != 'low2high':
|
427 |
+
srcs = srcsl
|
428 |
+
pos = posl
|
429 |
+
|
430 |
+
y, spatial_shapes, level_start_index, zero_loss = self.transformer(srcs, masks, pos, early_fusion)
|
431 |
+
bs = y.shape[0]
|
432 |
+
|
433 |
+
split_size_or_sections = [None] * self.total_num_feature_levels
|
434 |
+
for i in range(self.total_num_feature_levels):
|
435 |
+
if i < self.total_num_feature_levels - 1:
|
436 |
+
split_size_or_sections[i] = level_start_index[i + 1] - level_start_index[i]
|
437 |
+
else:
|
438 |
+
split_size_or_sections[i] = y.shape[1] - level_start_index[i]
|
439 |
+
y = torch.split(y, split_size_or_sections, dim=1)
|
440 |
+
|
441 |
+
out = []
|
442 |
+
multi_scale_features = []
|
443 |
+
num_cur_levels = 0
|
444 |
+
for i, z in enumerate(y):
|
445 |
+
out.append(z.transpose(1, 2).view(bs, -1, spatial_shapes[i][0], spatial_shapes[i][1]))
|
446 |
+
|
447 |
+
# append `out` with extra FPN levels
|
448 |
+
# Reverse feature maps into top-down order (from low to high resolution)
|
449 |
+
for idx, f in enumerate(self.in_features[:self.num_fpn_levels][::-1]):
|
450 |
+
x = features[f].float()
|
451 |
+
lateral_conv = self.lateral_convs[idx]
|
452 |
+
output_conv = self.output_convs[idx]
|
453 |
+
cur_fpn = lateral_conv(x)
|
454 |
+
# Following FPN implementation, we use nearest upsampling here
|
455 |
+
y = cur_fpn + F.interpolate(out[self.high_resolution_index], size=cur_fpn.shape[-2:], mode="bilinear", align_corners=False)
|
456 |
+
y = output_conv(y)
|
457 |
+
out.append(y)
|
458 |
+
for o in out:
|
459 |
+
if num_cur_levels < self.total_num_feature_levels:
|
460 |
+
multi_scale_features.append(o)
|
461 |
+
num_cur_levels += 1
|
462 |
+
return self.mask_features(out[-1]), out[0], multi_scale_features, zero_loss
|
463 |
+
|
GLEE/glee/models/pixel_decoder/ops/functions/__init__.py
ADDED
@@ -0,0 +1,13 @@
+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+
+from .ms_deform_attn_func import MSDeformAttnFunction
+
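The __init__.py above only re-exports the autograd wrapper, so other modules in the commit can pull it in with a single import. A hypothetical usage line, assuming the repository root is on PYTHONPATH (the exact package prefix depends on how the Space mounts the code):

# Illustrative import of the re-exported wrapper; the package prefix is an assumption.
from GLEE.glee.models.pixel_decoder.ops.functions import MSDeformAttnFunction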
GLEE/glee/models/pixel_decoder/ops/functions/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (262 Bytes). View file
GLEE/glee/models/pixel_decoder/ops/functions/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (232 Bytes). View file
GLEE/glee/models/pixel_decoder/ops/functions/__pycache__/ms_deform_attn_func.cpython-38.pyc
ADDED
Binary file (2.7 kB). View file
GLEE/glee/models/pixel_decoder/ops/functions/__pycache__/ms_deform_attn_func.cpython-39.pyc
ADDED
Binary file (2.64 kB). View file
GLEE/glee/models/pixel_decoder/ops/functions/ms_deform_attn_func.py
ADDED
@@ -0,0 +1,72 @@
+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+
+import torch
+import torch.nn.functional as F
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+
+try:
+    import MultiScaleDeformableAttention as MSDA
+except ModuleNotFoundError as e:
+    info_string = (
+        "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n"
+        "\t`cd maskdino/modeling/pixel_decoder/ops`\n"
+        "\t`sh make.sh`\n"
+    )
+    # raise ModuleNotFoundError(info_string)
+
+
+class MSDeformAttnFunction(Function):
+    @staticmethod
+    def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
+        ctx.im2col_step = im2col_step
+        output = MSDA.ms_deform_attn_forward(
+            value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step)
+        ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights)
+        return output
+
+    @staticmethod
+    @once_differentiable
+    def backward(ctx, grad_output):
+        value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors
+        grad_value, grad_sampling_loc, grad_attn_weight = \
+            MSDA.ms_deform_attn_backward(
+                value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step)
+
+        return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
+
+
+def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
+    # for debug and test only,
+    # need to use cuda version instead
+    N_, S_, M_, D_ = value.shape
+    _, Lq_, M_, L_, P_, _ = sampling_locations.shape
+    value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
+    sampling_grids = 2 * sampling_locations - 1
+    sampling_value_list = []
+    for lid_, (H_, W_) in enumerate(value_spatial_shapes):
+        # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
+        value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_)
+        # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
+        sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1)
+        # N_*M_, D_, Lq_, P_
+        sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_,
+                                          mode='bilinear', padding_mode='zeros', align_corners=False)
+        sampling_value_list.append(sampling_value_l_)
+    # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_)
+    attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_)
+    output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_)
+    return output.transpose(1, 2).contiguous()
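For reference (not part of the committed file), the tensor contract of the pure-PyTorch fallback above can be sanity-checked with random inputs; every size below is made up for illustration, and the import path prefix is an assumption about how the package is mounted.

# Made-up shapes exercising ms_deform_attn_core_pytorch (the CPU/debug fallback above).
import torch
from GLEE.glee.models.pixel_decoder.ops.functions.ms_deform_attn_func import ms_deform_attn_core_pytorch  # path prefix is an assumption

N, M, D = 2, 8, 32                        # batch, attention heads, channels per head
spatial_shapes = [(16, 16), (8, 8)]       # (H, W) of each value level
L, P, Lq = len(spatial_shapes), 4, 100    # levels, sampling points per level, queries
S = sum(h * w for h, w in spatial_shapes)

value = torch.rand(N, S, M, D)
sampling_locations = torch.rand(N, Lq, M, L, P, 2)   # normalized to [0, 1]
attention_weights = torch.rand(N, Lq, M, L, P)
attention_weights = attention_weights / attention_weights.sum(dim=(-2, -1), keepdim=True)

out = ms_deform_attn_core_pytorch(value, spatial_shapes, sampling_locations, attention_weights)
print(out.shape)  # torch.Size([2, 100, 256]), i.e. (N, Lq, M*D)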