from functools import partial import torch.nn as nn from detectron2.config import LazyCall as L from detectron2.modeling import ViT, SimpleFeaturePyramid from detectron2.modeling.backbone.fpn import LastLevelMaxPool from models.mask_rcnn_fpn_v2 import model, constants model.pixel_mean = constants['imagenet_rgb256_mean'] model.pixel_std = constants['imagenet_rgb256_std'] model.input_format = "RGB" # Base embed_dim, depth, num_heads, dp = 768, 12, 12, 0.1 # Creates Simple Feature Pyramid from ViT backbone model.backbone = L(SimpleFeaturePyramid)( net=L(ViT)( # Single-scale ViT backbone img_size=1024, patch_size=16, embed_dim=embed_dim, depth=depth, num_heads=num_heads, drop_path_rate=dp, window_size=14, mlp_ratio=4, qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), window_block_indexes=[ # 2, 5, 8 11 for global attention 0, 1, 3, 4, 6, 7, 9, 10, ], residual_block_indexes=[], use_rel_pos=True, out_feature="last_feat", ), in_feature="${.net.out_feature}", out_channels=256, scale_factors=(4.0, 2.0, 1.0, 0.5), top_block=L(LastLevelMaxPool)(), norm="LN", square_pad=512, ) model.roi_heads.box_head.conv_norm = model.roi_heads.mask_head.conv_norm = "LN" # 2conv in RPN: model.proposal_generator.head.conv_dims = [-1, -1] # 4conv1fc box head model.roi_heads.box_head.conv_dims = [256, 256, 256, 256] model.roi_heads.box_head.fc_dims = [1024]