# Vision-language detection pretraining config (GLIP-style, Swin-Tiny backbone).
# NOTE(review): the lines previously here were web-viewer scrape residue
# (page header, file size, commit hash, gutter line numbers), not YAML.
MODEL:
  META_ARCHITECTURE: "GeneralizedVLRCNN"
  WEIGHT: "swin_tiny_patch4_window7_224.pth"
  RPN_ONLY: True
  RPN_ARCHITECTURE: "VLDYHEAD"

  BACKBONE:
    CONV_BODY: "SWINT-FPN-RETINANET"
    OUT_CHANNELS: 256
    # -1: do not freeze any conv-body stages
    FREEZE_CONV_BODY_AT: -1

  LANGUAGE_BACKBONE:
    FREEZE: False
    MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip"
    MASK_SPECIAL: False

  RPN:
    USE_FPN: True
    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
    ASPECT_RATIOS: (1.0,)
    SCALES_PER_OCTAVE: 1

  DYHEAD:
    CHANNELS: 256
    NUM_CONVS: 6
    USE_GN: True
    USE_DYRELU: True
    USE_DFCONV: True
    USE_DYFUSE: True
    TOPK: 9 # topk for selecting candidate positive samples from each level
    SCORE_AGG: "MEAN"
    LOG_SCALE: 0.0

    FUSE_CONFIG:
      EARLY_FUSE_ON: True
      TYPE: "MHA-B"
      USE_CLASSIFICATION_LOSS: False
      USE_TOKEN_LOSS: False
      USE_CONTRASTIVE_ALIGN_LOSS: False
      CONTRASTIVE_HIDDEN_DIM: 64
      USE_DOT_PRODUCT_TOKEN_LOSS: True
      USE_FUSED_FEATURES_DOT_PRODUCT: True
      USE_LAYER_SCALE: True
      # Clamp attention/dot-product logits to avoid fp16 under/overflow (AMP is on).
      CLAMP_MIN_FOR_UNDERFLOW: True
      CLAMP_MAX_FOR_OVERFLOW: True
      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
      CLAMP_DOT_PRODUCT: True

    # Gradient checkpointing in the head to trade compute for memory.
    USE_CHECKPOINT: True
# Evaluation settings (distinct from DATASETS.TEST, which names the eval split).
TEST:
  DURING_TRAINING: False
  IMS_PER_BATCH: 64
# use for grounding model
DATASETS:
  # Extra dataset registrations (image roots + MDETR-style annotation files).
  REGISTER:
    mixed_train_no_coco_noun:
      coco_img_dir: "coco/train2014"
      vg_img_dir: "gqa/images"
      ann_file: "mdetr_annotations/final_mixed_train_no_coco_with_nouns.json"
    mixed_train_no_coco_gpt:
      coco_img_dir: "coco/train2014"
      vg_img_dir: "gqa/images"
      ann_file: "mdetr_annotations/final_mixed_train_no_coco_gpt.v1.new.json"
    flickr30k_train_gpt:
      img_folder: "flickr30k/flickr30k_images/train"
      ann_file: "mdetr_annotations/final_flickr_separateGT_train_gpt.v1.json"
      is_train: True
    mixed_train_no_coco_noun_gpt:
      coco_img_dir: "coco/train2014"
      vg_img_dir: "gqa/images"
      ann_file: "mdetr_annotations/final_mixed_train_no_coco_with_nouns_gpt.v1.json"
    mixed_train_no_coco_noun_gpt_0422:
      coco_img_dir: "coco/train2014"
      vg_img_dir: "gqa/images"
      ann_file: "mdetr_annotations/final_mixed_train_no_coco_with_nouns_gpt.0422.json"
    mixed_train_no_coco_noun_gpt_0425:
      coco_img_dir: "coco/train2014"
      vg_img_dir: "gqa/images"
      ann_file: "mdetr_annotations/final_mixed_train_no_coco_with_nouns_gpt.0425.json"
    flickr30k_train_gpt_0425:
      img_folder: "flickr30k/flickr30k_images/train"
      ann_file: "mdetr_annotations/final_flickr_separateGT_train_gpt.0425.json"
      is_train: True

  TRAIN: ("object365_dt_train", "mixed_train_no_coco", "flickr30k_train", )
  TEST: ("coco_2017_val", )
  DISABLE_SHUFFLE: False
  ADD_DET_PROMPT: False
  RANDOM_SAMPLE_NEG: 85
  # NOTE(review): presumably per-source sampling/control probabilities — confirm
  # against the dataloader that consumes CONTROL_PROB.
  CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
  DESCRIPTION_FILE: "tools/files/o365.description.v1.json"
  CAPTION_VOCAB_FILE: "tools/files/mixed_vocab.v1.tmp0.davincci.chunk1of1.json"
  SEPARATION_TOKENS: ". "
INPUT:
  # BGR channel order (Caffe-style normalization constants).
  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
  MIN_SIZE_TRAIN: 800
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333
AUGMENT:
  # Multi-scale training: min side randomly chosen from this set.
  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
DATALOADER:
  # Pad batched images so H and W are multiples of 32 (FPN stride requirement).
  SIZE_DIVISIBILITY: 32
SOLVER:
  OPTIMIZER: ADAMW
  BASE_LR: 0.0001
  # Separate (lower) learning rate for the language backbone.
  LANG_LR: 0.00001
  WEIGHT_DECAY: 0.0001
  # LR decay milestones as fractions of total training length.
  STEPS: (0.67, 0.89)
  MAX_EPOCH: 30
  IMS_PER_BATCH: 64
  WARMUP_ITERS: 2000
  WARMUP_FACTOR: 0.001
  USE_AMP: True
  # Exponential moving average decay for model weights.
  MODEL_EMA: 0.999
  FIND_UNUSED_PARAMETERS: False

  CLIP_GRADIENTS:
    ENABLED: True
    CLIP_TYPE: "full_model"
    CLIP_VALUE: 1.0
    NORM_TYPE: 2.0