# SViTT-Ego_Multiple_Choice_Question / configs/ego_mcq/multiple-choice-question.yaml
# Hosting-page metadata: hvaldez · "first commit" · a17aefb (verified) · 770 bytes
# Model, input and evaluation settings for the multiple-choice-question run.
#
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been flattened. The children of vit_zoo, max_txt_l,
# batch_size and batch_size_test are unambiguous (a flat layout would define
# `image`/`video` three times at top level). Where vision_encoder_args and
# video_input end is inferred from the key names -- confirm against the code
# that reads this file, in particular that image_res and embed_dim are
# top-level keys.

# --- Text side ---
text_encoder: bert-base-uncased
bert_config: configs/config_bert.json

# --- Vision backbone ---
# vit_type names one entry of vit_zoo; vit_name_or_pretrained_path is filled
# in from that entry via ${...} interpolation, so vit_zoo must be a mapping
# that contains the key given in vit_type.
vit_type: beit  # items in ${vit_zoo}
vit_zoo:  # from huggingface
  beit: microsoft/beit-base-patch16-224-pt22k-ft22k
vit_name_or_pretrained_path: ${vit_zoo[${vit_type}]}

# Extra arguments for the vision encoder.
# NOTE(review): presumably token sparsification / sparse-attention settings;
# exact semantics are defined by the encoder implementation -- verify there.
vision_encoder_args:
  token_keep_rate: 0.7
  token_keep_strategy: cls_attn
  token_drop_loc: [3, 6, 9]
  sparse_local_attn: 1
  sparse_random_attn: 5
  attn_block_size: 56

image_res: 224
embed_dim: 256

# --- Video loading ---
# The *_test keys are the inference/test counterparts of num_frames and
# sample_type.
video_input:
  num_frames: 4
  reader: decord  # one of [decord, av]
  sample_type: rand
  num_frames_test: 16  # num_frames during inference/test
  sample_type_test: middle

# --- Per-modality limits ---
# Each mapping carries one value for `image` and one for `video`.
max_txt_l:
  image: 32
  video: 32

batch_size:
  image: 8
  video: 8

batch_size_test:
  image: 8
  video: 8

# --- Remaining scalar hyper-parameters ---
k_test: 128
temp: 0.18
mlm_prob: 0.5