---
# Model / data configuration.
# NOTE(review): this file had been collapsed onto a single line; the nesting
# below was reconstructed from the key names and the ${vit_zoo[...]} lookup.
# Confirm it against the code that consumes this config.

# Text encoder name and the path to its config file.
text_encoder: bert-base-uncased
bert_config: configs/config_bert.json

# Vision encoder selection: vit_type is used as the lookup key into vit_zoo.
vit_type: beit  # items in ${vit_zoo}
vit_zoo:  # from huggingface
  beit: microsoft/beit-base-patch16-224-pt22k-ft22k
# Resolved by interpolation to the vit_zoo entry named by vit_type.
vit_name_or_pretrained_path: ${vit_zoo[${vit_type}]}

# Arguments for the vision encoder.
# NOTE(review): presumably token-dropping / sparse-attention settings; verify
# the exact meaning of each field in the encoder implementation.
vision_encoder_args:
  token_keep_rate: 0.7
  token_keep_strategy: cls_attn
  token_drop_loc: [3, 6, 9]
  sparse_local_attn: 1
  sparse_random_attn: 5
  attn_block_size: 56

image_res: 224
embed_dim: 256

# Video loading / frame sampling; the *_test keys apply at inference/test.
video_input:
  num_frames: 4
  reader: decord  # one of [decord, av]
  sample_type: rand
  num_frames_test: 16  # num_frames during inference/test
  sample_type_test: middle

# Per-media-type settings (image vs. video).
max_txt_l:
  image: 32
  video: 32

batch_size:
  image: 8
  video: 8
batch_size_test:
  image: 8
  video: 8

k_test: 128
temp: 0.18
mlm_prob: 0.5