File size: 770 Bytes
a17aefb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38

# Text and vision backbone selection.
# NOTE(review): the ${...} references below look like OmegaConf-style interpolation,
# resolved by the config loader rather than by YAML itself — confirm against the loader.
text_encoder: bert-base-uncased
# Path to a BERT config JSON; presumably relative to the repo/working directory — confirm.
bert_config: configs/config_bert.json
vit_type: beit  # items in ${vit_zoo}
vit_zoo:  # from huggingface
  beit: microsoft/beit-base-patch16-224-pt22k-ft22k
# Looks up the vit_zoo entry whose key equals vit_type; with the values above this is
# intended to resolve to the vit_zoo.beit checkpoint name.
vit_name_or_pretrained_path: ${vit_zoo[${vit_type}]}

# Extra arguments for the vision encoder (token pruning and sparse attention).
# NOTE(review): the per-key meanings below are inferred from the key names only —
# confirm against the encoder implementation before relying on them.
vision_encoder_args:
  token_keep_rate: 0.7  # presumably the fraction of tokens kept at each drop location
  token_keep_strategy: cls_attn  # presumably ranks tokens by attention from the [CLS] token
  token_drop_loc: [3, 6, 9]  # presumably the layer indices at which tokens are dropped
  sparse_local_attn: 1  # presumably the local sparse-attention window; units not visible here
  sparse_random_attn: 5  # presumably the number of random attention targets; units not visible here
  attn_block_size: 56  # presumably the number of tokens per attention block — TODO confirm

# Input resolution; 224 matches the "224" in the checkpoint name listed under vit_zoo.
image_res: 224
# NOTE(review): presumably the size of the shared image/text embedding space — confirm.
embed_dim: 256
# Video decoding and frame sampling. Keys ending in _test are the inference/test
# counterparts of the keys without the suffix (presumably used for training).
video_input:
  num_frames: 4
  reader: decord  # one of [decord, av]
  sample_type: rand
  num_frames_test: 16  # num_frames during inference/test
  sample_type_test: middle
# Maximum text length per input modality.
# NOTE(review): presumably counted in tokens — confirm against the tokenizer call.
max_txt_l:
  image: 32
  video: 32

# Per-modality batch sizes; batch_size is presumably the training value, and
# batch_size_test is its test-time counterpart.
batch_size:
  image: 8
  video: 8
batch_size_test:
  image: 8
  video: 8
# NOTE(review): the meanings below are inferred from the key names only — confirm
# against the training/evaluation code.
k_test: 128  # presumably the number of top-k candidates re-ranked at test time
temp: 0.18  # presumably the softmax temperature of the contrastive loss
mlm_prob: 0.5  # presumably the token masking probability for masked language modeling