# Auto-dumped dict-style training/evaluation configuration for a Shikra-based
# model fine-tuned on reaction-diagram data (datasets, dataloading, model and
# HuggingFace-Trainer arguments).
#
# NOTE(review): several strings look misspelled ('InterleaveDateset',
# 'truncation_size') but presumably match registry names / kwargs expected by
# the consuming trainer code — confirm against the project before "fixing".

# Dataset presets used at evaluation time.
DEFAULT_TEST_DATASET = {
    'flickr': {
        'filename': './reactiondata/real_test.jsonl',
        'image_folder': './reaction_image',
        'template_file': './config/_base_/dataset/template/reaction.json',
        'type': 'FlickrDataset',
    },
    'reg': {
        'filename': './reactiondata/train_OCR.jsonl',
        'image_folder': './reaction_image_OCR',
        'template_file': './config/_base_/dataset/template/OCR.json',
        'type': 'REGDataset',
    },
}

# Dataset presets used at training time.
DEFAULT_TRAIN_DATASET = {
    'flickr': {
        'filename': './reactiondata/reaction_real_structed.jsonl',
        'image_folder': './reaction_image',
        'template_file': './config/_base_/dataset/template/reaction.json',
        'type': 'FlickrDataset',
    },
    'reg': {
        'filename': './reactiondata/train_OCR.jsonl',
        'image_folder': './reaction_image_OCR',
        'template_file': './config/_base_/dataset/template/OCR.json',
        'type': 'REGDataset',
    },
}

# Dataloading / generation configuration.
data_args = {
    'collator_kwargs': {'max_length': 1024, 'padding': True},
    'compute_metric': None,
    'gen_kwargs': {'max_new_tokens': 1024, 'num_beams': 1},
    'test': None,
    'train': {
        'cfgs': [
            {
                'filename': './reactiondata/train_OCR.jsonl',
                'image_folder': './reaction_image_OCR',
                'template_file': './config/_base_/dataset/template/OCR.json',
                'type': 'REGDataset',
            },
            {
                'filename': './reactiondata/reaction_real_structed.jsonl',
                'image_folder': './reaction_image',
                'template_file': './config/_base_/dataset/template/reaction.json',
                'type': 'FlickrDataset',
            },
        ],
        # NOTE(review): the first (OCR) dataset has sampling probability 0.0,
        # so only the reaction dataset is drawn during interleaving — confirm
        # this is intentional.
        'probabilities': [0.0, 1],
        'seed': None,
        'stopping_strategy': 'first_exhausted',
        'type': 'InterleaveDateset',
    },
    'validation': {
        'cfgs': [
            {
                'filename': './reactiondata/real_test.jsonl',
                'image_folder': './reaction_image',
                'template_file': './config/_base_/dataset/template/reaction.json',
                'type': 'FlickrDataset',
            },
        ],
        'type': 'ConcatDatasetWithShuffle',
    },
}

# Model construction / conversation-template configuration.
model_args = {
    'cache_dir': None,
    'conv_args': {
        'conv_template': 'vicuna_v1.1',
        'tokenize_kwargs': {'truncation_size': 2048},
    },
    'freeze_backbone': False,
    'freeze_mm_mlp_adapter': False,
    'gen_kwargs_set_bos_token_id': True,
    'gen_kwargs_set_eos_token_id': True,
    'gen_kwargs_set_pad_token_id': True,
    'image_token_len': 300,
    'mm_use_im_start_end': True,
    'mm_vision_select_layer': -2,
    'model_max_length': 2048,
    # Resume from the previous experiment's checkpoint.
    'model_name_or_path': './exp/reaction_4.2.1',
    'pretrain_mm_mlp_adapter': None,
    'process_func_args': {
        'conv': {'type': 'ShikraConvProcess'},
        'image': {'type': 'ShikraImageProcessor'},
        'target': {'type': 'BoxFormatProcess'},
        'text': {'type': 'ShikraTextProcess'},
    },
    'sep_image_conv_front': False,
    'target_processor': {'boxes': {'type': 'PlainBoxFormatter'}},
    'tune_mm_mlp_adapter': False,
    'type': 'shikra',
    'version': 'v1',
    'vision_tower': 'SenseTime/deformable-detr',
}

# HuggingFace-Trainer-style run arguments (FSDP sharded training, bf16).
training_args = {
    'bf16': True,
    'dataloader_num_workers': 4,
    'do_eval': False,
    'do_predict': False,
    'do_train': True,
    'evaluation_strategy': 'no',
    'fsdp': 'full_shard auto_wrap',
    'fsdp_transformer_layer_cls_to_wrap': 'LlamaDecoderLayer',
    'gradient_accumulation_steps': 1,
    'gradient_checkpointing': True,
    'learning_rate': 2e-05,
    'logging_steps': 10,
    'lr_scheduler_type': 'cosine',
    'num_train_epochs': 50,
    'output_dir': './exp/reaction_4.2.2-large',
    'overwrite_output_dir': False,
    'per_device_eval_batch_size': 4,
    'per_device_train_batch_size': 4,
    'predict_with_generate': True,
    'remove_unused_columns': False,
    'report_to': 'none',
    'save_steps': 10000,
    'save_strategy': 'steps',
    'save_total_limit': 1,
    'seed': 42,
    'tf32': True,
    'warmup_ratio': 0.03,
    'weight_decay': 0.05,
}