wchai committed
Commit c7b2255
1 parent: 38cb1a6

Upload 3_8b_image/xtuner_config.py with huggingface_hub

Files changed (1): 3_8b_image/xtuner_config.py (+793 −0)
3_8b_image/xtuner_config.py ADDED
@@ -0,0 +1,793 @@
SYSTEM = ''
accumulative_counts = 96
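# Note: with the per-GPU batch_size of 1 defined below and 96 accumulation
# steps, each optimizer update sees 96 samples per GPU, i.e. 96 * world_size
# samples globally; the world size comes from the Slurm launch and is not
# recorded in this file.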
allava_cl_data_path = '/data/wenhao/projects/xtuner/data/ALLaVA-4V/ALLaVA-Caption-LAION-4V.jsonl'
allava_cl_dataset = dict(
    dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
    image_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V',
    image_processor=dict(
        crop_size=378,
        pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
        size=378,
        trust_remote_code=True,
        type='transformers.CLIPImageProcessor.from_pretrained'),
    max_length=4096,
    offline_processed_text_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V/pre_token_cl_llama31',
    pad_image_to_square=True,
    template_map_fn=dict(
        template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
        type='xtuner.dataset.map_fns.template_map_fn_factory'),
    type='xtuner.dataset.CambrianDataset')
allava_cl_image_folder = '/data/wenhao/projects/xtuner/data/ALLaVA-4V'
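# Note: every *_dataset below follows the same CambrianDataset recipe seen
# above: a JSONL annotation file mapped by cambrian_map_fn, the CLIP-bigG
# image processor at 378 px with square padding, the Llama-3 chat template,
# and an offline_processed_text_folder holding pre-tokenized text (so
# tokenization is presumably not re-run at training time). Only the paths
# differ from block to block.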
allava_cv_data_path = '/data/wenhao/projects/xtuner/data/ALLaVA-4V/ALLaVA-Caption-VFLAN-4V.jsonl'
allava_cv_dataset = dict(
    dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
    image_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V',
    image_processor=dict(
        crop_size=378,
        pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
        size=378,
        trust_remote_code=True,
        type='transformers.CLIPImageProcessor.from_pretrained'),
    max_length=4096,
    offline_processed_text_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V/pre_token_cv_llama31',
    pad_image_to_square=True,
    template_map_fn=dict(
        template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
        type='xtuner.dataset.map_fns.template_map_fn_factory'),
    type='xtuner.dataset.CambrianDataset')
allava_data_root = '/data/wenhao/projects/xtuner/data/ALLaVA-4V'
allava_il_data_path = '/data/wenhao/projects/xtuner/data/ALLaVA-4V/ALLaVA-Instruct-LAION-4V.jsonl'
allava_il_dataset = dict(
    dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
    image_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V',
    image_processor=dict(
        crop_size=378,
        pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
        size=378,
        trust_remote_code=True,
        type='transformers.CLIPImageProcessor.from_pretrained'),
    max_length=4096,
    offline_processed_text_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V/pre_token_il_llama31',
    pad_image_to_square=True,
    template_map_fn=dict(
        template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
        type='xtuner.dataset.map_fns.template_map_fn_factory'),
    type='xtuner.dataset.CambrianDataset')
allava_il_image_folder = '/data/wenhao/projects/xtuner/data/ALLaVA-4V'
allava_image_folder = '/data/wenhao/projects/xtuner/data/ALLaVA-4V'
allava_iv_data_path = '/data/wenhao/projects/xtuner/data/ALLaVA-4V/ALLaVA-Instruct-VFLAN-4V.jsonl'
allava_iv_dataset = dict(
    dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
    image_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V',
    image_processor=dict(
        crop_size=378,
        pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
        size=378,
        trust_remote_code=True,
        type='transformers.CLIPImageProcessor.from_pretrained'),
    max_length=4096,
    offline_processed_text_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V/pre_token_iv_llama31',
    pad_image_to_square=True,
    template_map_fn=dict(
        template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
        type='xtuner.dataset.map_fns.template_map_fn_factory'),
    type='xtuner.dataset.CambrianDataset')
batch_size = 1
betas = (0.9, 0.999)
cambrian_data_path = '/data/wenhao/projects/xtuner/data/Cambrian-10M/jsons/Cambrian7M_withsystemprompt.jsonl'
cambrian_data_root = '/data/wenhao/projects/xtuner/data/Cambrian-10M/'
cambrian_dataset = dict(
    dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
    image_folder='/data/wenhao/projects/xtuner/data/Cambrian-10M/',
    image_processor=dict(
        crop_size=378,
        pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
        size=378,
        trust_remote_code=True,
        type='transformers.CLIPImageProcessor.from_pretrained'),
    max_length=4096,
    offline_processed_text_folder='/data/wenhao/projects/xtuner/data/Cambrian-10M/pre_token_llama3',
    pad_image_to_square=True,
    template_map_fn=dict(
        template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
        type='xtuner.dataset.map_fns.template_map_fn_factory'),
    type='xtuner.dataset.CambrianDataset')
cambrian_image_folder = '/data/wenhao/projects/xtuner/data/Cambrian-10M/'
cambrian_processed_text_folder = '/data/wenhao/projects/xtuner/data/Cambrian-10M/pre_token_llama3'
custom_hooks = [
    dict(
        tokenizer=dict(
            padding_side='right',
            pretrained_model_name_or_path='meta-llama/Meta-Llama-3-8B-Instruct',
            trust_remote_code=True,
            type='transformers.AutoTokenizer.from_pretrained'),
        type='xtuner.engine.DatasetInfoHook'),
    dict(
        evaluation_images='https://llava-vl.github.io/static/images/view.jpg',
        evaluation_inputs=[
            '请描述一下这张照片',  # "Please describe this photo"
            'Please describe this picture',
        ],
        every_n_iters=100,
        image_processor=dict(
            crop_size=378,
            pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
            size=378,
            trust_remote_code=True,
            type='transformers.CLIPImageProcessor.from_pretrained'),
        prompt_template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
        system='',
        tokenizer=dict(
            padding_side='right',
            pretrained_model_name_or_path='meta-llama/Meta-Llama-3-8B-Instruct',
            trust_remote_code=True,
            type='transformers.AutoTokenizer.from_pretrained'),
        type='xtuner.engine.EvaluateChatHook'),
]
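# The two hooks above: DatasetInfoHook logs dataset/sample information at
# startup (as its name suggests), and EvaluateChatHook captions a fixed probe
# image every 100 iterations with one Chinese and one English prompt, a quick
# sanity check that generations stay coherent as training progresses.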
dataloader_num_workers = 0
default_hooks = dict(
    checkpoint=dict(
        by_epoch=False,
        interval=200,
        max_keep_ckpts=2,
        type='mmengine.hooks.CheckpointHook'),
    logger=dict(interval=10, type='mmengine.hooks.LoggerHook'),
    param_scheduler=dict(type='mmengine.hooks.ParamSchedulerHook'),
    sampler_seed=dict(type='mmengine.hooks.DistSamplerSeedHook'),
    timer=dict(type='mmengine.hooks.IterTimerHook'))
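# Checkpointing is iteration-based here (by_epoch=False): a checkpoint every
# 200 iterations, keeping only the 2 most recent, which matches
# save_steps = 200 and save_total_limit = 2 further down.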
dense_data_path = '/data/wenhao/projects/xtuner/data/DenseFusion-1M/DenseFusion-1M/DenseFusion-1M-instruct.jsonl'
dense_data_root = '/data/wenhao/projects/xtuner/data/DenseFusion-1M/'
dense_dataset = dict(
    dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
    image_folder='/data/wenhao/projects/xtuner/data/DenseFusion-1M/1M_data',
    image_processor=dict(
        crop_size=378,
        pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
        size=378,
        trust_remote_code=True,
        type='transformers.CLIPImageProcessor.from_pretrained'),
    max_length=4096,
    offline_processed_text_folder='/data/wenhao/projects/xtuner/data/DenseFusion-1M/pre_token_llama3',
    pad_image_to_square=True,
    template_map_fn=dict(
        template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
        type='xtuner.dataset.map_fns.template_map_fn_factory'),
    type='xtuner.dataset.CambrianDataset')
dense_image_folder = '/data/wenhao/projects/xtuner/data/DenseFusion-1M/1M_data'
dense_processed_text_folder = '/data/wenhao/projects/xtuner/data/DenseFusion-1M/pre_token_llama3'
env_cfg = dict(
    cudnn_benchmark=False,
    dist_cfg=dict(backend='nccl'),
    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0))
evaluation_freq = 100
evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg'
evaluation_inputs = [
    '请描述一下这张照片',  # "Please describe this photo"
    'Please describe this picture',
]
evol_data_path = '/data/wenhao/projects/xtuner/data/ALLaVA-4V/allava_text/Evol-Instruct-GPT4-Turbo-143K.jsonl'
evol_data_root = '/data/wenhao/projects/xtuner/data/ALLaVA-4V/allava_text/'
evol_dataset = dict(
    dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
    image_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V/allava_text/images',
    image_processor=dict(
        crop_size=378,
        pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
        size=378,
        trust_remote_code=True,
        type='transformers.CLIPImageProcessor.from_pretrained'),
    max_length=4096,
    offline_processed_text_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V/pre_token_evol_llama31',
    pad_image_to_square=True,
    template_map_fn=dict(
        template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
        type='xtuner.dataset.map_fns.template_map_fn_factory'),
    type='xtuner.dataset.CambrianDataset')
evol_image_folder = '/data/wenhao/projects/xtuner/data/ALLaVA-4V/allava_text/images'
face_data_path = '/data/wenhao/projects/xtuner/data/FaceCaption-15M/FaceCaption-100K.jsonl'
face_data_root = '/data/wenhao/projects/xtuner/data/FaceCaption-15M/'
face_dataset = dict(
    dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
    image_folder='/data/wenhao/projects/xtuner/data/FaceCaption-15M/full_data',
    image_processor=dict(
        crop_size=378,
        pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
        size=378,
        trust_remote_code=True,
        type='transformers.CLIPImageProcessor.from_pretrained'),
    max_length=4096,
    offline_processed_text_folder='/data/wenhao/projects/xtuner/data/FaceCaption-15M/pre_token_llama3',
    pad_image_to_square=True,
    template_map_fn=dict(
        template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
        type='xtuner.dataset.map_fns.template_map_fn_factory'),
    type='xtuner.dataset.CambrianDataset')
face_image_folder = '/data/wenhao/projects/xtuner/data/FaceCaption-15M/full_data'
face_processed_text_folder = '/data/wenhao/projects/xtuner/data/FaceCaption-15M/pre_token_llama3'
image_processor = dict(
    crop_size=378,
    pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
    size=378,
    trust_remote_code=True,
    type='transformers.CLIPImageProcessor.from_pretrained')
launcher = 'slurm'
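# image_processor is the shared vision-preprocessing spec (CLIP-bigG at
# 378 x 378); the per-dataset copies above are inlined duplicates of this
# same dict, which is typical of mmengine's resolved-config dumps.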
llava_mix_data_path = '/data/wenhao/projects/xtuner/data/LLaVA-Instruct-150K/llava_v1_5_mix665k.jsonl'
llava_mix_data_root = '/data/wenhao/projects/xtuner/data/LLaVA-Instruct-150K/'
llava_mix_dataset = dict(
    dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
    image_folder='/data/wenhao/projects/xtuner/data/LLaVA-Instruct-150K/images',
    image_processor=dict(
        crop_size=378,
        pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
        size=378,
        trust_remote_code=True,
        type='transformers.CLIPImageProcessor.from_pretrained'),
    max_length=4096,
    offline_processed_text_folder='/data/wenhao/projects/xtuner/data/LLaVA-Instruct-150K/pre_token_llama31',
    pad_image_to_square=True,
    template_map_fn=dict(
        template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
        type='xtuner.dataset.map_fns.template_map_fn_factory'),
    type='xtuner.dataset.CambrianDataset')
llava_mix_image_folder = '/data/wenhao/projects/xtuner/data/LLaVA-Instruct-150K/images'
llavanext_data_path = '/data/wenhao/projects/xtuner/data/LLaVA-NeXT-Data/llava_next.jsonl'
llavanext_data_root = '/data/wenhao/projects/xtuner/data/LLaVA-NeXT-Data/'
llavanext_dataset = dict(
    dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
    image_folder='/data/wenhao/projects/xtuner/data/LLaVA-NeXT-Data/images',
    image_processor=dict(
        crop_size=378,
        pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
        size=378,
        trust_remote_code=True,
        type='transformers.CLIPImageProcessor.from_pretrained'),
    max_length=4096,
    offline_processed_text_folder='/data/wenhao/projects/xtuner/data/LLaVA-NeXT-Data/pre_token_llama31',
    pad_image_to_square=True,
    template_map_fn=dict(
        template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
        type='xtuner.dataset.map_fns.template_map_fn_factory'),
    type='xtuner.dataset.CambrianDataset')
llavanext_image_folder = '/data/wenhao/projects/xtuner/data/LLaVA-NeXT-Data/images'
llm_name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct'
load_from = 'work_dirs/new_image/iter_449600.pth'
log_level = 'INFO'
lr = 1e-05
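# load_from points at iteration 449600 of an earlier run in work_dirs/new_image;
# since resume = True is set further down, mmengine should restore optimizer
# and scheduler state along with the weights (standard resume semantics, not
# stated explicitly in this file).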
m4_data_path = '/data/wenhao/projects/xtuner/data/M4-Instruct-Data/m4_instruct_image.jsonl'
m4_data_root = '/data/wenhao/projects/xtuner/data/M4-Instruct-Data/'
m4_dataset = dict(
    dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
    image_folder='/data/wenhao/projects/xtuner/data/M4-Instruct-Data/',
    image_processor=dict(
        crop_size=378,
        pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
        size=378,
        trust_remote_code=True,
        type='transformers.CLIPImageProcessor.from_pretrained'),
    max_length=4096,
    offline_processed_text_folder='/data/wenhao/projects/xtuner/data/M4-Instruct-Data/pre_token_llama31',
    pad_image_to_square=True,
    template_map_fn=dict(
        template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
        type='xtuner.dataset.map_fns.template_map_fn_factory'),
    type='xtuner.dataset.CambrianDataset')
m4_image_folder = '/data/wenhao/projects/xtuner/data/M4-Instruct-Data/'
max_epochs = 1
max_length = 4096
max_norm = 1
model = dict(
    freeze_llm=False,
    freeze_visual_encoder=False,
    llm=dict(
        pretrained_model_name_or_path='meta-llama/Meta-Llama-3-8B-Instruct',
        torch_dtype='torch.float16',
        trust_remote_code=True,
        type='transformers.AutoModelForCausalLM.from_pretrained'),
    llm_lora=dict(
        bias='none',
        lora_alpha=256,
        lora_dropout=0.05,
        r=512,
        task_type='CAUSAL_LM',
        type='peft.LoraConfig'),
    pretrained_pth='/data/wenhao/projects/xtuner/work_dirs/final_new_v/projector',
    type='xtuner.model.PikaModel',
    visual_encoder=dict(
        pretrained_model_name_or_path='/data/wenhao/projects/xtuner/work_dirs/final_new_v/visual_encoder',
        type='xtuner.model.pika.PikaSigVidEncoder.from_pretrained',
        visual_token_merge_ratio=0.1))
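# Model recap (inferred from the fields above): Llama-3-8B-Instruct loaded in
# fp16 with a large LoRA adapter (r=512, lora_alpha=256, i.e. an effective
# scaling factor alpha/r of 0.5), plus a SigLIP-style visual encoder (judging
# by the class name) that merges visual tokens down to roughly 10%
# (visual_token_merge_ratio=0.1). Projector and visual-encoder weights are
# warm-started from the final_new_v work dir.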
optim_type = 'torch.optim.AdamW'
optim_wrapper = dict(
    optimizer=dict(
        betas=(0.9, 0.999),
        lr=1e-05,
        type='torch.optim.AdamW',
        weight_decay=0),
    type='DeepSpeedOptimWrapper')
param_scheduler = [
    dict(
        begin=0,
        by_epoch=True,
        convert_to_iter_based=True,
        end=0.03,
        start_factor=1e-05,
        type='mmengine.optim.LinearLR'),
    dict(
        T_max=1,
        begin=0.03,
        by_epoch=True,
        convert_to_iter_based=True,
        eta_min=0.0,
        type='mmengine.optim.CosineAnnealingLR'),
]
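# Learning-rate schedule: linear warmup over the first 3% of the epoch
# (end=0.03, converted to iterations), starting from lr * start_factor, then
# cosine decay from lr = 1e-05 to 0 over the remainder of the single epoch.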
pretrained_pth = '/data/wenhao/projects/xtuner/work_dirs/final_new_v/projector'
prompt_template = 'xtuner.utils.PROMPT_TEMPLATE.llama3_chat'
randomness = dict(deterministic=False, seed=1416244085)
resume = True
runner_type = 'FlexibleRunner'
save_steps = 200
save_total_limit = 2
sharegpt4v_data_path = '/data/wenhao/projects/xtuner/data/ShareGPT4V/sharegpt4v_instruct_gpt4-vision_cap100k.jsonl'
sharegpt4v_data_root = '/data/wenhao/projects/xtuner/data/ShareGPT4V'
sharegpt4v_dataset = dict(
    dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
    image_folder='/data/wenhao/projects/xtuner/data',
    image_processor=dict(
        crop_size=378,
        pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
        size=378,
        trust_remote_code=True,
        type='transformers.CLIPImageProcessor.from_pretrained'),
    max_length=4096,
    offline_processed_text_folder='/data/wenhao/projects/xtuner/data/ShareGPT4V/pre_token_sharegpt4v_llama31',
    pad_image_to_square=True,
    template_map_fn=dict(
        template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
        type='xtuner.dataset.map_fns.template_map_fn_factory'),
    type='xtuner.dataset.CambrianDataset')
sharegpt4v_image_folder = '/data/wenhao/projects/xtuner/data'
size = 378
strategy = dict(
    config=dict(
        bf16=dict(enabled=True),
        fp16=dict(enabled=False, initial_scale_power=16),
        gradient_accumulation_steps='auto',
        gradient_clipping='auto',
        train_micro_batch_size_per_gpu='auto',
        zero_allow_untested_optimizer=True,
        zero_force_ds_cpu_optimizer=False,
        zero_optimization=dict(overlap_comm=True, stage=2)),
    exclude_frozen_parameters=True,
    gradient_accumulation_steps=96,
    gradient_clipping=1,
    sequence_parallel_size=1,
    train_micro_batch_size_per_gpu=1,
    type='xtuner.engine.DeepSpeedStrategy')
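# DeepSpeed ZeRO stage 2 (gradient and optimizer-state partitioning) with
# bf16 enabled; the 'auto' placeholders in the inner config are presumably
# resolved from the explicit values here (micro batch 1, accumulation 96,
# clipping 1) by xtuner's DeepSpeedStrategy.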
tokenizer = dict(
    padding_side='right',
    pretrained_model_name_or_path='meta-llama/Meta-Llama-3-8B-Instruct',
    trust_remote_code=True,
    type='transformers.AutoTokenizer.from_pretrained')
train_cfg = dict(by_epoch=True, max_epochs=1, val_interval=1)
train_dataloader = dict(
    batch_size=1,
    collate_fn=dict(type='xtuner.dataset.collate_fns.default_collate_fn'),
    dataset=dict(
        datasets=[
            dict(
                dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
                image_folder='/data/wenhao/projects/xtuner/data/M4-Instruct-Data/',
                image_processor=dict(
                    crop_size=378,
                    pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                    size=378,
                    trust_remote_code=True,
                    type='transformers.CLIPImageProcessor.from_pretrained'),
                max_length=4096,
                offline_processed_text_folder='/data/wenhao/projects/xtuner/data/M4-Instruct-Data/pre_token_llama31',
                pad_image_to_square=True,
                template_map_fn=dict(
                    template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                    type='xtuner.dataset.map_fns.template_map_fn_factory'),
                type='xtuner.dataset.CambrianDataset'),
            dict(
                dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
                image_folder='/data/wenhao/projects/xtuner/data/LLaVA-NeXT-Data/images',
                image_processor=dict(
                    crop_size=378,
                    pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                    size=378,
                    trust_remote_code=True,
                    type='transformers.CLIPImageProcessor.from_pretrained'),
                max_length=4096,
                offline_processed_text_folder='/data/wenhao/projects/xtuner/data/LLaVA-NeXT-Data/pre_token_llama31',
                pad_image_to_square=True,
                template_map_fn=dict(
                    template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                    type='xtuner.dataset.map_fns.template_map_fn_factory'),
                type='xtuner.dataset.CambrianDataset'),
            dict(
                dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
                image_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V',
                image_processor=dict(
                    crop_size=378,
                    pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                    size=378,
                    trust_remote_code=True,
                    type='transformers.CLIPImageProcessor.from_pretrained'),
                max_length=4096,
                offline_processed_text_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V/pre_token_cl_llama31',
                pad_image_to_square=True,
                template_map_fn=dict(
                    template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                    type='xtuner.dataset.map_fns.template_map_fn_factory'),
                type='xtuner.dataset.CambrianDataset'),
            dict(
                dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
                image_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V',
                image_processor=dict(
                    crop_size=378,
                    pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                    size=378,
                    trust_remote_code=True,
                    type='transformers.CLIPImageProcessor.from_pretrained'),
                max_length=4096,
                offline_processed_text_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V/pre_token_cv_llama31',
                pad_image_to_square=True,
                template_map_fn=dict(
                    template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                    type='xtuner.dataset.map_fns.template_map_fn_factory'),
                type='xtuner.dataset.CambrianDataset'),
            dict(
                dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
                image_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V',
                image_processor=dict(
                    crop_size=378,
                    pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                    size=378,
                    trust_remote_code=True,
                    type='transformers.CLIPImageProcessor.from_pretrained'),
                max_length=4096,
                offline_processed_text_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V/pre_token_il_llama31',
                pad_image_to_square=True,
                template_map_fn=dict(
                    template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                    type='xtuner.dataset.map_fns.template_map_fn_factory'),
                type='xtuner.dataset.CambrianDataset'),
            dict(
                dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
                image_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V',
                image_processor=dict(
                    crop_size=378,
                    pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                    size=378,
                    trust_remote_code=True,
                    type='transformers.CLIPImageProcessor.from_pretrained'),
                max_length=4096,
                offline_processed_text_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V/pre_token_iv_llama31',
                pad_image_to_square=True,
                template_map_fn=dict(
                    template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                    type='xtuner.dataset.map_fns.template_map_fn_factory'),
                type='xtuner.dataset.CambrianDataset'),
            dict(
                dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
                image_folder='/data/wenhao/projects/xtuner/data/LLaVA-Instruct-150K/images',
                image_processor=dict(
                    crop_size=378,
                    pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                    size=378,
                    trust_remote_code=True,
                    type='transformers.CLIPImageProcessor.from_pretrained'),
                max_length=4096,
                offline_processed_text_folder='/data/wenhao/projects/xtuner/data/LLaVA-Instruct-150K/pre_token_llama31',
                pad_image_to_square=True,
                template_map_fn=dict(
                    template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                    type='xtuner.dataset.map_fns.template_map_fn_factory'),
                type='xtuner.dataset.CambrianDataset'),
            dict(
                dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
                image_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V/allava_text/images',
                image_processor=dict(
                    crop_size=378,
                    pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                    size=378,
                    trust_remote_code=True,
                    type='transformers.CLIPImageProcessor.from_pretrained'),
                max_length=4096,
                offline_processed_text_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V/pre_token_evol_llama31',
                pad_image_to_square=True,
                template_map_fn=dict(
                    template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                    type='xtuner.dataset.map_fns.template_map_fn_factory'),
                type='xtuner.dataset.CambrianDataset'),
            dict(
                dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
                image_folder='/data/wenhao/projects/xtuner/data',
                image_processor=dict(
                    crop_size=378,
                    pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                    size=378,
                    trust_remote_code=True,
                    type='transformers.CLIPImageProcessor.from_pretrained'),
                max_length=4096,
                offline_processed_text_folder='/data/wenhao/projects/xtuner/data/ShareGPT4V/pre_token_sharegpt4v_llama31',
                pad_image_to_square=True,
                template_map_fn=dict(
                    template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                    type='xtuner.dataset.map_fns.template_map_fn_factory'),
                type='xtuner.dataset.CambrianDataset'),
            dict(
                dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
                image_folder='/data/wenhao/projects/xtuner/data/FaceCaption-15M/full_data',
                image_processor=dict(
                    crop_size=378,
                    pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                    size=378,
                    trust_remote_code=True,
                    type='transformers.CLIPImageProcessor.from_pretrained'),
                max_length=4096,
                offline_processed_text_folder='/data/wenhao/projects/xtuner/data/FaceCaption-15M/pre_token_llama3',
                pad_image_to_square=True,
                template_map_fn=dict(
                    template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                    type='xtuner.dataset.map_fns.template_map_fn_factory'),
                type='xtuner.dataset.CambrianDataset'),
        ],
        type='xtuner.dataset.ConcatDataset'),
    num_workers=0,
    sampler=dict(shuffle=True, type='mmengine.dataset.DefaultSampler'))
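# train_dataset below repeats, verbatim, the ten-part ConcatDataset that
# train_dataloader already wraps; resolved mmengine dumps materialize shared
# objects in both places. Note also that cambrian_dataset and dense_dataset
# are defined above but appear in neither list.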
train_dataset = dict(
    datasets=[
        dict(
            dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
            image_folder='/data/wenhao/projects/xtuner/data/M4-Instruct-Data/',
            image_processor=dict(
                crop_size=378,
                pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                size=378,
                trust_remote_code=True,
                type='transformers.CLIPImageProcessor.from_pretrained'),
            max_length=4096,
            offline_processed_text_folder='/data/wenhao/projects/xtuner/data/M4-Instruct-Data/pre_token_llama31',
            pad_image_to_square=True,
            template_map_fn=dict(
                template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                type='xtuner.dataset.map_fns.template_map_fn_factory'),
            type='xtuner.dataset.CambrianDataset'),
        dict(
            dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
            image_folder='/data/wenhao/projects/xtuner/data/LLaVA-NeXT-Data/images',
            image_processor=dict(
                crop_size=378,
                pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                size=378,
                trust_remote_code=True,
                type='transformers.CLIPImageProcessor.from_pretrained'),
            max_length=4096,
            offline_processed_text_folder='/data/wenhao/projects/xtuner/data/LLaVA-NeXT-Data/pre_token_llama31',
            pad_image_to_square=True,
            template_map_fn=dict(
                template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                type='xtuner.dataset.map_fns.template_map_fn_factory'),
            type='xtuner.dataset.CambrianDataset'),
        dict(
            dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
            image_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V',
            image_processor=dict(
                crop_size=378,
                pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                size=378,
                trust_remote_code=True,
                type='transformers.CLIPImageProcessor.from_pretrained'),
            max_length=4096,
            offline_processed_text_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V/pre_token_cl_llama31',
            pad_image_to_square=True,
            template_map_fn=dict(
                template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                type='xtuner.dataset.map_fns.template_map_fn_factory'),
            type='xtuner.dataset.CambrianDataset'),
        dict(
            dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
            image_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V',
            image_processor=dict(
                crop_size=378,
                pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                size=378,
                trust_remote_code=True,
                type='transformers.CLIPImageProcessor.from_pretrained'),
            max_length=4096,
            offline_processed_text_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V/pre_token_cv_llama31',
            pad_image_to_square=True,
            template_map_fn=dict(
                template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                type='xtuner.dataset.map_fns.template_map_fn_factory'),
            type='xtuner.dataset.CambrianDataset'),
        dict(
            dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
            image_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V',
            image_processor=dict(
                crop_size=378,
                pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                size=378,
                trust_remote_code=True,
                type='transformers.CLIPImageProcessor.from_pretrained'),
            max_length=4096,
            offline_processed_text_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V/pre_token_il_llama31',
            pad_image_to_square=True,
            template_map_fn=dict(
                template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                type='xtuner.dataset.map_fns.template_map_fn_factory'),
            type='xtuner.dataset.CambrianDataset'),
        dict(
            dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
            image_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V',
            image_processor=dict(
                crop_size=378,
                pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                size=378,
                trust_remote_code=True,
                type='transformers.CLIPImageProcessor.from_pretrained'),
            max_length=4096,
            offline_processed_text_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V/pre_token_iv_llama31',
            pad_image_to_square=True,
            template_map_fn=dict(
                template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                type='xtuner.dataset.map_fns.template_map_fn_factory'),
            type='xtuner.dataset.CambrianDataset'),
        dict(
            dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
            image_folder='/data/wenhao/projects/xtuner/data/LLaVA-Instruct-150K/images',
            image_processor=dict(
                crop_size=378,
                pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                size=378,
                trust_remote_code=True,
                type='transformers.CLIPImageProcessor.from_pretrained'),
            max_length=4096,
            offline_processed_text_folder='/data/wenhao/projects/xtuner/data/LLaVA-Instruct-150K/pre_token_llama31',
            pad_image_to_square=True,
            template_map_fn=dict(
                template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                type='xtuner.dataset.map_fns.template_map_fn_factory'),
            type='xtuner.dataset.CambrianDataset'),
        dict(
            dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
            image_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V/allava_text/images',
            image_processor=dict(
                crop_size=378,
                pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                size=378,
                trust_remote_code=True,
                type='transformers.CLIPImageProcessor.from_pretrained'),
            max_length=4096,
            offline_processed_text_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V/pre_token_evol_llama31',
            pad_image_to_square=True,
            template_map_fn=dict(
                template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                type='xtuner.dataset.map_fns.template_map_fn_factory'),
            type='xtuner.dataset.CambrianDataset'),
        dict(
            dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
            image_folder='/data/wenhao/projects/xtuner/data',
            image_processor=dict(
                crop_size=378,
                pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                size=378,
                trust_remote_code=True,
                type='transformers.CLIPImageProcessor.from_pretrained'),
            max_length=4096,
            offline_processed_text_folder='/data/wenhao/projects/xtuner/data/ShareGPT4V/pre_token_sharegpt4v_llama31',
            pad_image_to_square=True,
            template_map_fn=dict(
                template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                type='xtuner.dataset.map_fns.template_map_fn_factory'),
            type='xtuner.dataset.CambrianDataset'),
        dict(
            dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
            image_folder='/data/wenhao/projects/xtuner/data/FaceCaption-15M/full_data',
            image_processor=dict(
                crop_size=378,
                pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                size=378,
                trust_remote_code=True,
                type='transformers.CLIPImageProcessor.from_pretrained'),
            max_length=4096,
            offline_processed_text_folder='/data/wenhao/projects/xtuner/data/FaceCaption-15M/pre_token_llama3',
            pad_image_to_square=True,
            template_map_fn=dict(
                template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                type='xtuner.dataset.map_fns.template_map_fn_factory'),
            type='xtuner.dataset.CambrianDataset'),
    ],
    type='xtuner.dataset.ConcatDataset')
visual_encoder_name_or_path = '/data/wenhao/projects/xtuner/work_dirs/final_new_v/visual_encoder'
visual_token_merge_ratio = 0.1
visualizer = None
warmup_ratio = 0.03
weight_decay = 0
work_dir = 'work_dirs/new_image'