bluelike committed
Commit 5e2938b
1 Parent(s): a623fe7

Update README.md

Files changed (1)
  1. README.md +241 -65
README.md CHANGED
@@ -89,17 +89,18 @@ pip install qwen-vl-utils
  Here we show a code snippet to show you how to use the chat model with `transformers` and `qwen_vl_utils`:

  ```python
-
  from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
  from qwen_vl_utils import process_vision_info

  # default: Load the model on the available device(s)
- model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", device_map="auto")
+ model = Qwen2VLForConditionalGeneration.from_pretrained(
+     "Qwen/Qwen2-VL-2B-Instruct", torch_dtype="auto", device_map="auto"
+ )

  # We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
  # model = Qwen2VLForConditionalGeneration.from_pretrained(
- # "Qwen/Qwen2-VL-2B-Instruct",
- # torch_dtype=torch.bfloat16,
+ #     "Qwen/Qwen2-VL-2B-Instruct",
+ #     torch_dtype=torch.bfloat16,
  #     attn_implementation="flash_attention_2",
  #     device_map="auto",
  # )
@@ -112,27 +113,47 @@ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
  # max_pixels = 1280*28*28
  # processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)

- # default processer
- processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
-
- messages = [{"role": "user", "content": [{"type": "image", "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"}, {"type": "text", "text": "Describe this image."}]}]
+ messages = [
+     {
+         "role": "user",
+         "content": [
+             {
+                 "type": "image",
+                 "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
+             },
+             {"type": "text", "text": "Describe this image."},
+         ],
+     }
+ ]

  # Preparation for inference
- text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+ text = processor.apply_chat_template(
+     messages, tokenize=False, add_generation_prompt=True
+ )
  image_inputs, video_inputs = process_vision_info(messages)
- inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
+ inputs = processor(
+     text=[text],
+     images=image_inputs,
+     videos=video_inputs,
+     padding=True,
+     return_tensors="pt",
+ )
+ inputs = inputs.to("cuda")

  # Inference: Generation of the output
  generated_ids = model.generate(**inputs, max_new_tokens=128)
- generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
- output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+ generated_ids_trimmed = [
+     out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+ ]
+ output_text = processor.batch_decode(
+     generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+ )
  print(output_text)
  ```
  <details>
  <summary>Without qwen_vl_utils</summary>

  ```python
-
  from PIL import Image
  import requests
  import torch
@@ -141,7 +162,9 @@ from typing import Dict
  from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor

  # Load the model in half-precision on the available device(s)
- model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", device_map="auto")
+ model = Qwen2VLForConditionalGeneration.from_pretrained(
+     "Qwen/Qwen2-VL-2B-Instruct", torch_dtype="auto", device_map="auto"
+ )
  processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

  # Image
@@ -150,16 +173,13 @@ image = Image.open(requests.get(url, stream=True).raw)

  conversation = [
      {
-         "role":"user",
-         "content":[
+         "role": "user",
+         "content": [
              {
-                 "type":"image",
+                 "type": "image",
              },
-             {
-                 "type":"text",
-                 "text":"Describe this image."
-             }
-         ]
+             {"type": "text", "text": "Describe this image."},
+         ],
      }
  ]

@@ -168,13 +188,20 @@ conversation = [
  text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
  # Expected output: '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n<|im_start|>assistant\n'

- inputs = processor(text=[text_prompt], images=[image], padding=True, return_tensors="pt")
- inputs = inputs.to('cuda')
+ inputs = processor(
+     text=[text_prompt], images=[image], padding=True, return_tensors="pt"
+ )
+ inputs = inputs.to("cuda")

  # Inference: Generation of the output
  output_ids = model.generate(**inputs, max_new_tokens=128)
- generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
- output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+ generated_ids = [
+     output_ids[len(input_ids) :]
+     for input_ids, output_ids in zip(inputs.input_ids, output_ids)
+ ]
+ output_text = processor.batch_decode(
+     generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
+ )
  print(output_text)
  ```
  </details>
@@ -184,17 +211,39 @@ print(output_text)

  ```python
  # Messages containing multiple images and a text query
- messages = [{"role": "user", "content": [{"type": "image", "image": "file:///path/to/image1.jpg"}, {"type": "image", "image": "file:///path/to/image2.jpg"}, {"type": "text", "text": "Identify the similarities between these images."}]}]
+ messages = [
+     {
+         "role": "user",
+         "content": [
+             {"type": "image", "image": "file:///path/to/image1.jpg"},
+             {"type": "image", "image": "file:///path/to/image2.jpg"},
+             {"type": "text", "text": "Identify the similarities between these images."},
+         ],
+     }
+ ]

  # Preparation for inference
- text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+ text = processor.apply_chat_template(
+     messages, tokenize=False, add_generation_prompt=True
+ )
  image_inputs, video_inputs = process_vision_info(messages)
- inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
+ inputs = processor(
+     text=[text],
+     images=image_inputs,
+     videos=video_inputs,
+     padding=True,
+     return_tensors="pt",
+ )
+ inputs = inputs.to("cuda")

  # Inference
  generated_ids = model.generate(**inputs, max_new_tokens=128)
- generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
- output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+ generated_ids_trimmed = [
+     out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+ ]
+ output_text = processor.batch_decode(
+     generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+ )
  print(output_text)
  ```
  </details>
@@ -203,21 +252,63 @@ print(output_text)
  <summary>Video inference</summary>

  ```python
-
  # Messages containing an image list as a video and a text query
- messages = [{"role": "user", "content": [{"type": "video", "video": ["file:///path/to/frame1.jpg", "file:///path/to/frame2.jpg", "file:///path/to/frame3.jpg", "file:///path/to/frame4.jpg"], 'fps': 1.0}, {"type": "text", "text": "Describe this video."}]}]
+ messages = [
+     {
+         "role": "user",
+         "content": [
+             {
+                 "type": "video",
+                 "video": [
+                     "file:///path/to/frame1.jpg",
+                     "file:///path/to/frame2.jpg",
+                     "file:///path/to/frame3.jpg",
+                     "file:///path/to/frame4.jpg",
+                 ],
+                 "fps": 1.0,
+             },
+             {"type": "text", "text": "Describe this video."},
+         ],
+     }
+ ]
  # Messages containing a video and a text query
- messages = [{"role": "user", "content": [{"type": "video", "video": "file:///path/to/video1.mp4", 'max_pixels': 360*420, 'fps': 1.0}, {"type": "text", "text": "Describe this video."}]}]
+ messages = [
+     {
+         "role": "user",
+         "content": [
+             {
+                 "type": "video",
+                 "video": "file:///path/to/video1.mp4",
+                 "max_pixels": 360 * 420,
+                 "fps": 1.0,
+             },
+             {"type": "text", "text": "Describe this video."},
+         ],
+     }
+ ]

  # Preparation for inference
- text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+ text = processor.apply_chat_template(
+     messages, tokenize=False, add_generation_prompt=True
+ )
  image_inputs, video_inputs = process_vision_info(messages)
- inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
+ inputs = processor(
+     text=[text],
+     images=image_inputs,
+     videos=video_inputs,
+     padding=True,
+     return_tensors="pt",
+ )
+ inputs = inputs.to("cuda")

  # Inference
  generated_ids = model.generate(**inputs, max_new_tokens=128)
- generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
- output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+ generated_ids_trimmed = [
+     out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+ ]
+ output_text = processor.batch_decode(
+     generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+ )
  print(output_text)
  ```
  </details>
@@ -226,22 +317,47 @@ print(output_text)
  <summary>Batch inference</summary>

  ```python
-
  # Sample messages for batch inference
- messages1 = [{"role": "user", "content": [{"type": "image", "image": "file:///path/to/image1.jpg"}, {"type": "image", "image": "file:///path/to/image2.jpg"}, {"type": "text", "text": "What are the common elements in these pictures?"}]}]
- messages2 = [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Who are you?"}]
+ messages1 = [
+     {
+         "role": "user",
+         "content": [
+             {"type": "image", "image": "file:///path/to/image1.jpg"},
+             {"type": "image", "image": "file:///path/to/image2.jpg"},
+             {"type": "text", "text": "What are the common elements in these pictures?"},
+         ],
+     }
+ ]
+ messages2 = [
+     {"role": "system", "content": "You are a helpful assistant."},
+     {"role": "user", "content": "Who are you?"},
+ ]
  # Combine messages for batch processing
  messages = [messages1, messages2]

  # Preparation for batch inference
- texts = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in messages]
+ texts = [
+     processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
+     for msg in messages
+ ]
  image_inputs, video_inputs = process_vision_info(messages)
- inputs = processor(text=texts, images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
+ inputs = processor(
+     text=texts,
+     images=image_inputs,
+     videos=video_inputs,
+     padding=True,
+     return_tensors="pt",
+ )
+ inputs = inputs.to("cuda")

  # Batch Inference
  generated_ids = model.generate(**inputs, max_new_tokens=128)
- generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
- output_texts = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+ generated_ids_trimmed = [
+     out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+ ]
+ output_texts = processor.batch_decode(
+     generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+ )
  print(output_texts)
  ```
  </details>
@@ -253,22 +369,46 @@ For input images, we support local files, base64, and URLs. For videos, we curre
  ```python
  # You can directly insert a local file path, a URL, or a base64-encoded image into the position where you want in the text.
  ## Local file path
- messages = [{"role": "user", "content": [{"type": "image", "image": "file:///path/to/your/image.jpg"}, {"type": "text", "text": "Describe this image."}]}]
+ messages = [
+     {
+         "role": "user",
+         "content": [
+             {"type": "image", "image": "file:///path/to/your/image.jpg"},
+             {"type": "text", "text": "Describe this image."},
+         ],
+     }
+ ]
  ## Image URL
- messages = [{"role": "user", "content": [{"type": "image", "image": "http://path/to/your/image.jpg"}, {"type": "text", "text": "Describe this image."}]}]
+ messages = [
+     {
+         "role": "user",
+         "content": [
+             {"type": "image", "image": "http://path/to/your/image.jpg"},
+             {"type": "text", "text": "Describe this image."},
+         ],
+     }
+ ]
  ## Base64 encoded image
- messages = [{"role": "user", "content": [{"type": "image", "image": "data:image;base64,/9j/..."}, {"type": "text", "text": "Describe this image."}]}]
+ messages = [
+     {
+         "role": "user",
+         "content": [
+             {"type": "image", "image": "data:image;base64,/9j/..."},
+             {"type": "text", "text": "Describe this image."},
+         ],
+     }
+ ]
  ```
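
The `data:image;base64,/9j/...` value above is a truncated placeholder rather than a working string. As a rough sketch of how such a value could be produced (standard library only; the helper name and file path are illustrative, not part of the README):

```python
import base64

def image_to_data_uri(path: str) -> str:
    # Read a local image file and return a data URI usable in the "image" field above.
    with open(path, "rb") as f:
        encoded = base64.b64encode(f.read()).decode("utf-8")
    return f"data:image;base64,{encoded}"

# Hypothetical usage:
# messages[0]["content"][0]["image"] = image_to_data_uri("/path/to/your/image.jpg")
```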
  #### Image Resolution for performance boost

  The model supports a wide range of resolution inputs. By default, it uses the native resolution for input, but higher resolutions can enhance performance at the cost of more computation. Users can set the minimum and maximum number of pixels to achieve an optimal configuration for their needs, such as a token count range of 256-1280, to balance speed and memory usage.

  ```python
-
- min_pixels = 256*28*28
- max_pixels = 1280*28*28
- processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)
-
+ min_pixels = 256 * 28 * 28
+ max_pixels = 1280 * 28 * 28
+ processor = AutoProcessor.from_pretrained(
+     "Qwen/Qwen2-VL-2B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels
+ )
  ```
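
For orientation, a reading of the numbers above (an inference from this snippet and the preceding paragraph, not an official statement): the pixel budgets appear to be counted in 28x28-pixel units, so the values shown correspond to roughly 256 and 1280 visual tokens, matching the "token count range of 256-1280" mentioned above.

```python
# Arithmetic implied by the snippet above (assumption: one visual token per 28x28 pixel area).
min_pixels = 256 * 28 * 28    # 200,704 pixels
max_pixels = 1280 * 28 * 28   # 1,003,520 pixels
print(min_pixels // (28 * 28), max_pixels // (28 * 28))  # -> 256 1280
```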
 
  Besides, we provide two methods for fine-grained control over the image size input to the model:
@@ -279,21 +419,49 @@ Besides, We provide two methods for fine-grained control over the image size inp

  ```python
  # resized_height and resized_width
- messages = [{"role": "user", "content": [{"type": "image", "image": "file:///path/to/your/image.jpg", "resized_height": 280, "resized_width": 420}, {"type": "text", "text": "Describe this image."}]}]
+ messages = [
+     {
+         "role": "user",
+         "content": [
+             {
+                 "type": "image",
+                 "image": "file:///path/to/your/image.jpg",
+                 "resized_height": 280,
+                 "resized_width": 420,
+             },
+             {"type": "text", "text": "Describe this image."},
+         ],
+     }
+ ]
  # min_pixels and max_pixels
- messages = [{"role": "user", "content": [{"type": "image", "image": "file:///path/to/your/image.jpg", "min_pixels": 50176, "max_pixels": 50176}, {"type": "text", "text": "Describe this image."}]}]
+ messages = [
+     {
+         "role": "user",
+         "content": [
+             {
+                 "type": "image",
+                 "image": "file:///path/to/your/image.jpg",
+                 "min_pixels": 50176,
+                 "max_pixels": 50176,
+             },
+             {"type": "text", "text": "Describe this image."},
+         ],
+     }
+ ]
  ```

- **Limitations:**
+ ## Limitations

- 1. Does not support audio extraction from videos.
- 2. Limited to data available up until June 2023.
- 3. Limited coverage of character/IP recognition.
- 4. Complex instruction following capabilities need enhancement.
- 5. Counting abilities, particularly in complex scenarios, require improvement.
- 6. Handling of complex charts by the model still needs refinement.
- 7. The model performs poorly in spatial relationship reasoning, especially in reasoning about object positions in a 3D space.
+ While Qwen2-VL is applicable to a wide range of visual tasks, it is equally important to understand its limitations. Here are some known restrictions:

+ 1. Lack of Audio Support: The current model does **not comprehend audio information** within videos.
+ 2. Data timeliness: Our image dataset is **updated until June 2023**, and information subsequent to this date may not be covered.
+ 3. Constraints in Individuals and Intellectual Property (IP): The model's capacity to recognize specific individuals or IPs is limited, potentially failing to comprehensively cover all well-known personalities or brands.
+ 4. Limited Capacity for Complex Instruction: When faced with intricate multi-step instructions, the model's understanding and execution capabilities require enhancement.
+ 5. Insufficient Counting Accuracy: Particularly in complex scenes, the accuracy of object counting is not high, necessitating further improvements.
+ 6. Weak Spatial Reasoning Skills: Especially in 3D spaces, the model's inference of object positional relationships is inadequate, making it difficult to precisely judge the relative positions of objects.
+
+ These limitations serve as ongoing directions for model optimization and improvement, and we are committed to continually enhancing the model's performance and scope of application.


  ## Citation
@@ -301,8 +469,16 @@ messages = [{"role": "user", "content": [{"type": "image", "image": "file:///pat
  If you find our work helpful, feel free to give us a cite.

  ```
- @article{qwen2vl,
-   title={Qwen2-VL Technical Report},
+ @article{Qwen2-VL,
+   title={Qwen2-VL},
+   author={Qwen team},
    year={2024}
  }
+
+ @article{Qwen-VL,
+   title={Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond},
+   author={Bai, Jinze and Bai, Shuai and Yang, Shusheng and Wang, Shijie and Tan, Sinan and Wang, Peng and Lin, Junyang and Zhou, Chang and Zhou, Jingren},
+   journal={arXiv preprint arXiv:2308.12966},
+   year={2023}
+ }
  ```
 