AdrienB134 committed
Commit aa5f27d
1 Parent(s): 1c39bcd

flash_attn

Files changed (1):
  1. app.py +6 -4
app.py CHANGED
@@ -20,7 +20,7 @@ import time
 from PIL import Image
 import torch
 import subprocess
-#subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
+subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
 
 
@@ -39,7 +39,7 @@ def model_inference(
     # print(type(images))
     images = [{"type": "image", "image": Image.open(image[0])} for image in images]
     images.append({"type": "text", "text": text})
-    print(images)
+
     # model = Qwen2VLForConditionalGeneration.from_pretrained(
     #     "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
     # )
@@ -47,12 +47,14 @@
     #We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
     model = Qwen2VLForConditionalGeneration.from_pretrained(
         "Qwen/Qwen2-VL-2B-Instruct",
-        #attn_implementation="flash_attention_2", #doesn't work on zerogpu WTF?!
+        attn_implementation="flash_attention_2", #doesn't work on zerogpu WTF?!
         trust_remote_code=True,
         torch_dtype="auto").cuda().eval()
 
     # default processer
-    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
+    min_pixels = 256*28*28
+    max_pixels = 1280*28*28
+    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)
 
     # The default range for the number of visual tokens per image in the model is 4-16384. You can set min_pixels and max_pixels according to your needs, such as a token count range of 256-1280, to balance speed and memory usage.
     # min_pixels = 256*28*28
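
A note on the install line this commit re-enables: passing env={...} to subprocess.run replaces the child process's entire environment rather than extending it, so variables like PATH are dropped (the shell's built-in default search path is what still lets pip resolve). A more defensive sketch, assuming the only intent is to set FLASH_ATTENTION_SKIP_CUDA_BUILD, which, as the name suggests, makes the flash-attn build skip compiling its CUDA extension:

import os
import subprocess

# Merge the flag into the existing environment instead of replacing it,
# so PATH and any CUDA-related variables survive in the child process.
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
    check=True,  # raise CalledProcessError if the install fails
)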
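
The min_pixels/max_pixels values added to the processor follow the token-budget arithmetic in the file's own trailing comment: with the 28*28 factor used there, a visual-token range of 256-1280 maps to 256*28*28 = 200,704 through 1280*28*28 = 1,003,520 pixels. A minimal standalone sketch of the same setup:

from transformers import AutoProcessor

PIXELS_PER_TOKEN = 28 * 28  # the 28*28 factor from the file's comment

min_pixels = 256 * PIXELS_PER_TOKEN    # 200_704 pixels, ~256 visual tokens
max_pixels = 1280 * PIXELS_PER_TOKEN   # 1_003_520 pixels, ~1280 visual tokens

# Images are rescaled so their pixel count lands inside
# [min_pixels, max_pixels], bounding the visual-token count per image.
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    min_pixels=min_pixels,
    max_pixels=max_pixels,
)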