AdrienB134 committed
Commit aa5f27d
1 Parent(s): 1c39bcd

flash_attn

Files changed (1):
  1. app.py +6 -4
app.py CHANGED
@@ -20,7 +20,7 @@ import time
 from PIL import Image
 import torch
 import subprocess
-#subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
+subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
 
 
@@ -39,7 +39,7 @@ def model_inference(
     # print(type(images))
     images = [{"type": "image", "image": Image.open(image[0])} for image in images]
     images.append({"type": "text", "text": text})
-    print(images)
+
     # model = Qwen2VLForConditionalGeneration.from_pretrained(
     #     "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
     # )
@@ -47,12 +47,14 @@
     #We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
     model = Qwen2VLForConditionalGeneration.from_pretrained(
         "Qwen/Qwen2-VL-2B-Instruct",
-        #attn_implementation="flash_attention_2", #doesn't work on zerogpu WTF?!
+        attn_implementation="flash_attention_2", #doesn't work on zerogpu WTF?!
         trust_remote_code=True,
         torch_dtype="auto").cuda().eval()
 
     # default processer
-    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
+    min_pixels = 256*28*28
+    max_pixels = 1280*28*28
+    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)
 
     # The default range for the number of visual tokens per image in the model is 4-16384. You can set min_pixels and max_pixels according to your needs, such as a token count range of 256-1280, to balance speed and memory usage.
     # min_pixels = 256*28*28
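
A note on the install line this commit re-enables: passing env={...} to subprocess.run replaces the child process's entire environment rather than extending it, so variables like PATH are dropped (the shell's built-in default search path is what still lets pip resolve). A more defensive sketch, assuming the only intent is to set FLASH_ATTENTION_SKIP_CUDA_BUILD, which, as the name suggests, makes the flash-attn build skip compiling its CUDA extension:

import os
import subprocess

# Merge the flag into the existing environment instead of replacing it,
# so PATH and any CUDA-related variables survive in the child process.
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
    check=True,  # raise CalledProcessError if the install fails
)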
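
The min_pixels/max_pixels values added to the processor follow the token-budget arithmetic in the file's own trailing comment: with the 28*28 factor used there, a visual-token range of 256-1280 maps to 256*28*28 = 200,704 through 1280*28*28 = 1,003,520 pixels. A minimal standalone sketch of the same setup:

from transformers import AutoProcessor

PIXELS_PER_TOKEN = 28 * 28  # the 28*28 factor from the file's comment

min_pixels = 256 * PIXELS_PER_TOKEN    # 200_704 pixels, ~256 visual tokens
max_pixels = 1280 * PIXELS_PER_TOKEN   # 1_003_520 pixels, ~1280 visual tokens

# Images are rescaled so their pixel count lands inside
# [min_pixels, max_pixels], bounding the visual-token count per image.
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    min_pixels=min_pixels,
    max_pixels=max_pixels,
)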