John6666 committed
Commit f32b8e8 · verified · 1 Parent(s): 7e7c650

Upload handler.py

Files changed (1):
  1. handler.py +12 -7
handler.py CHANGED

@@ -6,11 +6,12 @@ from typing import Any, Dict
 from diffusers import FluxPipeline, FluxTransformer2DModel, AutoencoderKL, TorchAoConfig
 from PIL import Image
 import torch
-from torchao.quantization import quantize_, autoquant, int8_dynamic_activation_int8_weight
+from torchao.quantization import quantize_, autoquant, int8_dynamic_activation_int8_weight, int8_dynamic_activation_int4_weight
 from huggingface_hub import hf_hub_download
 
 IS_COMPILE = False
 IS_TURBO = False
+IS_4BIT = True
 
 if IS_COMPILE:
     import torch._dynamo
@@ -19,7 +20,7 @@ if IS_COMPILE:
 from huggingface_inference_toolkit.logging import logger
 
 def load_pipeline_stable(repo_id: str, dtype: torch.dtype) -> Any:
-    quantization_config = TorchAoConfig("int8dq")
+    quantization_config = TorchAoConfig("int4dq" if IS_4BIT else "int8dq")
     vae = AutoencoderKL.from_pretrained(repo_id, subfolder="vae", torch_dtype=dtype)
     pipe = FluxPipeline.from_pretrained(repo_id, vae=vae, torch_dtype=dtype, quantization_config=quantization_config)
     pipe.transformer.fuse_qkv_projections()
@@ -28,7 +29,7 @@ def load_pipeline_stable(repo_id: str, dtype: torch.dtype) -> Any:
     return pipe
 
 def load_pipeline_compile(repo_id: str, dtype: torch.dtype) -> Any:
-    quantization_config = TorchAoConfig("int8dq")
+    quantization_config = TorchAoConfig("int4dq" if IS_4BIT else "int8dq")
     vae = AutoencoderKL.from_pretrained(repo_id, subfolder="vae", torch_dtype=dtype)
     pipe = FluxPipeline.from_pretrained(repo_id, vae=vae, torch_dtype=dtype, quantization_config=quantization_config)
     pipe.transformer.fuse_qkv_projections()
@@ -60,8 +61,10 @@ def load_pipeline_turbo(repo_id: str, dtype: torch.dtype) -> Any:
     pipe.fuse_lora()
     pipe.transformer.fuse_qkv_projections()
     pipe.vae.fuse_qkv_projections()
-    quantize_(pipe.transformer, int8_dynamic_activation_int8_weight(), device="cuda")
-    quantize_(pipe.vae, int8_dynamic_activation_int8_weight(), device="cuda")
+    weight = int8_dynamic_activation_int4_weight() if IS_4BIT else int8_dynamic_activation_int8_weight()
+    quantize_(pipe.transformer, weight, device="cuda")
+    quantize_(pipe.vae, weight, device="cuda")
+    quantize_(pipe.text_encoder_2, weight, device="cuda")
     pipe.to("cuda")
     return pipe
 
@@ -72,8 +75,10 @@ def load_pipeline_turbo_compile(repo_id: str, dtype: torch.dtype) -> Any:
     pipe.fuse_lora()
     pipe.transformer.fuse_qkv_projections()
     pipe.vae.fuse_qkv_projections()
-    quantize_(pipe.transformer, int8_dynamic_activation_int8_weight(), device="cuda")
-    quantize_(pipe.vae, int8_dynamic_activation_int8_weight(), device="cuda")
+    weight = int8_dynamic_activation_int4_weight() if IS_4BIT else int8_dynamic_activation_int8_weight()
+    quantize_(pipe.transformer, weight, device="cuda")
+    quantize_(pipe.vae, weight, device="cuda")
+    quantize_(pipe.text_encoder_2, weight, device="cuda")
     pipe.transformer.to(memory_format=torch.channels_last)
     pipe.transformer = torch.compile(pipe.transformer, mode="reduce-overhead", fullgraph=False, dynamic=False)
     pipe.vae.to(memory_format=torch.channels_last)
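For context, a minimal sketch of how the four loaders touched by this commit could be smoke-tested once it is applied. The flag dispatch, the repo id ("black-forest-labs/FLUX.1-dev"), and the prompt are illustrative assumptions, not part of this diff; the handler's actual request path is not shown here.

# Hypothetical smoke test for the loaders above; not part of the commit.
import torch
from handler import (  # assumes this handler.py is importable from the working directory
    load_pipeline_stable,
    load_pipeline_compile,
    load_pipeline_turbo,
    load_pipeline_turbo_compile,
    IS_COMPILE,
    IS_TURBO,
)

def load(repo_id: str, dtype: torch.dtype):
    # Assumed dispatch mirroring the IS_COMPILE / IS_TURBO flags; the diff
    # does not show how the handler itself selects among the four loaders.
    if IS_TURBO:
        return (load_pipeline_turbo_compile if IS_COMPILE else load_pipeline_turbo)(repo_id, dtype)
    return (load_pipeline_compile if IS_COMPILE else load_pipeline_stable)(repo_id, dtype)

pipe = load("black-forest-labs/FLUX.1-dev", torch.bfloat16)  # placeholder repo id
image = pipe("an astronaut riding a horse on the moon", num_inference_steps=28).images[0]
image.save("sample.png")

With IS_4BIT = True this path quantizes the transformer, VAE, and second text encoder with int8 dynamic activations over int4 weights, trading some quality for a smaller memory footprint than the previous int8dq-only configuration.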