John6666 committed
Commit 64d8a9c · verified · 1 parent: 76a5f56

Upload 2 files

Files changed (2):
  handler.py        +6 -6
  requirements.txt  +1 -2
handler.py CHANGED
@@ -34,9 +34,9 @@ def load_pipeline_compile(repo_id: str, dtype: torch.dtype) -> Any:
     pipe.transformer.fuse_qkv_projections()
     pipe.vae.fuse_qkv_projections()
     pipe.transformer.to(memory_format=torch.channels_last)
-    pipe.transformer = torch.compile(pipe.transformer, mode="reduce-overhead", fullgraph=False, dynamic=False, backend="eager")
+    pipe.transformer = torch.compile(pipe.transformer, mode="reduce-overhead", fullgraph=False, dynamic=False)
     pipe.vae.to(memory_format=torch.channels_last)
-    pipe.vae = torch.compile(pipe.vae, mode="reduce-overhead", fullgraph=False, dynamic=False, backend="eager")
+    pipe.vae = torch.compile(pipe.vae, mode="reduce-overhead", fullgraph=False, dynamic=False)
     pipe.to("cuda")
     return pipe

@@ -45,9 +45,9 @@ def load_pipeline_autoquant(repo_id: str, dtype: torch.dtype) -> Any:
     pipe.transformer.fuse_qkv_projections()
     pipe.vae.fuse_qkv_projections()
     pipe.transformer.to(memory_format=torch.channels_last)
-    pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune", fullgraph=True, backend="eager")
+    pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune", fullgraph=True)
     pipe.vae.to(memory_format=torch.channels_last)
-    pipe.vae = torch.compile(pipe.vae, mode="max-autotune", fullgraph=True, backend="eager")
+    pipe.vae = torch.compile(pipe.vae, mode="max-autotune", fullgraph=True)
     pipe.transformer = autoquant(pipe.transformer, error_on_unseen=False)
     pipe.vae = autoquant(pipe.vae, error_on_unseen=False)
     pipe.to("cuda")

@@ -75,9 +75,9 @@ def load_pipeline_turbo_compile(repo_id: str, dtype: torch.dtype) -> Any:
     quantize_(pipe.transformer, int8_dynamic_activation_int8_weight(), device="cuda")
     quantize_(pipe.vae, int8_dynamic_activation_int8_weight(), device="cuda")
     pipe.transformer.to(memory_format=torch.channels_last)
-    pipe.transformer = torch.compile(pipe.transformer, mode="reduce-overhead", fullgraph=False, dynamic=False, backend="eager")
+    pipe.transformer = torch.compile(pipe.transformer, mode="reduce-overhead", fullgraph=False, dynamic=False)
     pipe.vae.to(memory_format=torch.channels_last)
-    pipe.vae = torch.compile(pipe.vae, mode="reduce-overhead", fullgraph=False, dynamic=False, backend="eager")
+    pipe.vae = torch.compile(pipe.vae, mode="reduce-overhead", fullgraph=False, dynamic=False)
     pipe.to("cuda")
     return pipe
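All six changed lines in handler.py do the same thing: they drop backend="eager" from the torch.compile calls, so compilation falls back to the default TorchInductor backend. The "eager" backend only replays the captured graph with ordinary eager execution (useful for debugging graph capture), so it generates no fused kernels; Inductor is what actually delivers the reduce-overhead and max-autotune speedups. A minimal usage sketch follows; it assumes handler.py is importable, and the repo id, dtype, step count, and prompt are placeholders, not values taken from this commit:

# Minimal sketch, not from the commit: run the compiled loader twice to show
# the one-time warm-up cost. Repo id, dtype, steps, and prompt are placeholders.
import torch
from handler import load_pipeline_compile  # assumes handler.py is on the path

pipe = load_pipeline_compile("your-org/your-model", torch.bfloat16)

# The first call pays the Inductor compilation cost; subsequent calls with the
# same shapes reuse the compiled graphs (dynamic=False assumes fixed shapes).
for _ in range(2):
    image = pipe("a sample prompt", num_inference_steps=28).images[0]
image.save("out.png")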
requirements.txt CHANGED
@@ -1,5 +1,5 @@
 huggingface_hub
-torch==2.4.0
+torch>=2.4.0
 torchvision
 torchao==0.9.0
 diffusers==0.32.2
@@ -11,5 +11,4 @@ scipy
 Pillow
 sentencepiece
 protobuf
-pytorch-lightning
 triton
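With torch relaxed from ==2.4.0 to >=2.4.0, the endpoint can pick up newer torch releases at build time while keeping 2.4.0 as the floor; the unused pytorch-lightning pin is dropped. A small illustrative guard, not part of this commit, that fails fast if the resolved torch ever falls below that floor:

# Illustrative version guard, not part of this commit; `packaging` is an
# assumed extra dependency used only for this check.
import torch
from packaging.version import Version

installed = Version(torch.__version__.split("+")[0])  # strip local tag, e.g. "+cu124"
if installed < Version("2.4.0"):
    raise RuntimeError(f"torch>=2.4.0 required, found {torch.__version__}")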