John6666 committed
Commit 64d8a9c · verified · 1 parent: 76a5f56

Upload 2 files

Files changed (2):
  handler.py        +6 -6
  requirements.txt  +1 -2
handler.py CHANGED
@@ -34,9 +34,9 @@ def load_pipeline_compile(repo_id: str, dtype: torch.dtype) -> Any:
     pipe.transformer.fuse_qkv_projections()
     pipe.vae.fuse_qkv_projections()
     pipe.transformer.to(memory_format=torch.channels_last)
-    pipe.transformer = torch.compile(pipe.transformer, mode="reduce-overhead", fullgraph=False, dynamic=False, backend="eager")
+    pipe.transformer = torch.compile(pipe.transformer, mode="reduce-overhead", fullgraph=False, dynamic=False)
     pipe.vae.to(memory_format=torch.channels_last)
-    pipe.vae = torch.compile(pipe.vae, mode="reduce-overhead", fullgraph=False, dynamic=False, backend="eager")
+    pipe.vae = torch.compile(pipe.vae, mode="reduce-overhead", fullgraph=False, dynamic=False)
     pipe.to("cuda")
     return pipe

@@ -45,9 +45,9 @@ def load_pipeline_autoquant(repo_id: str, dtype: torch.dtype) -> Any:
     pipe.transformer.fuse_qkv_projections()
     pipe.vae.fuse_qkv_projections()
     pipe.transformer.to(memory_format=torch.channels_last)
-    pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune", fullgraph=True, backend="eager")
+    pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune", fullgraph=True)
     pipe.vae.to(memory_format=torch.channels_last)
-    pipe.vae = torch.compile(pipe.vae, mode="max-autotune", fullgraph=True, backend="eager")
+    pipe.vae = torch.compile(pipe.vae, mode="max-autotune", fullgraph=True)
     pipe.transformer = autoquant(pipe.transformer, error_on_unseen=False)
     pipe.vae = autoquant(pipe.vae, error_on_unseen=False)
     pipe.to("cuda")

@@ -75,9 +75,9 @@ def load_pipeline_turbo_compile(repo_id: str, dtype: torch.dtype) -> Any:
     quantize_(pipe.transformer, int8_dynamic_activation_int8_weight(), device="cuda")
     quantize_(pipe.vae, int8_dynamic_activation_int8_weight(), device="cuda")
     pipe.transformer.to(memory_format=torch.channels_last)
-    pipe.transformer = torch.compile(pipe.transformer, mode="reduce-overhead", fullgraph=False, dynamic=False, backend="eager")
+    pipe.transformer = torch.compile(pipe.transformer, mode="reduce-overhead", fullgraph=False, dynamic=False)
     pipe.vae.to(memory_format=torch.channels_last)
-    pipe.vae = torch.compile(pipe.vae, mode="reduce-overhead", fullgraph=False, dynamic=False, backend="eager")
+    pipe.vae = torch.compile(pipe.vae, mode="reduce-overhead", fullgraph=False, dynamic=False)
     pipe.to("cuda")
     return pipe
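All six changed lines in handler.py do the same thing: they drop backend="eager" from the torch.compile calls, so compilation falls back to the default TorchInductor backend. The "eager" backend only replays the captured graph with ordinary eager execution (useful for debugging graph capture), so it generates no fused kernels; Inductor is what actually delivers the reduce-overhead and max-autotune speedups. A minimal usage sketch follows; it assumes handler.py is importable, and the repo id, dtype, step count, and prompt are placeholders, not values taken from this commit:

# Minimal sketch, not from the commit: run the compiled loader twice to show
# the one-time warm-up cost. Repo id, dtype, steps, and prompt are placeholders.
import torch
from handler import load_pipeline_compile  # assumes handler.py is on the path

pipe = load_pipeline_compile("your-org/your-model", torch.bfloat16)

# The first call pays the Inductor compilation cost; subsequent calls with the
# same shapes reuse the compiled graphs (dynamic=False assumes fixed shapes).
for _ in range(2):
    image = pipe("a sample prompt", num_inference_steps=28).images[0]
image.save("out.png")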
requirements.txt CHANGED
@@ -1,5 +1,5 @@
 huggingface_hub
-torch==2.4.0
+torch>=2.4.0
 torchvision
 torchao==0.9.0
 diffusers==0.32.2
@@ -11,5 +11,4 @@ scipy
 Pillow
 sentencepiece
 protobuf
-pytorch-lightning
 triton
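With torch relaxed from ==2.4.0 to >=2.4.0, the endpoint can pick up newer torch releases at build time while keeping 2.4.0 as the floor; the unused pytorch-lightning pin is dropped. A small illustrative guard, not part of this commit, that fails fast if the resolved torch ever falls below that floor:

# Illustrative version guard, not part of this commit; `packaging` is an
# assumed extra dependency used only for this check.
import torch
from packaging.version import Version

installed = Version(torch.__version__.split("+")[0])  # strip local tag, e.g. "+cu124"
if installed < Version("2.4.0"):
    raise RuntimeError(f"torch>=2.4.0 required, found {torch.__version__}")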