Huiwenshi committed
Commit 42f2cb6 · verified · 1 Parent(s): a251afa

Upload folder using huggingface_hub
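
The commit message indicates the folder was pushed with the huggingface_hub client. A minimal sketch of how such an upload is typically issued; the local path and target repo id below are assumptions, not values taken from this commit:

from huggingface_hub import HfApi

api = HfApi()  # picks up the token from `huggingface-cli login` or the HF_TOKEN env var
api.upload_folder(
    folder_path="./hunyuan3d-paint-v2-0",   # assumed local folder matching the paths in this diff
    repo_id="your-org/your-repo",           # hypothetical target repo
    path_in_repo="hunyuan3d-paint-v2-0",
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)
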
hunyuan3d-paint-v2-0/.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
hunyuan3d-paint-v2-0/feature_extractor/preprocessor_config.json ADDED
@@ -0,0 +1,20 @@
+ {
+   "crop_size": 224,
+   "do_center_crop": true,
+   "do_convert_rgb": true,
+   "do_normalize": true,
+   "do_resize": true,
+   "feature_extractor_type": "CLIPFeatureExtractor",
+   "image_mean": [
+     0.48145466,
+     0.4578275,
+     0.40821073
+   ],
+   "image_std": [
+     0.26862954,
+     0.26130258,
+     0.27577711
+   ],
+   "resample": 3,
+   "size": 224
+ }
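
This feature-extractor config is the standard CLIP preprocessing (224x224 center crop, CLIP mean/std, bicubic resample). A minimal loading sketch, assuming the repo has been downloaded to a local folder named hunyuan3d-paint-v2-0:

from PIL import Image
from transformers import CLIPImageProcessor

# Reads crop_size / image_mean / image_std from feature_extractor/preprocessor_config.json.
processor = CLIPImageProcessor.from_pretrained(
    "hunyuan3d-paint-v2-0", subfolder="feature_extractor"
)
pixel_values = processor(images=Image.open("cond.png"), return_tensors="pt").pixel_values  # "cond.png" is a placeholder
print(pixel_values.shape)  # torch.Size([1, 3, 224, 224])
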
hunyuan3d-paint-v2-0/model_index.json ADDED
@@ -0,0 +1,33 @@
+ {
+   "_class_name": "StableDiffusionPipeline",
+   "_diffusers_version": "0.23.1",
+   "feature_extractor": [
+     "transformers",
+     "CLIPImageProcessor"
+   ],
+   "requires_safety_checker": false,
+   "safety_checker": [
+     null,
+     null
+   ],
+   "scheduler": [
+     "diffusers",
+     "DDIMScheduler"
+   ],
+   "text_encoder": [
+     "transformers",
+     "CLIPTextModel"
+   ],
+   "tokenizer": [
+     "transformers",
+     "CLIPTokenizer"
+   ],
+   "unet": [
+     "modules",
+     "UNet2p5DConditionModel"
+   ],
+   "vae": [
+     "diffusers",
+     "AutoencoderKL"
+   ]
+ }
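
model_index.json wires the pipeline together: stock diffusers/transformers components plus a custom UNet entry ("modules", "UNet2p5DConditionModel") that lives in unet/modules.py, so a plain DiffusionPipeline.from_pretrained will not resolve the unet on its own. A hedged sketch of loading the stock sub-components individually (local path assumed):

from diffusers import AutoencoderKL, DDIMScheduler
from transformers import CLIPTextModel, CLIPTokenizer

root = "hunyuan3d-paint-v2-0"  # assumed local checkout of this folder
vae = AutoencoderKL.from_pretrained(root, subfolder="vae")
scheduler = DDIMScheduler.from_pretrained(root, subfolder="scheduler")
text_encoder = CLIPTextModel.from_pretrained(root, subfolder="text_encoder")
tokenizer = CLIPTokenizer.from_pretrained(root, subfolder="tokenizer")
# The custom UNet is loaded through unet/modules.py; see UNet2p5DConditionModel.from_pretrained below.
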
hunyuan3d-paint-v2-0/scheduler/scheduler_config.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "_class_name": "DDIMScheduler",
+   "_diffusers_version": "0.23.1",
+   "beta_end": 0.012,
+   "beta_schedule": "scaled_linear",
+   "beta_start": 0.00085,
+   "clip_sample": false,
+   "num_train_timesteps": 1000,
+   "prediction_type": "v_prediction",
+   "set_alpha_to_one": true,
+   "steps_offset": 1,
+   "trained_betas": null,
+   "timestep_spacing": "trailing",
+   "rescale_betas_zero_snr": true
+ }
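
The scheduler is configured for v-prediction with trailing timestep spacing and zero-terminal-SNR beta rescaling. A small sketch constructing the same scheduler directly; DDIMScheduler.from_pretrained(root, subfolder="scheduler") would load identical values from disk:

from diffusers import DDIMScheduler

scheduler = DDIMScheduler(
    num_train_timesteps=1000,
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear",
    clip_sample=False,
    set_alpha_to_one=True,
    steps_offset=1,
    prediction_type="v_prediction",
    timestep_spacing="trailing",
    rescale_betas_zero_snr=True,
)
scheduler.set_timesteps(30)  # e.g. 30 inference steps; the step count is an example, not from this commit
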
hunyuan3d-paint-v2-0/text_encoder/config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "_name_or_path": "stabilityai/stable-diffusion-2",
+   "architectures": [
+     "CLIPTextModel"
+   ],
+   "attention_dropout": 0.0,
+   "bos_token_id": 0,
+   "dropout": 0.0,
+   "eos_token_id": 2,
+   "hidden_act": "gelu",
+   "hidden_size": 1024,
+   "initializer_factor": 1.0,
+   "initializer_range": 0.02,
+   "intermediate_size": 4096,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 77,
+   "model_type": "clip_text_model",
+   "num_attention_heads": 16,
+   "num_hidden_layers": 23,
+   "pad_token_id": 1,
+   "projection_dim": 512,
+   "torch_dtype": "float32",
+   "transformers_version": "4.25.0.dev0",
+   "vocab_size": 49408
+ }
hunyuan3d-paint-v2-0/text_encoder/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c3e254d7b61353497ea0be2c4013df4ea8f739ee88cffa0ba58cd085459ed565
+ size 1361671895
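
The weight files in this commit are Git LFS pointers (spec version, sha256 oid, byte size); the ~1.36 GB payload itself sits in LFS storage. A sketch of fetching the resolved file through the Hub client rather than raw git; the repo id and the in-repo path prefix are assumptions:

from huggingface_hub import hf_hub_download

# Downloads the resolved LFS object, not the 3-line pointer file.
path = hf_hub_download(
    repo_id="your-org/your-repo",                                   # hypothetical repo hosting this folder
    filename="hunyuan3d-paint-v2-0/text_encoder/pytorch_model.bin",  # assumed path inside the repo
)
print(path)
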
hunyuan3d-paint-v2-0/tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
hunyuan3d-paint-v2-0/tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "bos_token": {
+     "content": "<|startoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": "!",
+   "unk_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
hunyuan3d-paint-v2-0/tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,34 @@
+ {
+   "add_prefix_space": false,
+   "bos_token": {
+     "__type": "AddedToken",
+     "content": "<|startoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "do_lower_case": true,
+   "eos_token": {
+     "__type": "AddedToken",
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "errors": "replace",
+   "model_max_length": 77,
+   "name_or_path": "stabilityai/stable-diffusion-2",
+   "pad_token": "<|endoftext|>",
+   "special_tokens_map_file": "./special_tokens_map.json",
+   "tokenizer_class": "CLIPTokenizer",
+   "unk_token": {
+     "__type": "AddedToken",
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
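
The tokenizer and text encoder are the stock Stable Diffusion 2 text stack (OpenCLIP ViT-H derived, 77-token context, 1024-dim hidden states). A short sketch producing a 77x1024 text embedding of the shape the UNet cross-attention expects; the prompt string and local path are placeholders:

import torch
from transformers import CLIPTokenizer, CLIPTextModel

root = "hunyuan3d-paint-v2-0"  # assumed local checkout
tokenizer = CLIPTokenizer.from_pretrained(root, subfolder="tokenizer")
text_encoder = CLIPTextModel.from_pretrained(root, subfolder="text_encoder")

tokens = tokenizer(
    "a photo of a ceramic mug",  # placeholder prompt
    padding="max_length",
    max_length=tokenizer.model_max_length,
    truncation=True,
    return_tensors="pt",
)
with torch.no_grad():
    prompt_embeds = text_encoder(tokens.input_ids)[0]
print(prompt_embeds.shape)  # torch.Size([1, 77, 1024])
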
hunyuan3d-paint-v2-0/tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
hunyuan3d-paint-v2-0/unet/config.json ADDED
@@ -0,0 +1,45 @@
+ {
+   "_class_name": "UNet2DConditionModel",
+   "_diffusers_version": "0.10.0.dev0",
+   "act_fn": "silu",
+   "attention_head_dim": [
+     5,
+     10,
+     20,
+     20
+   ],
+   "block_out_channels": [
+     320,
+     640,
+     1280,
+     1280
+   ],
+   "center_input_sample": false,
+   "cross_attention_dim": 1024,
+   "down_block_types": [
+     "CrossAttnDownBlock2D",
+     "CrossAttnDownBlock2D",
+     "CrossAttnDownBlock2D",
+     "DownBlock2D"
+   ],
+   "downsample_padding": 1,
+   "dual_cross_attention": false,
+   "flip_sin_to_cos": true,
+   "freq_shift": 0,
+   "in_channels": 4,
+   "layers_per_block": 2,
+   "mid_block_scale_factor": 1,
+   "norm_eps": 1e-05,
+   "norm_num_groups": 32,
+   "num_class_embeds": null,
+   "only_cross_attention": false,
+   "out_channels": 4,
+   "sample_size": 64,
+   "up_block_types": [
+     "UpBlock2D",
+     "CrossAttnUpBlock2D",
+     "CrossAttnUpBlock2D",
+     "CrossAttnUpBlock2D"
+   ],
+   "use_linear_projection": true
+ }
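
The base UNet is the SD2-style UNet2DConditionModel (cross_attention_dim 1024, in_channels 4). unet/modules.py below later widens conv_in to 12 input channels (noisy latents concatenated with normal and position latents) and adds a class embedding for camera indices. A minimal sketch of instantiating just the base config, assuming a local checkout:

from diffusers import UNet2DConditionModel

config = UNet2DConditionModel.load_config("hunyuan3d-paint-v2-0", subfolder="unet")
unet = UNet2DConditionModel.from_config(config)  # bare SD2-style UNet, no custom attention yet
print(unet.conv_in.in_channels)  # 4 here; modules.py replaces conv_in with a 12-channel layer
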
hunyuan3d-paint-v2-0/unet/diffusion_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2efb4438e649e1ce152822dde082defe062cdd320bb9f1a27b9a4715d9c56e1a
+ size 3663114747
hunyuan3d-paint-v2-0/unet/modules.py ADDED
@@ -0,0 +1,437 @@
+ import os
+ import json
+ from typing import Any, Dict, Optional
+ from diffusers.models import UNet2DConditionModel
+
+ import numpy
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import torch.utils.checkpoint
+ import torch.distributed
+ from PIL import Image
+ from einops import rearrange
+ from typing import Any, Callable, Dict, List, Optional, Union, Tuple
+
+ import diffusers
+ from diffusers import (
+     AutoencoderKL,
+     DDPMScheduler,
+     DiffusionPipeline,
+     EulerAncestralDiscreteScheduler,
+     UNet2DConditionModel,
+     ImagePipelineOutput
+ )
+ from diffusers.image_processor import VaeImageProcessor
+ from diffusers.models.attention_processor import Attention, AttnProcessor, XFormersAttnProcessor, AttnProcessor2_0
+ from diffusers.utils.import_utils import is_xformers_available
+
+
+ from diffusers.utils import deprecate
+
+ from diffusers.models.transformers.transformer_2d import BasicTransformerBlock
+
+
+
+ def _chunked_feed_forward(ff: nn.Module, hidden_states: torch.Tensor, chunk_dim: int, chunk_size: int):
+     # "feed_forward_chunk_size" can be used to save memory
+     if hidden_states.shape[chunk_dim] % chunk_size != 0:
+         raise ValueError(
+             f"`hidden_states` dimension to be chunked: {hidden_states.shape[chunk_dim]} has to be divisible by chunk size: {chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`."
+         )
+
+     num_chunks = hidden_states.shape[chunk_dim] // chunk_size
+     ff_output = torch.cat(
+         [ff(hid_slice) for hid_slice in hidden_states.chunk(num_chunks, dim=chunk_dim)],
+         dim=chunk_dim,
+     )
+     return ff_output
+
+
+ class Basic2p5DTransformerBlock(torch.nn.Module):
+     def __init__(self, transformer: BasicTransformerBlock, layer_name, use_ma=True, use_ra=True) -> None:
+         super().__init__()
+         self.transformer = transformer
+         self.layer_name = layer_name
+         self.use_ma = use_ma
+         self.use_ra = use_ra
+
+         # multiview attn
+         if self.use_ma:
+             self.attn_multiview = Attention(
+                 query_dim=self.dim,
+                 heads=self.num_attention_heads,
+                 dim_head=self.attention_head_dim,
+                 dropout=self.dropout,
+                 bias=self.attention_bias,
+                 cross_attention_dim=None,
+                 upcast_attention=self.attn1.upcast_attention,
+                 out_bias=True,
+             )
+
+         # ref attn
+         if self.use_ra:
+             self.attn_refview = Attention(
+                 query_dim=self.dim,
+                 heads=self.num_attention_heads,
+                 dim_head=self.attention_head_dim,
+                 dropout=self.dropout,
+                 bias=self.attention_bias,
+                 cross_attention_dim=None,
+                 upcast_attention=self.attn1.upcast_attention,
+                 out_bias=True,
+             )
+
+     def __getattr__(self, name: str):
+         try:
+             return super().__getattr__(name)
+         except AttributeError:
+             return getattr(self.transformer, name)
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         attention_mask: Optional[torch.Tensor] = None,
+         encoder_hidden_states: Optional[torch.Tensor] = None,
+         encoder_attention_mask: Optional[torch.Tensor] = None,
+         timestep: Optional[torch.LongTensor] = None,
+         cross_attention_kwargs: Dict[str, Any] = None,
+         class_labels: Optional[torch.LongTensor] = None,
+         added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+     ) -> torch.Tensor:
+
+         # Notice that normalization is always applied before the real computation in the following blocks.
+         # 0. Self-Attention
+         batch_size = hidden_states.shape[0]
+
+         cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
+         num_in_batch = cross_attention_kwargs.pop('num_in_batch', 1)
+         mode = cross_attention_kwargs.pop('mode', None)
+         mva_scale = cross_attention_kwargs.pop('mva_scale', 1.0)
+         ref_scale = cross_attention_kwargs.pop('ref_scale', 1.0)
+         condition_embed_dict = cross_attention_kwargs.pop("condition_embed_dict", None)
+
+
+         if self.norm_type == "ada_norm":
+             norm_hidden_states = self.norm1(hidden_states, timestep)
+         elif self.norm_type == "ada_norm_zero":
+             norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
+                 hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
+             )
+         elif self.norm_type in ["layer_norm", "layer_norm_i2vgen"]:
+             norm_hidden_states = self.norm1(hidden_states)
+         elif self.norm_type == "ada_norm_continuous":
+             norm_hidden_states = self.norm1(hidden_states, added_cond_kwargs["pooled_text_emb"])
+         elif self.norm_type == "ada_norm_single":
+             shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
+                 self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1)
+             ).chunk(6, dim=1)
+             norm_hidden_states = self.norm1(hidden_states)
+             norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
+         else:
+             raise ValueError("Incorrect norm used")
+
+         if self.pos_embed is not None:
+             norm_hidden_states = self.pos_embed(norm_hidden_states)
+
+         # 1. Prepare GLIGEN inputs
+         cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
+         gligen_kwargs = cross_attention_kwargs.pop("gligen", None)
+
+         attn_output = self.attn1(
+             norm_hidden_states,
+             encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
+             attention_mask=attention_mask,
+             **cross_attention_kwargs,
+         )
+
+         if self.norm_type == "ada_norm_zero":
+             attn_output = gate_msa.unsqueeze(1) * attn_output
+         elif self.norm_type == "ada_norm_single":
+             attn_output = gate_msa * attn_output
+
+         hidden_states = attn_output + hidden_states
+         if hidden_states.ndim == 4:
+             hidden_states = hidden_states.squeeze(1)
+
+         # 1.2 Reference Attention
+         if 'w' in mode:
+             condition_embed_dict[self.layer_name] = rearrange(norm_hidden_states, '(b n) l c -> b (n l) c', n=num_in_batch)  # B, (N L), C
+
+         if 'r' in mode and self.use_ra:
+             condition_embed = condition_embed_dict[self.layer_name].unsqueeze(1).repeat(1, num_in_batch, 1, 1)  # B N L C
+             condition_embed = rearrange(condition_embed, 'b n l c -> (b n) l c')
+
+             attn_output = self.attn_refview(
+                 norm_hidden_states,
+                 encoder_hidden_states=condition_embed,
+                 attention_mask=None,
+                 **cross_attention_kwargs
+             )
+             ref_scale_timing = ref_scale
+             if isinstance(ref_scale, torch.Tensor):
+                 ref_scale_timing = ref_scale.unsqueeze(1).repeat(1, num_in_batch).view(-1)
+                 for _ in range(attn_output.ndim - 1):
+                     ref_scale_timing = ref_scale_timing.unsqueeze(-1)
+             hidden_states = ref_scale_timing * attn_output + hidden_states
+             if hidden_states.ndim == 4:
+                 hidden_states = hidden_states.squeeze(1)
+
+
+         # 1.3 Multiview Attention
+         if num_in_batch > 1 and self.use_ma:
+             multiview_hidden_states = rearrange(norm_hidden_states, '(b n) l c -> b (n l) c', n=num_in_batch)
+
+             attn_output = self.attn_multiview(
+                 multiview_hidden_states,
+                 encoder_hidden_states=multiview_hidden_states,
+                 **cross_attention_kwargs
+             )
+
+             attn_output = rearrange(attn_output, 'b (n l) c -> (b n) l c', n=num_in_batch)
+
+             hidden_states = mva_scale * attn_output + hidden_states
+             if hidden_states.ndim == 4:
+                 hidden_states = hidden_states.squeeze(1)
+
+         # 2. GLIGEN Control
+         if gligen_kwargs is not None:
+             hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"])
+
+         # 3. Cross-Attention
+         if self.attn2 is not None:
+             if self.norm_type == "ada_norm":
+                 norm_hidden_states = self.norm2(hidden_states, timestep)
+             elif self.norm_type in ["ada_norm_zero", "layer_norm", "layer_norm_i2vgen"]:
+                 norm_hidden_states = self.norm2(hidden_states)
+             elif self.norm_type == "ada_norm_single":
+                 # For PixArt norm2 isn't applied here:
+                 # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
+                 norm_hidden_states = hidden_states
+             elif self.norm_type == "ada_norm_continuous":
+                 norm_hidden_states = self.norm2(hidden_states, added_cond_kwargs["pooled_text_emb"])
+             else:
+                 raise ValueError("Incorrect norm")
+
+             if self.pos_embed is not None and self.norm_type != "ada_norm_single":
+                 norm_hidden_states = self.pos_embed(norm_hidden_states)
+
+
+             attn_output = self.attn2(
+                 norm_hidden_states,
+                 encoder_hidden_states=encoder_hidden_states,
+                 attention_mask=encoder_attention_mask,
+                 **cross_attention_kwargs,
+             )
+
+             hidden_states = attn_output + hidden_states
+
+         # 4. Feed-forward
+         # i2vgen doesn't have this norm 🤷‍♂️
+         if self.norm_type == "ada_norm_continuous":
+             norm_hidden_states = self.norm3(hidden_states, added_cond_kwargs["pooled_text_emb"])
+         elif not self.norm_type == "ada_norm_single":
+             norm_hidden_states = self.norm3(hidden_states)
+
+         if self.norm_type == "ada_norm_zero":
+             norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
+
+         if self.norm_type == "ada_norm_single":
+             norm_hidden_states = self.norm2(hidden_states)
+             norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
+
+         if self._chunk_size is not None:
+             # "feed_forward_chunk_size" can be used to save memory
+             ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
+         else:
+             ff_output = self.ff(norm_hidden_states)
+
+         if self.norm_type == "ada_norm_zero":
+             ff_output = gate_mlp.unsqueeze(1) * ff_output
+         elif self.norm_type == "ada_norm_single":
+             ff_output = gate_mlp * ff_output
+
+         hidden_states = ff_output + hidden_states
+         if hidden_states.ndim == 4:
+             hidden_states = hidden_states.squeeze(1)
+
+         return hidden_states
+
+ import copy
+ class UNet2p5DConditionModel(torch.nn.Module):
+     def __init__(self, unet: UNet2DConditionModel) -> None:
+         super().__init__()
+         self.unet = unet
+
+         self.use_ma = True
+         self.use_ra = True
+         self.use_camera_embedding = True
+         self.use_dual_stream = True
+
+         if self.use_dual_stream:
+             self.unet_dual = copy.deepcopy(unet)
+             self.init_attention(self.unet_dual)
+         self.init_attention(self.unet, use_ma=self.use_ma, use_ra=self.use_ra)
+         self.init_condition()
+         self.init_camera_embedding()
+
+
+     @staticmethod
+     def from_pretrained(pretrained_model_name_or_path, **kwargs):
+         torch_dtype = kwargs.pop('torch_dtype', torch.float32)
+         config_path = os.path.join(pretrained_model_name_or_path, 'config.json')
+         unet_ckpt_path = os.path.join(pretrained_model_name_or_path, 'diffusion_pytorch_model.bin')
+         with open(config_path, 'r', encoding='utf-8') as file:
+             config = json.load(file)
+         unet = UNet2DConditionModel(**config)
+         unet = UNet2p5DConditionModel(unet)
+         unet_ckpt = torch.load(unet_ckpt_path, map_location='cpu', weights_only=True)
+         unet.load_state_dict(unet_ckpt, strict=True)
+         unet = unet.to(torch_dtype)
+         return unet
+
+     def init_condition(self):
+         self.unet.conv_in = torch.nn.Conv2d(
+             12,
+             self.unet.conv_in.out_channels,
+             kernel_size=self.unet.conv_in.kernel_size,
+             stride=self.unet.conv_in.stride,
+             padding=self.unet.conv_in.padding,
+             dilation=self.unet.conv_in.dilation,
+             groups=self.unet.conv_in.groups,
+             bias=self.unet.conv_in.bias is not None)
+         self.unet.learned_text_clip_gen = nn.Parameter(torch.randn(1, 77, 1024))
+         self.unet.learned_text_clip_ref = nn.Parameter(torch.randn(1, 77, 1024))
+
+     def init_camera_embedding(self):
+
+         self.max_num_ref_image = 5
+         self.max_num_gen_image = 12 * 3 + 4 * 2
+
+         if self.use_camera_embedding:
+             time_embed_dim = 1280
+             self.unet.class_embedding = nn.Embedding(self.max_num_ref_image + self.max_num_gen_image, time_embed_dim)
+
+
+     def init_attention(self, unet, use_ma=False, use_ra=False):
+
+         for down_block_i, down_block in enumerate(unet.down_blocks):
+             if hasattr(down_block, "has_cross_attention") and down_block.has_cross_attention:
+                 for attn_i, attn in enumerate(down_block.attentions):
+                     for transformer_i, transformer in enumerate(attn.transformer_blocks):
+                         if isinstance(transformer, BasicTransformerBlock):
+                             attn.transformer_blocks[transformer_i] = Basic2p5DTransformerBlock(transformer, f'down_{down_block_i}_{attn_i}_{transformer_i}', use_ma, use_ra)
+
+
+         if hasattr(unet.mid_block, "has_cross_attention") and unet.mid_block.has_cross_attention:
+             for attn_i, attn in enumerate(unet.mid_block.attentions):
+                 for transformer_i, transformer in enumerate(attn.transformer_blocks):
+                     if isinstance(transformer, BasicTransformerBlock):
+                         attn.transformer_blocks[transformer_i] = Basic2p5DTransformerBlock(transformer, f'mid_{attn_i}_{transformer_i}', use_ma, use_ra)
+
+         for up_block_i, up_block in enumerate(unet.up_blocks):
+             if hasattr(up_block, "has_cross_attention") and up_block.has_cross_attention:
+                 for attn_i, attn in enumerate(up_block.attentions):
+                     for transformer_i, transformer in enumerate(attn.transformer_blocks):
+                         if isinstance(transformer, BasicTransformerBlock):
+                             attn.transformer_blocks[transformer_i] = Basic2p5DTransformerBlock(transformer, f'up_{up_block_i}_{attn_i}_{transformer_i}', use_ma, use_ra)
+
+
+     def __getattr__(self, name: str):
+         try:
+             return super().__getattr__(name)
+         except AttributeError:
+             return getattr(self.unet, name)
+
+     def forward(
+         self, sample, timestep, encoder_hidden_states,
+         *args, down_intrablock_additional_residuals=None,
+         down_block_res_samples=None, mid_block_res_sample=None,
+         **cached_condition,
+     ):
+         B, N_gen, _, H, W = sample.shape
+         assert H == W
+
+         if self.use_camera_embedding:
+             camera_info_gen = cached_condition['camera_info_gen'] + self.max_num_ref_image
+             camera_info_gen = rearrange(camera_info_gen, 'b n -> (b n)')
+         else:
+             camera_info_gen = None
+
+         sample = [sample]
+         if 'normal_imgs' in cached_condition:
+             sample.append(cached_condition["normal_imgs"])
+         if 'position_imgs' in cached_condition:
+             sample.append(cached_condition["position_imgs"])
+         sample = torch.cat(sample, dim=2)
+
+         sample = rearrange(sample, 'b n c h w -> (b n) c h w')
+
+         encoder_hidden_states_gen = encoder_hidden_states.unsqueeze(1).repeat(1, N_gen, 1, 1)
+         encoder_hidden_states_gen = rearrange(encoder_hidden_states_gen, 'b n l c -> (b n) l c')
+
+         if self.use_ra:
+             if 'condition_embed_dict' in cached_condition:
+                 condition_embed_dict = cached_condition['condition_embed_dict']
+             else:
+                 condition_embed_dict = {}
+                 ref_latents = cached_condition['ref_latents']
+                 N_ref = ref_latents.shape[1]
+                 if self.use_camera_embedding:
+                     camera_info_ref = cached_condition['camera_info_ref']
+                     camera_info_ref = rearrange(camera_info_ref, 'b n -> (b n)')
+                 else:
+                     camera_info_ref = None
+
+                 ref_latents = rearrange(ref_latents, 'b n c h w -> (b n) c h w')
+
+                 encoder_hidden_states_ref = self.unet.learned_text_clip_ref.unsqueeze(1).repeat(B, N_ref, 1, 1)
+                 encoder_hidden_states_ref = rearrange(encoder_hidden_states_ref, 'b n l c -> (b n) l c')
+
+                 noisy_ref_latents = ref_latents
+                 timestep_ref = 0
+
+                 if self.use_dual_stream:
+                     unet_ref = self.unet_dual
+                 else:
+                     unet_ref = self.unet
+                 unet_ref(
+                     noisy_ref_latents, timestep_ref,
+                     encoder_hidden_states=encoder_hidden_states_ref,
+                     class_labels=camera_info_ref,
+                     # **kwargs
+                     return_dict=False,
+                     cross_attention_kwargs={
+                         'mode': 'w', 'num_in_batch': N_ref,
+                         'condition_embed_dict': condition_embed_dict},
+                 )
+                 cached_condition['condition_embed_dict'] = condition_embed_dict
+         else:
+             condition_embed_dict = None
+
+
+         mva_scale = cached_condition.get('mva_scale', 1.0)
+         ref_scale = cached_condition.get('ref_scale', 1.0)
+
+         return self.unet(
+             sample, timestep,
+             encoder_hidden_states_gen, *args,
+             class_labels=camera_info_gen,
+             down_intrablock_additional_residuals=[
+                 sample.to(dtype=self.unet.dtype) for sample in down_intrablock_additional_residuals
+             ] if down_intrablock_additional_residuals is not None else None,
+             down_block_additional_residuals=[
+                 sample.to(dtype=self.unet.dtype) for sample in down_block_res_samples
+             ] if down_block_res_samples is not None else None,
+             mid_block_additional_residual=(
+                 mid_block_res_sample.to(dtype=self.unet.dtype)
+                 if mid_block_res_sample is not None else None
+             ),
+             return_dict=False,
+             cross_attention_kwargs={
+                 'mode': 'r', 'num_in_batch': N_gen,
+                 'condition_embed_dict': condition_embed_dict,
+                 'mva_scale': mva_scale,
+                 'ref_scale': ref_scale,
+             },
+         )
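
For reference, the custom UNet is loaded from this subfolder via its own from_pretrained (defined above): it builds the base UNet2DConditionModel from config.json, wraps it, and loads diffusion_pytorch_model.bin with strict=True. A usage sketch, assuming the folder has been downloaded locally and unet/modules.py is importable as modules:

import torch
from modules import UNet2p5DConditionModel  # i.e. hunyuan3d-paint-v2-0/unet/modules.py

unet = UNet2p5DConditionModel.from_pretrained(
    "hunyuan3d-paint-v2-0/unet",   # assumed local path to this subfolder
    torch_dtype=torch.float16,
)
# The wrapper exposes the patched base UNet: 12-channel conv_in, a camera class embedding,
# and the extra multiview / reference attention layers inserted by init_attention().
print(unet.conv_in.in_channels)  # 12
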
hunyuan3d-paint-v2-0/vae/config.json ADDED
@@ -0,0 +1,29 @@
+ {
+   "_class_name": "AutoencoderKL",
+   "_diffusers_version": "0.10.0.dev0",
+   "act_fn": "silu",
+   "block_out_channels": [
+     128,
+     256,
+     512,
+     512
+   ],
+   "down_block_types": [
+     "DownEncoderBlock2D",
+     "DownEncoderBlock2D",
+     "DownEncoderBlock2D",
+     "DownEncoderBlock2D"
+   ],
+   "in_channels": 3,
+   "latent_channels": 4,
+   "layers_per_block": 2,
+   "norm_num_groups": 32,
+   "out_channels": 3,
+   "sample_size": 768,
+   "up_block_types": [
+     "UpDecoderBlock2D",
+     "UpDecoderBlock2D",
+     "UpDecoderBlock2D",
+     "UpDecoderBlock2D"
+   ],
+ }
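
The VAE is the standard SD2-style KL autoencoder (4 latent channels, 8x spatial downsampling from three downsampling stages). A short encode/decode sketch; the 0.18215 latent scaling factor is the usual Stable Diffusion default and is an assumption here, since this config does not list one:

import torch
from diffusers import AutoencoderKL

vae = AutoencoderKL.from_pretrained("hunyuan3d-paint-v2-0", subfolder="vae")  # assumed local path
scaling_factor = 0.18215  # assumed default; not specified in this config.json

image = torch.randn(1, 3, 512, 512)  # placeholder RGB batch scaled to [-1, 1]
with torch.no_grad():
    latents = vae.encode(image).latent_dist.sample() * scaling_factor   # -> (1, 4, 64, 64)
    decoded = vae.decode(latents / scaling_factor).sample               # -> (1, 3, 512, 512)
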
hunyuan3d-paint-v2-0/vae/diffusion_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1b4889b6b1d4ce7ae320a02dedaeff1780ad77d415ea0d744b476155c6377ddc
+ size 334707217