Koke_Cacao committed
Commit 1a29d24 • 1 Parent(s): 9db01cb
:bug: fix for sd 2.1

scripts/convert_mvdream_to_diffusers.py CHANGED

@@ -27,105 +27,6 @@ from transformers import CLIPTokenizer, CLIPTextModel
 
 logger = logging.get_logger(__name__)
 
-# def create_unet_diffusers_config(original_config, image_size: int, controlnet=False):
-#     """
-#     Creates a config for the diffusers based on the config of the LDM model.
-#     """
-#     if controlnet:
-#         unet_params = original_config.model.params.control_stage_config.params
-#     else:
-#         if "unet_config" in original_config.model.params and original_config.model.params.unet_config is not None:
-#             unet_params = original_config.model.params.unet_config.params
-#         else:
-#             unet_params = original_config.model.params.network_config.params
-
-#     vae_params = original_config.model.params.first_stage_config.params.ddconfig
-
-#     block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult]
-
-#     down_block_types = []
-#     resolution = 1
-#     for i in range(len(block_out_channels)):
-#         block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D"
-#         down_block_types.append(block_type)
-#         if i != len(block_out_channels) - 1:
-#             resolution *= 2
-
-#     up_block_types = []
-#     for i in range(len(block_out_channels)):
-#         block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D"
-#         up_block_types.append(block_type)
-#         resolution //= 2
-
-#     if unet_params.transformer_depth is not None:
-#         transformer_layers_per_block = (
-#             unet_params.transformer_depth
-#             if isinstance(unet_params.transformer_depth, int)
-#             else list(unet_params.transformer_depth)
-#         )
-#     else:
-#         transformer_layers_per_block = 1
-
-#     vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1)
-
-#     head_dim = unet_params.num_heads if "num_heads" in unet_params else None
-#     use_linear_projection = (
-#         unet_params.use_linear_in_transformer if "use_linear_in_transformer" in unet_params else False
-#     )
-#     if use_linear_projection:
-#         # stable diffusion 2-base-512 and 2-768
-#         if head_dim is None:
-#             head_dim_mult = unet_params.model_channels // unet_params.num_head_channels
-#             head_dim = [head_dim_mult * c for c in list(unet_params.channel_mult)]
-
-#     class_embed_type = None
-#     addition_embed_type = None
-#     addition_time_embed_dim = None
-#     projection_class_embeddings_input_dim = None
-#     context_dim = None
-
-#     if unet_params.context_dim is not None:
-#         context_dim = (
-#             unet_params.context_dim if isinstance(unet_params.context_dim, int) else unet_params.context_dim[0]
-#         )
-
-#     if "num_classes" in unet_params:
-#         if unet_params.num_classes == "sequential":
-#             if context_dim in [2048, 1280]:
-#                 # SDXL
-#                 addition_embed_type = "text_time"
-#                 addition_time_embed_dim = 256
-#             else:
-#                 class_embed_type = "projection"
-#                 assert "adm_in_channels" in unet_params
-#                 projection_class_embeddings_input_dim = unet_params.adm_in_channels
-#         else:
-#             raise NotImplementedError(f"Unknown conditional unet num_classes config: {unet_params.num_classes}")
-
-#     config = {
-#         "sample_size": image_size // vae_scale_factor,
-#         "in_channels": unet_params.in_channels,
-#         "down_block_types": tuple(down_block_types),
-#         "block_out_channels": tuple(block_out_channels),
-#         "layers_per_block": unet_params.num_res_blocks,
-#         "cross_attention_dim": context_dim,
-#         "attention_head_dim": head_dim,
-#         "use_linear_projection": use_linear_projection,
-#         "class_embed_type": class_embed_type,
-#         "addition_embed_type": addition_embed_type,
-#         "addition_time_embed_dim": addition_time_embed_dim,
-#         "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim,
-#         "transformer_layers_per_block": transformer_layers_per_block,
-#     }
-
-#     if controlnet:
-#         config["conditioning_channels"] = unet_params.hint_channels
-#     else:
-#         config["out_channels"] = unet_params.out_channels
-#         config["up_block_types"] = tuple(up_block_types)
-
-#     return config
-
 
 def assign_to_checkpoint(paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None):
     """

@@ -190,291 +91,6 @@ def shave_segments(path, n_shave_prefix_segments=1):
     return ".".join(path.split(".")[:n_shave_prefix_segments])
 
 
-def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
-    """
-    Updates paths inside resnets to the new naming scheme (local renaming)
-    """
-    mapping = []
-    for old_item in old_list:
-        new_item = old_item.replace("in_layers.0", "norm1")
-        new_item = new_item.replace("in_layers.2", "conv1")
-
-        new_item = new_item.replace("out_layers.0", "norm2")
-        new_item = new_item.replace("out_layers.3", "conv2")
-
-        new_item = new_item.replace("emb_layers.1", "time_emb_proj")
-        new_item = new_item.replace("skip_connection", "conv_shortcut")
-
-        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
-
-        mapping.append({"old": old_item, "new": new_item})
-
-    return mapping
-
-
-def renew_attention_paths(old_list, n_shave_prefix_segments=0):
-    """
-    Updates paths inside attentions to the new naming scheme (local renaming)
-    """
-    mapping = []
-    for old_item in old_list:
-        new_item = old_item
-
-        # new_item = new_item.replace('norm.weight', 'group_norm.weight')
-        # new_item = new_item.replace('norm.bias', 'group_norm.bias')
-
-        # new_item = new_item.replace('proj_out.weight', 'proj_attn.weight')
-        # new_item = new_item.replace('proj_out.bias', 'proj_attn.bias')
-
-        # new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
-
-        mapping.append({"old": old_item, "new": new_item})
-
-    return mapping
-
-
-# def convert_ldm_unet_checkpoint(
-#     checkpoint, config, path=None, extract_ema=False, controlnet=False, skip_extract_state_dict=False
-# ):
-#     """
-#     Takes a state dict and a config, and returns a converted checkpoint.
-#     """
-
-#     if skip_extract_state_dict:
-#         unet_state_dict = checkpoint
-#     else:
-#         # extract state_dict for UNet
-#         unet_state_dict = {}
-#         keys = list(checkpoint.keys())
-
-#         if controlnet:
-#             unet_key = "control_model."
-#         else:
-#             unet_key = "model.diffusion_model."
-
-#         # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
-#         if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema:
-#             logger.warning(f"Checkpoint {path} has both EMA and non-EMA weights.")
-#             logger.warning(
-#                 "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
-#                 " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
-#             )
-#             for key in keys:
-#                 if key.startswith("model.diffusion_model"):
-#                     flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
-#                     unet_state_dict[key.replace(unet_key, "")] = checkpoint[flat_ema_key]
-#         else:
-#             if sum(k.startswith("model_ema") for k in keys) > 100:
-#                 logger.warning(
-#                     "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
-#                     " weights (usually better for inference), please make sure to add the `--extract_ema` flag."
-#                 )
-
-#             for key in keys:
-#                 if key.startswith(unet_key):
-#                     unet_state_dict[key.replace(unet_key, "")] = checkpoint[key]
-
-#     new_checkpoint = {}
-
-#     new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
-#     new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
-#     new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
-#     new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]
-
-#     if config["class_embed_type"] is None:
-#         # No parameters to port
-#         ...
-#     elif config["class_embed_type"] == "timestep" or config["class_embed_type"] == "projection":
-#         new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"]
-#         new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"]
-#         new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"]
-#         new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"]
-#     else:
-#         raise NotImplementedError(f"Not implemented `class_embed_type`: {config['class_embed_type']}")
-
-#     if config["addition_embed_type"] == "text_time":
-#         new_checkpoint["add_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"]
-#         new_checkpoint["add_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"]
-#         new_checkpoint["add_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"]
-#         new_checkpoint["add_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"]
-
-#     new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
-#     new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
-
-#     if not controlnet:
-#         new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
-#         new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
-#         new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
-#         new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
-
-#     # Retrieves the keys for the input blocks only
-#     num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
-#     input_blocks = {
-#         layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
-#         for layer_id in range(num_input_blocks)
-#     }
-
-#     # Retrieves the keys for the middle blocks only
-#     num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
-#     middle_blocks = {
-#         layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
-#         for layer_id in range(num_middle_blocks)
-#     }
-
-#     # Retrieves the keys for the output blocks only
-#     num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
-#     output_blocks = {
-#         layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
-#         for layer_id in range(num_output_blocks)
-#     }
-
-#     for i in range(1, num_input_blocks):
-#         block_id = (i - 1) // (config["layers_per_block"] + 1)
-#         layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)
-
-#         resnets = [
-#             key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
-#         ]
-#         attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
-
-#         if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
-#             new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
-#                 f"input_blocks.{i}.0.op.weight"
-#             )
-#             new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
-#                 f"input_blocks.{i}.0.op.bias"
-#             )
-
-#         paths = renew_resnet_paths(resnets)
-#         meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
-#         assign_to_checkpoint(
-#             paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
-#         )
-
-#         if len(attentions):
-#             paths = renew_attention_paths(attentions)
-#             meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"}
-#             assign_to_checkpoint(
-#                 paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
-#             )
-
-#     resnet_0 = middle_blocks[0]
-#     attentions = middle_blocks[1]
-#     resnet_1 = middle_blocks[2]
-
-#     resnet_0_paths = renew_resnet_paths(resnet_0)
-#     assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
-
-#     resnet_1_paths = renew_resnet_paths(resnet_1)
-#     assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
-
-#     attentions_paths = renew_attention_paths(attentions)
-#     meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
-#     assign_to_checkpoint(
-#         attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
-#     )
-
-#     for i in range(num_output_blocks):
-#         block_id = i // (config["layers_per_block"] + 1)
-#         layer_in_block_id = i % (config["layers_per_block"] + 1)
-#         output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
-#         output_block_list = {}
-
-#         for layer in output_block_layers:
-#             layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
-#             if layer_id in output_block_list:
-#                 output_block_list[layer_id].append(layer_name)
-#             else:
-#                 output_block_list[layer_id] = [layer_name]
-
-#         if len(output_block_list) > 1:
-#             resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
-#             attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
-
-#             resnet_0_paths = renew_resnet_paths(resnets)
-#             paths = renew_resnet_paths(resnets)
-
-#             meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
-#             assign_to_checkpoint(
-#                 paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
-#             )
-
-#             output_block_list = {k: sorted(v) for k, v in output_block_list.items()}
-#             if ["conv.bias", "conv.weight"] in output_block_list.values():
-#                 index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
-#                 new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
-#                     f"output_blocks.{i}.{index}.conv.weight"
-#                 ]
-#                 new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
-#                     f"output_blocks.{i}.{index}.conv.bias"
-#                 ]
-
-#                 # Clear attentions as they have been attributed above.
-#                 if len(attentions) == 2:
-#                     attentions = []
-
-#             if len(attentions):
-#                 paths = renew_attention_paths(attentions)
-#                 meta_path = {
-#                     "old": f"output_blocks.{i}.1",
-#                     "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
-#                 }
-#                 assign_to_checkpoint(
-#                     paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
-#                 )
-#         else:
-#             resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
-#             for path in resnet_0_paths:
-#                 old_path = ".".join(["output_blocks", str(i), path["old"]])
-#                 new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])
-
-#                 new_checkpoint[new_path] = unet_state_dict[old_path]
-
-#     if controlnet:
-#         # conditioning embedding
-
-#         orig_index = 0
-
-#         new_checkpoint["controlnet_cond_embedding.conv_in.weight"] = unet_state_dict.pop(
-#             f"input_hint_block.{orig_index}.weight"
-#         )
-#         new_checkpoint["controlnet_cond_embedding.conv_in.bias"] = unet_state_dict.pop(
-#             f"input_hint_block.{orig_index}.bias"
-#         )
-
-#         orig_index += 2
-
-#         diffusers_index = 0
-
-#         while diffusers_index < 6:
-#             new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.weight"] = unet_state_dict.pop(
-#                 f"input_hint_block.{orig_index}.weight"
-#             )
-#             new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.bias"] = unet_state_dict.pop(
-#                 f"input_hint_block.{orig_index}.bias"
-#             )
-#             diffusers_index += 1
-#             orig_index += 2
-
-#         new_checkpoint["controlnet_cond_embedding.conv_out.weight"] = unet_state_dict.pop(
-#             f"input_hint_block.{orig_index}.weight"
-#         )
-#         new_checkpoint["controlnet_cond_embedding.conv_out.bias"] = unet_state_dict.pop(
-#             f"input_hint_block.{orig_index}.bias"
-#         )
-
-#         # down blocks
-#         for i in range(num_input_blocks):
-#             new_checkpoint[f"controlnet_down_blocks.{i}.weight"] = unet_state_dict.pop(f"zero_convs.{i}.0.weight")
-#             new_checkpoint[f"controlnet_down_blocks.{i}.bias"] = unet_state_dict.pop(f"zero_convs.{i}.0.bias")
-
-#         # mid block
-#         new_checkpoint["controlnet_mid_block.weight"] = unet_state_dict.pop("middle_block_out.0.weight")
-#         new_checkpoint["controlnet_mid_block.bias"] = unet_state_dict.pop("middle_block_out.0.bias")
-
-#     return new_checkpoint
-
-
 def create_vae_diffusers_config(original_config, image_size: int):
     """
     Creates a config for the diffusers based on the config of the LDM model.

@@ -706,8 +322,14 @@ def convert_from_original_mvdream_ckpt(checkpoint_path, original_config_file, de
     with init_empty_weights():
         vae = AutoencoderKL(**vae_config)
 
-
-
+    if original_config.model.params.unet_config.params.context_dim == 768:
+        tokenizer: CLIPTokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+        text_encoder: CLIPTextModel = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14").to(device=torch.device("cuda:0"))  # type: ignore
+    elif original_config.model.params.unet_config.params.context_dim == 1024:
+        tokenizer: CLIPTokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2-1", subfolder="tokenizer")
+        text_encoder: CLIPTextModel = CLIPTextModel.from_pretrained("stabilityai/stable-diffusion-2-1", subfolder="text_encoder").to(device=torch.device("cuda:0"))  # type: ignore
+    else:
+        raise ValueError(f"Unknown context_dim: {original_config.model.params.unet_config.params.context_dim}")
 
     for param_name, param in converted_vae_checkpoint.items():
         set_module_tensor_to_device(vae, param_name, "cuda:0", value=param)
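
For context, the added branch keys the tokenizer and text encoder off the UNet's cross-attention width (context_dim): 768 corresponds to the SD 1.x CLIP ViT-L/14 encoder, 1024 to the SD 2.x text encoder shipped in the stabilityai/stable-diffusion-2-1 repo. A minimal standalone sketch of the same dispatch follows; the helper name, the device argument, and the usage comment are illustrative and not part of the committed script.

import torch
from transformers import CLIPTextModel, CLIPTokenizer


def load_text_encoder_for_context_dim(context_dim: int, device: str = "cuda:0"):
    # Illustrative helper: pick the tokenizer/text encoder whose hidden size
    # matches the UNet's cross-attention dimension.
    #   768  -> Stable Diffusion 1.x (openai/clip-vit-large-patch14)
    #   1024 -> Stable Diffusion 2.x (stabilityai/stable-diffusion-2-1)
    if context_dim == 768:
        tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
        text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
    elif context_dim == 1024:
        tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2-1", subfolder="tokenizer")
        text_encoder = CLIPTextModel.from_pretrained("stabilityai/stable-diffusion-2-1", subfolder="text_encoder")
    else:
        raise ValueError(f"Unknown context_dim: {context_dim}")
    return tokenizer, text_encoder.to(torch.device(device))


# Usage with the LDM config loaded by the conversion script, e.g.:
# tokenizer, text_encoder = load_text_encoder_for_context_dim(
#     original_config.model.params.unet_config.params.context_dim
# )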