import os
from diffusers.models import AutoencoderKL, UNet2DConditionModel
from diffusers.models.attention_processor import AttnProcessor
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
from diffusers.schedulers import KarrasDiffusionSchedulers
import torch
import torch.nn.functional as F
import tqdm
import numpy as np
import safetensors
from PIL import Image
from torchvision import transforms
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
from diffusers import StableDiffusionPipeline
from argparse import ArgumentParser
import inspect
from utils.model_utils import get_img, slerp, do_replace_attn
from utils.lora_utils import train_lora, load_lora
from utils.alpha_scheduler import AlphaScheduler
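# Attention-feature replay machinery: StoreProcessor wraps an existing attention
# processor and caches the self-attention hidden states produced while denoising
# an endpoint image; LoadProcessor (below) re-injects a blend of those cached
# states when rendering intermediate frames.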
class StoreProcessor():
def __init__(self, original_processor, value_dict, name):
self.original_processor = original_processor
self.value_dict = value_dict
self.name = name
self.value_dict[self.name] = dict()
self.id = 0
def __call__(self, attn, hidden_states, *args, encoder_hidden_states=None, attention_mask=None, **kwargs):
# Is self attention
if encoder_hidden_states is None:
self.value_dict[self.name][self.id] = hidden_states.detach()
self.id += 1
res = self.original_processor(attn, hidden_states, *args,
encoder_hidden_states=encoder_hidden_states,
attention_mask=attention_mask,
**kwargs)
return res
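# LoadProcessor: for the first (50 * lamd) self-attention calls of a denoising
# pass (50 matches the default number of inference steps), the cached hidden
# states of the two endpoints are linearly interpolated with weight alpha,
# blended with the current hidden states via beta, and handed to the wrapped
# processor as encoder_hidden_states in place of the usual self-attention input.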
class LoadProcessor():
def __init__(self, original_processor, name, img0_dict, img1_dict, alpha, beta=0, lamd=0.6):
super().__init__()
self.original_processor = original_processor
self.name = name
self.img0_dict = img0_dict
self.img1_dict = img1_dict
self.alpha = alpha
self.beta = beta
self.lamd = lamd
self.id = 0
def __call__(self, attn, hidden_states, *args, encoder_hidden_states=None, attention_mask=None, **kwargs):
# Is self attention
if encoder_hidden_states is None:
if self.id < 50 * self.lamd:
map0 = self.img0_dict[self.name][self.id]
map1 = self.img1_dict[self.name][self.id]
cross_map = self.beta * hidden_states + \
(1 - self.beta) * ((1 - self.alpha) * map0 + self.alpha * map1)
# cross_map = self.beta * hidden_states + \
# (1 - self.beta) * slerp(map0, map1, self.alpha)
# cross_map = slerp(slerp(map0, map1, self.alpha),
# hidden_states, self.beta)
# cross_map = hidden_states
# cross_map = torch.cat(
# ((1 - self.alpha) * map0, self.alpha * map1), dim=1)
res = self.original_processor(attn, hidden_states, *args,
encoder_hidden_states=cross_map,
attention_mask=attention_mask,
**kwargs)
else:
res = self.original_processor(attn, hidden_states, *args,
encoder_hidden_states=encoder_hidden_states,
attention_mask=attention_mask,
**kwargs)
self.id += 1
            if self.id == len(self.img0_dict[self.name]):
self.id = 0
else:
res = self.original_processor(attn, hidden_states, *args,
encoder_hidden_states=encoder_hidden_states,
attention_mask=attention_mask,
**kwargs)
return res
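# DiffMorpherPipeline extends StableDiffusionPipeline with deterministic DDIM
# inversion, latent / text-embedding / LoRA interpolation, and the attention
# replay above to produce a smooth morph sequence between two images.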
class DiffMorpherPipeline(StableDiffusionPipeline):
def __init__(self,
vae: AutoencoderKL,
text_encoder: CLIPTextModel,
tokenizer: CLIPTokenizer,
unet: UNet2DConditionModel,
scheduler: KarrasDiffusionSchedulers,
safety_checker: StableDiffusionSafetyChecker,
feature_extractor: CLIPImageProcessor,
image_encoder=None,
requires_safety_checker: bool = True,
):
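        # Newer diffusers releases add an `image_encoder` argument to
        # StableDiffusionPipeline.__init__; inspect the parent signature so this
        # class works with both old and new versions.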
sig = inspect.signature(super().__init__)
params = sig.parameters
if 'image_encoder' in params:
super().__init__(vae, text_encoder, tokenizer, unet, scheduler,
safety_checker, feature_extractor, image_encoder, requires_safety_checker)
else:
super().__init__(vae, text_encoder, tokenizer, unet, scheduler,
safety_checker, feature_extractor, requires_safety_checker)
self.img0_dict = dict()
self.img1_dict = dict()
def inv_step(
self,
model_output: torch.FloatTensor,
timestep: int,
x: torch.FloatTensor,
eta=0.,
verbose=False
):
"""
Inverse sampling for DDIM Inversion
"""
if verbose:
print("timestep: ", timestep)
next_step = timestep
timestep = min(timestep - self.scheduler.config.num_train_timesteps //
self.scheduler.num_inference_steps, 999)
alpha_prod_t = self.scheduler.alphas_cumprod[
timestep] if timestep >= 0 else self.scheduler.final_alpha_cumprod
alpha_prod_t_next = self.scheduler.alphas_cumprod[next_step]
beta_prod_t = 1 - alpha_prod_t
pred_x0 = (x - beta_prod_t**0.5 * model_output) / alpha_prod_t**0.5
pred_dir = (1 - alpha_prod_t_next)**0.5 * model_output
x_next = alpha_prod_t_next**0.5 * pred_x0 + pred_dir
return x_next, pred_x0
@torch.no_grad()
def invert(
self,
image: torch.Tensor,
prompt,
num_inference_steps=50,
num_actual_inference_steps=None,
guidance_scale=1.,
eta=0.0,
**kwds):
"""
        Invert a real image into a noise map with deterministic DDIM inversion.
"""
DEVICE = torch.device(
"cuda") if torch.cuda.is_available() else torch.device("cpu")
batch_size = image.shape[0]
if isinstance(prompt, list):
if batch_size == 1:
image = image.expand(len(prompt), -1, -1, -1)
elif isinstance(prompt, str):
if batch_size > 1:
prompt = [prompt] * batch_size
# text embeddings
text_input = self.tokenizer(
prompt,
padding="max_length",
max_length=77,
return_tensors="pt"
)
text_embeddings = self.text_encoder(text_input.input_ids.to(DEVICE))[0]
print("input text embeddings :", text_embeddings.shape)
# define initial latents
latents = self.image2latent(image)
# unconditional embedding for classifier free guidance
if guidance_scale > 1.:
max_length = text_input.input_ids.shape[-1]
unconditional_input = self.tokenizer(
[""] * batch_size,
padding="max_length",
max_length=77,
return_tensors="pt"
)
unconditional_embeddings = self.text_encoder(
unconditional_input.input_ids.to(DEVICE))[0]
text_embeddings = torch.cat(
[unconditional_embeddings, text_embeddings], dim=0)
print("latents shape: ", latents.shape)
        # iterative sampling
self.scheduler.set_timesteps(num_inference_steps)
print("Valid timesteps: ", reversed(self.scheduler.timesteps))
# print("attributes: ", self.scheduler.__dict__)
latents_list = [latents]
pred_x0_list = [latents]
for i, t in enumerate(tqdm.tqdm(reversed(self.scheduler.timesteps), desc="DDIM Inversion")):
if num_actual_inference_steps is not None and i >= num_actual_inference_steps:
continue
if guidance_scale > 1.:
model_inputs = torch.cat([latents] * 2)
else:
model_inputs = latents
# predict the noise
noise_pred = self.unet(
model_inputs, t, encoder_hidden_states=text_embeddings).sample
if guidance_scale > 1.:
noise_pred_uncon, noise_pred_con = noise_pred.chunk(2, dim=0)
noise_pred = noise_pred_uncon + guidance_scale * \
(noise_pred_con - noise_pred_uncon)
            # one inversion step: x_t -> x_{t+1}
latents, pred_x0 = self.inv_step(noise_pred, t, latents)
latents_list.append(latents)
pred_x0_list.append(pred_x0)
return latents
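    # Note: `invert` above is kept for reference; __call__ below performs
    # inversion through `ddim_inversion`.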
@torch.no_grad()
def ddim_inversion(self, latent, cond):
timesteps = reversed(self.scheduler.timesteps)
with torch.autocast(device_type='cuda', dtype=torch.float32):
for i, t in enumerate(tqdm.tqdm(timesteps, desc="DDIM inversion")):
cond_batch = cond.repeat(latent.shape[0], 1, 1)
alpha_prod_t = self.scheduler.alphas_cumprod[t]
alpha_prod_t_prev = (
self.scheduler.alphas_cumprod[timesteps[i - 1]]
if i > 0 else self.scheduler.final_alpha_cumprod
)
mu = alpha_prod_t ** 0.5
mu_prev = alpha_prod_t_prev ** 0.5
sigma = (1 - alpha_prod_t) ** 0.5
sigma_prev = (1 - alpha_prod_t_prev) ** 0.5
eps = self.unet(
latent, t, encoder_hidden_states=cond_batch).sample
pred_x0 = (latent - sigma_prev * eps) / mu_prev
latent = mu * pred_x0 + sigma * eps
                # if save_latents:
                #     torch.save(latent, os.path.join(save_path, f'noisy_latents_{t}.pt'))
return latent
def step(
self,
model_output: torch.FloatTensor,
timestep: int,
x: torch.FloatTensor,
):
"""
predict the sample of the next step in the denoise process.
"""
prev_timestep = timestep - \
self.scheduler.config.num_train_timesteps // self.scheduler.num_inference_steps
alpha_prod_t = self.scheduler.alphas_cumprod[timestep]
alpha_prod_t_prev = self.scheduler.alphas_cumprod[
prev_timestep] if prev_timestep > 0 else self.scheduler.final_alpha_cumprod
beta_prod_t = 1 - alpha_prod_t
pred_x0 = (x - beta_prod_t**0.5 * model_output) / alpha_prod_t**0.5
pred_dir = (1 - alpha_prod_t_prev)**0.5 * model_output
x_prev = alpha_prod_t_prev**0.5 * pred_x0 + pred_dir
return x_prev, pred_x0
@torch.no_grad()
def image2latent(self, image):
DEVICE = torch.device(
"cuda") if torch.cuda.is_available() else torch.device("cpu")
        if isinstance(image, Image.Image):
image = np.array(image)
image = torch.from_numpy(image).float() / 127.5 - 1
image = image.permute(2, 0, 1).unsqueeze(0)
# input image density range [-1, 1]
latents = self.vae.encode(image.to(DEVICE))['latent_dist'].mean
        latents = latents * 0.18215  # SD v1 VAE latent scaling factor
return latents
@torch.no_grad()
def latent2image(self, latents, return_type='np'):
latents = 1 / 0.18215 * latents.detach()
image = self.vae.decode(latents)['sample']
if return_type == 'np':
image = (image / 2 + 0.5).clamp(0, 1)
image = image.cpu().permute(0, 2, 3, 1).numpy()[0]
image = (image * 255).astype(np.uint8)
elif return_type == "pt":
image = (image / 2 + 0.5).clamp(0, 1)
return image
def latent2image_grad(self, latents):
latents = 1 / 0.18215 * latents
image = self.vae.decode(latents)['sample']
return image # range [-1, 1]
@torch.no_grad()
    def cal_latent(self, num_inference_steps, guidance_scale, unconditioning,
                   img_noise_0, img_noise_1, text_embeddings_0, text_embeddings_1,
                   lora_0, lora_1, alpha, use_lora, fix_lora=None):
        """
        Run DDIM sampling for one interpolation weight alpha: slerp the inverted
        noise maps, lerp the text embeddings, optionally interpolate the LoRAs,
        then denoise to the latent for this frame.
        """
# latents = torch.cos(alpha * torch.pi / 2) * img_noise_0 + \
# torch.sin(alpha * torch.pi / 2) * img_noise_1
# latents = (1 - alpha) * img_noise_0 + alpha * img_noise_1
# latents = latents / ((1 - alpha) ** 2 + alpha ** 2)
        # Spherically interpolate the inverted noise maps (optionally applying
        # AdaIN, controlled by use_adain) and linearly interpolate the text
        # embeddings.
        latents = slerp(img_noise_0, img_noise_1, alpha, self.use_adain)
        text_embeddings = (1 - alpha) * text_embeddings_0 + \
            alpha * text_embeddings_1
self.scheduler.set_timesteps(num_inference_steps)
if use_lora:
if fix_lora is not None:
self.unet = load_lora(self.unet, lora_0, lora_1, fix_lora)
else:
self.unet = load_lora(self.unet, lora_0, lora_1, alpha)
for i, t in enumerate(tqdm.tqdm(self.scheduler.timesteps, desc=f"DDIM Sampler, alpha={alpha}")):
if guidance_scale > 1.:
model_inputs = torch.cat([latents] * 2)
else:
model_inputs = latents
if unconditioning is not None and isinstance(unconditioning, list):
_, text_embeddings = text_embeddings.chunk(2)
text_embeddings = torch.cat(
[unconditioning[i].expand(*text_embeddings.shape), text_embeddings])
# predict the noise
noise_pred = self.unet(
model_inputs, t, encoder_hidden_states=text_embeddings).sample
if guidance_scale > 1.0:
noise_pred_uncon, noise_pred_con = noise_pred.chunk(
2, dim=0)
noise_pred = noise_pred_uncon + guidance_scale * \
(noise_pred_con - noise_pred_uncon)
# compute the previous noise sample x_t -> x_t-1
latents = self.scheduler.step(
noise_pred, t, latents, return_dict=False)[0]
return latents
@torch.no_grad()
def get_text_embeddings(self, prompt, guidance_scale, neg_prompt, batch_size):
DEVICE = torch.device(
"cuda") if torch.cuda.is_available() else torch.device("cpu")
# text embeddings
text_input = self.tokenizer(
prompt,
padding="max_length",
max_length=77,
return_tensors="pt"
)
        text_embeddings = self.text_encoder(text_input.input_ids.to(DEVICE))[0]
if guidance_scale > 1.:
if neg_prompt:
uc_text = neg_prompt
else:
uc_text = ""
unconditional_input = self.tokenizer(
[uc_text] * batch_size,
padding="max_length",
max_length=77,
return_tensors="pt"
)
unconditional_embeddings = self.text_encoder(
unconditional_input.input_ids.to(DEVICE))[0]
text_embeddings = torch.cat(
[unconditional_embeddings, text_embeddings], dim=0)
return text_embeddings
def __call__(
self,
img_0=None,
img_1=None,
img_path_0=None,
img_path_1=None,
prompt_0="",
prompt_1="",
save_lora_dir="./lora",
load_lora_path_0=None,
load_lora_path_1=None,
lora_steps=200,
lora_lr=2e-4,
lora_rank=16,
batch_size=1,
height=512,
width=512,
num_inference_steps=50,
num_actual_inference_steps=None,
guidance_scale=1,
attn_beta=0,
lamd=0.6,
use_lora=True,
use_adain=True,
use_reschedule=True,
output_path="./results",
num_frames=50,
fix_lora=None,
progress=tqdm,
unconditioning=None,
neg_prompt=None,
save_intermediates=False,
**kwds):
# if isinstance(prompt, list):
# batch_size = len(prompt)
# elif isinstance(prompt, str):
# if batch_size > 1:
# prompt = [prompt] * batch_size
self.scheduler.set_timesteps(num_inference_steps)
self.use_lora = use_lora
self.use_adain = use_adain
self.use_reschedule = use_reschedule
self.output_path = output_path
if img_0 is None:
img_0 = Image.open(img_path_0).convert("RGB")
# else:
# img_0 = Image.fromarray(img_0).convert("RGB")
if img_1 is None:
img_1 = Image.open(img_path_1).convert("RGB")
# else:
# img_1 = Image.fromarray(img_1).convert("RGB")
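        # One LoRA is trained (or loaded) per endpoint image; cal_latent and
        # morph() below interpolate between the two LoRAs via load_lora.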
if self.use_lora:
print("Loading lora...")
if not load_lora_path_0:
weight_name = f"{output_path.split('/')[-1]}_lora_0.ckpt"
load_lora_path_0 = save_lora_dir + "/" + weight_name
if not os.path.exists(load_lora_path_0):
train_lora(img_0, prompt_0, save_lora_dir, None, self.tokenizer, self.text_encoder,
self.vae, self.unet, self.scheduler, lora_steps, lora_lr, lora_rank, weight_name=weight_name)
print(f"Load from {load_lora_path_0}.")
if load_lora_path_0.endswith(".safetensors"):
lora_0 = safetensors.torch.load_file(
load_lora_path_0, device="cpu")
else:
lora_0 = torch.load(load_lora_path_0, map_location="cpu")
if not load_lora_path_1:
weight_name = f"{output_path.split('/')[-1]}_lora_1.ckpt"
load_lora_path_1 = save_lora_dir + "/" + weight_name
if not os.path.exists(load_lora_path_1):
train_lora(img_1, prompt_1, save_lora_dir, None, self.tokenizer, self.text_encoder,
self.vae, self.unet, self.scheduler, lora_steps, lora_lr, lora_rank, weight_name=weight_name)
print(f"Load from {load_lora_path_1}.")
if load_lora_path_1.endswith(".safetensors"):
lora_1 = safetensors.torch.load_file(
load_lora_path_1, device="cpu")
else:
lora_1 = torch.load(load_lora_path_1, map_location="cpu")
else:
lora_0 = lora_1 = None
text_embeddings_0 = self.get_text_embeddings(
prompt_0, guidance_scale, neg_prompt, batch_size)
text_embeddings_1 = self.get_text_embeddings(
prompt_1, guidance_scale, neg_prompt, batch_size)
img_0 = get_img(img_0)
img_1 = get_img(img_1)
if self.use_lora:
self.unet = load_lora(self.unet, lora_0, lora_1, 0)
img_noise_0 = self.ddim_inversion(
self.image2latent(img_0), text_embeddings_0)
if self.use_lora:
self.unet = load_lora(self.unet, lora_0, lora_1, 1)
img_noise_1 = self.ddim_inversion(
self.image2latent(img_1), text_embeddings_1)
print("latents shape: ", img_noise_0.shape)
original_processor = list(self.unet.attn_processors.values())[0]
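        # morph(): denoise the two endpoints first with StoreProcessor attached
        # (caching their self-attention features), then render each intermediate
        # frame with LoadProcessor blending those features at the frame's alpha.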
def morph(alpha_list, progress, desc):
images = []
if attn_beta is not None:
if self.use_lora:
self.unet = load_lora(
self.unet, lora_0, lora_1, 0 if fix_lora is None else fix_lora)
attn_processor_dict = {}
for k in self.unet.attn_processors.keys():
if do_replace_attn(k):
if self.use_lora:
attn_processor_dict[k] = StoreProcessor(self.unet.attn_processors[k],
self.img0_dict, k)
else:
attn_processor_dict[k] = StoreProcessor(original_processor,
self.img0_dict, k)
else:
attn_processor_dict[k] = self.unet.attn_processors[k]
self.unet.set_attn_processor(attn_processor_dict)
latents = self.cal_latent(
num_inference_steps,
guidance_scale,
unconditioning,
img_noise_0,
img_noise_1,
text_embeddings_0,
text_embeddings_1,
lora_0,
lora_1,
alpha_list[0],
False,
fix_lora
)
first_image = self.latent2image(latents)
first_image = Image.fromarray(first_image)
if save_intermediates:
first_image.save(f"{self.output_path}/{0:02d}.png")
if self.use_lora:
self.unet = load_lora(
self.unet, lora_0, lora_1, 1 if fix_lora is None else fix_lora)
attn_processor_dict = {}
for k in self.unet.attn_processors.keys():
if do_replace_attn(k):
if self.use_lora:
attn_processor_dict[k] = StoreProcessor(self.unet.attn_processors[k],
self.img1_dict, k)
else:
attn_processor_dict[k] = StoreProcessor(original_processor,
self.img1_dict, k)
else:
attn_processor_dict[k] = self.unet.attn_processors[k]
self.unet.set_attn_processor(attn_processor_dict)
latents = self.cal_latent(
num_inference_steps,
guidance_scale,
unconditioning,
img_noise_0,
img_noise_1,
text_embeddings_0,
text_embeddings_1,
lora_0,
lora_1,
alpha_list[-1],
False,
fix_lora
)
last_image = self.latent2image(latents)
last_image = Image.fromarray(last_image)
if save_intermediates:
last_image.save(
f"{self.output_path}/{num_frames - 1:02d}.png")
for i in progress.tqdm(range(1, num_frames - 1), desc=desc):
alpha = alpha_list[i]
if self.use_lora:
self.unet = load_lora(
self.unet, lora_0, lora_1, alpha if fix_lora is None else fix_lora)
attn_processor_dict = {}
for k in self.unet.attn_processors.keys():
if do_replace_attn(k):
if self.use_lora:
attn_processor_dict[k] = LoadProcessor(
self.unet.attn_processors[k], k, self.img0_dict, self.img1_dict, alpha, attn_beta, lamd)
else:
attn_processor_dict[k] = LoadProcessor(
original_processor, k, self.img0_dict, self.img1_dict, alpha, attn_beta, lamd)
else:
attn_processor_dict[k] = self.unet.attn_processors[k]
self.unet.set_attn_processor(attn_processor_dict)
latents = self.cal_latent(
num_inference_steps,
guidance_scale,
unconditioning,
img_noise_0,
img_noise_1,
text_embeddings_0,
text_embeddings_1,
lora_0,
lora_1,
alpha_list[i],
False,
fix_lora
)
image = self.latent2image(latents)
image = Image.fromarray(image)
if save_intermediates:
image.save(f"{self.output_path}/{i:02d}.png")
images.append(image)
images = [first_image] + images + [last_image]
else:
for k, alpha in enumerate(alpha_list):
latents = self.cal_latent(
num_inference_steps,
guidance_scale,
unconditioning,
img_noise_0,
img_noise_1,
text_embeddings_0,
text_embeddings_1,
lora_0,
lora_1,
alpha_list[k],
self.use_lora,
fix_lora
)
image = self.latent2image(latents)
image = Image.fromarray(image)
if save_intermediates:
image.save(f"{self.output_path}/{k:02d}.png")
images.append(image)
return images
with torch.no_grad():
if self.use_reschedule:
alpha_scheduler = AlphaScheduler()
alpha_list = list(torch.linspace(0, 1, num_frames))
images_pt = morph(alpha_list, progress, "Sampling...")
images_pt = [transforms.ToTensor()(img).unsqueeze(0)
for img in images_pt]
alpha_scheduler.from_imgs(images_pt)
alpha_list = alpha_scheduler.get_list()
print(alpha_list)
images = morph(alpha_list, progress, "Reschedule..."
)
else:
alpha_list = list(torch.linspace(0, 1, num_frames))
print(alpha_list)
images = morph(alpha_list, progress, "Sampling...")
return images
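# ---------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original DiffMorpher file): shows
# how the pipeline above might be driven from the command line. The model ID,
# default paths, and argument names below are illustrative assumptions.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    parser = ArgumentParser(
        description="Morph between two images with DiffMorpherPipeline (sketch).")
    parser.add_argument("--model_path", type=str,
                        default="runwayml/stable-diffusion-v1-5")  # assumed base model
    parser.add_argument("--image_path_0", type=str, required=True)
    parser.add_argument("--image_path_1", type=str, required=True)
    parser.add_argument("--prompt_0", type=str, default="")
    parser.add_argument("--prompt_1", type=str, default="")
    parser.add_argument("--output_path", type=str, default="./results")
    parser.add_argument("--num_frames", type=int, default=16)
    args = parser.parse_args()

    os.makedirs(args.output_path, exist_ok=True)
    pipeline = DiffMorpherPipeline.from_pretrained(
        args.model_path, torch_dtype=torch.float32)
    pipeline.to("cuda" if torch.cuda.is_available() else "cpu")
    frames = pipeline(
        img_path_0=args.image_path_0,
        img_path_1=args.image_path_1,
        prompt_0=args.prompt_0,
        prompt_1=args.prompt_1,
        num_frames=args.num_frames,
        output_path=args.output_path,
        save_intermediates=True,
    )
    # Save the morph sequence as an animated GIF alongside the per-frame PNGs.
    frames[0].save(os.path.join(args.output_path, "morph.gif"),
                   save_all=True, append_images=frames[1:], duration=100, loop=0)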