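"""Temporal ("video") extensions of the SGM autoencoder decoder.

Each block here pairs a spatial module from the base decoder (ResnetBlock,
AttnBlock, Conv2d) with an additional temporal module and blends the two
outputs with a fixed or learned mixing factor ``alpha``. Inputs are flattened
video batches of shape (b * t, c, h, w).
"""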
from typing import Callable, Iterable, Union
import torch
from einops import rearrange, repeat
from sgm.modules.diffusionmodules.model import (
XFORMERS_IS_AVAILABLE,
AttnBlock,
Decoder,
MemoryEfficientAttnBlock,
ResnetBlock,
)
from sgm.modules.diffusionmodules.openaimodel import ResBlock, timestep_embedding
from sgm.modules.video_attention import VideoTransformerBlock
from sgm.util import partialclass
class VideoResBlock(ResnetBlock):
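    """Spatial ResnetBlock followed by a temporal (3D) ResBlock.

    The outputs are merged as ``alpha * temporal + (1 - alpha) * spatial``,
    with ``alpha`` either a fixed buffer or a learned, sigmoid-squashed
    parameter, depending on ``merge_strategy``.
    """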
def __init__(
self,
out_channels,
*args,
dropout=0.0,
video_kernel_size=3,
alpha=0.0,
merge_strategy="learned",
**kwargs,
):
        super().__init__(*args, out_channels=out_channels, dropout=dropout, **kwargs)
if video_kernel_size is None:
video_kernel_size = [3, 1, 1]
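        # Temporal residual block operating on (b, c, t, h, w); skip_t_emb=True
        # disables its timestep-embedding pathway (emb_channels=0).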
self.time_stack = ResBlock(
channels=out_channels,
emb_channels=0,
dropout=dropout,
dims=3,
use_scale_shift_norm=False,
use_conv=False,
up=False,
down=False,
kernel_size=video_kernel_size,
use_checkpoint=False,
skip_t_emb=True,
)
self.merge_strategy = merge_strategy
if self.merge_strategy == "fixed":
self.register_buffer("mix_factor", torch.Tensor([alpha]))
elif self.merge_strategy == "learned":
self.register_parameter(
"mix_factor", torch.nn.Parameter(torch.Tensor([alpha]))
)
else:
raise ValueError(f"unknown merge strategy {self.merge_strategy}")
def get_alpha(self, bs):
if self.merge_strategy == "fixed":
return self.mix_factor
elif self.merge_strategy == "learned":
return torch.sigmoid(self.mix_factor)
else:
raise NotImplementedError()
def forward(self, x, temb, skip_video=False, timesteps=None):
        if timesteps is None:
            # self.timesteps is not set in __init__; it must be assigned on the
            # module externally before forward is called without an explicit value.
            timesteps = self.timesteps
b, c, h, w = x.shape
x = super().forward(x, temb)
if not skip_video:
x_mix = rearrange(x, "(b t) c h w -> b c t h w", t=timesteps)
x = rearrange(x, "(b t) c h w -> b c t h w", t=timesteps)
x = self.time_stack(x, temb)
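            # Blend: alpha -> 1 favours the temporal path, alpha -> 0 keeps the
            # purely spatial output.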
alpha = self.get_alpha(bs=b // timesteps)
x = alpha * x + (1.0 - alpha) * x_mix
x = rearrange(x, "b c t h w -> (b t) c h w")
return x
class AE3DConv(torch.nn.Conv2d):
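    """Conv2d followed by a Conv3d that mixes information across frames.

    The spatial convolution runs on the flattened (b * t, c, h, w) batch; the
    temporal convolution runs on the (b, c, t, h, w) video with "same" padding
    derived from ``video_kernel_size``.
    """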
def __init__(self, in_channels, out_channels, video_kernel_size=3, *args, **kwargs):
super().__init__(in_channels, out_channels, *args, **kwargs)
if isinstance(video_kernel_size, Iterable):
padding = [int(k // 2) for k in video_kernel_size]
else:
padding = int(video_kernel_size // 2)
self.time_mix_conv = torch.nn.Conv3d(
in_channels=out_channels,
out_channels=out_channels,
kernel_size=video_kernel_size,
padding=padding,
)
def forward(self, input, timesteps, skip_video=False):
x = super().forward(input)
if skip_video:
return x
x = rearrange(x, "(b t) c h w -> b c t h w", t=timesteps)
x = self.time_mix_conv(x)
return rearrange(x, "b c t h w -> (b t) c h w")
class VideoBlock(AttnBlock):
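    """Spatial AttnBlock extended with a temporal transformer block.

    Frame positions are encoded via sinusoidal ``timestep_embedding`` plus a
    small MLP, and the spatial and temporal branches are blended with the
    fixed or learned ``alpha``.
    """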
def __init__(
self, in_channels: int, alpha: float = 0, merge_strategy: str = "learned"
):
super().__init__(in_channels)
# no context, single headed, as in base class
self.time_mix_block = VideoTransformerBlock(
dim=in_channels,
n_heads=1,
d_head=in_channels,
checkpoint=False,
ff_in=True,
attn_mode="softmax",
)
time_embed_dim = self.in_channels * 4
self.video_time_embed = torch.nn.Sequential(
torch.nn.Linear(self.in_channels, time_embed_dim),
torch.nn.SiLU(),
torch.nn.Linear(time_embed_dim, self.in_channels),
)
self.merge_strategy = merge_strategy
if self.merge_strategy == "fixed":
self.register_buffer("mix_factor", torch.Tensor([alpha]))
elif self.merge_strategy == "learned":
self.register_parameter(
"mix_factor", torch.nn.Parameter(torch.Tensor([alpha]))
)
else:
raise ValueError(f"unknown merge strategy {self.merge_strategy}")
def forward(self, x, timesteps, skip_video=False):
if skip_video:
return super().forward(x)
x_in = x
x = self.attention(x)
h, w = x.shape[2:]
x = rearrange(x, "b c h w -> b (h w) c")
x_mix = x
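        # Per-frame positional embedding: frame indices 0..t-1, tiled over the
        # batch, run through a sinusoidal embedding and a two-layer MLP.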
num_frames = torch.arange(timesteps, device=x.device)
num_frames = repeat(num_frames, "t -> b t", b=x.shape[0] // timesteps)
num_frames = rearrange(num_frames, "b t -> (b t)")
t_emb = timestep_embedding(num_frames, self.in_channels, repeat_only=False)
emb = self.video_time_embed(t_emb) # b, n_channels
emb = emb[:, None, :]
x_mix = x_mix + emb
alpha = self.get_alpha()
x_mix = self.time_mix_block(x_mix, timesteps=timesteps)
x = alpha * x + (1.0 - alpha) * x_mix # alpha merge
x = rearrange(x, "b (h w) c -> b c h w", h=h, w=w)
x = self.proj_out(x)
return x_in + x
    def get_alpha(self):
if self.merge_strategy == "fixed":
return self.mix_factor
elif self.merge_strategy == "learned":
return torch.sigmoid(self.mix_factor)
else:
raise NotImplementedError(f"unknown merge strategy {self.merge_strategy}")
class MemoryEfficientVideoBlock(MemoryEfficientAttnBlock):
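    """VideoBlock variant built on the memory-efficient (xformers) attention
    backend, for both the spatial base class and the temporal block."""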
def __init__(
self, in_channels: int, alpha: float = 0, merge_strategy: str = "learned"
):
super().__init__(in_channels)
# no context, single headed, as in base class
self.time_mix_block = VideoTransformerBlock(
dim=in_channels,
n_heads=1,
d_head=in_channels,
checkpoint=False,
ff_in=True,
attn_mode="softmax-xformers",
)
time_embed_dim = self.in_channels * 4
self.video_time_embed = torch.nn.Sequential(
torch.nn.Linear(self.in_channels, time_embed_dim),
torch.nn.SiLU(),
torch.nn.Linear(time_embed_dim, self.in_channels),
)
self.merge_strategy = merge_strategy
if self.merge_strategy == "fixed":
self.register_buffer("mix_factor", torch.Tensor([alpha]))
elif self.merge_strategy == "learned":
self.register_parameter(
"mix_factor", torch.nn.Parameter(torch.Tensor([alpha]))
)
else:
raise ValueError(f"unknown merge strategy {self.merge_strategy}")
def forward(self, x, timesteps, skip_time_block=False):
if skip_time_block:
return super().forward(x)
x_in = x
x = self.attention(x)
h, w = x.shape[2:]
x = rearrange(x, "b c h w -> b (h w) c")
x_mix = x
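        # Per-frame positional embedding, as in VideoBlock.forward above.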
num_frames = torch.arange(timesteps, device=x.device)
num_frames = repeat(num_frames, "t -> b t", b=x.shape[0] // timesteps)
num_frames = rearrange(num_frames, "b t -> (b t)")
t_emb = timestep_embedding(num_frames, self.in_channels, repeat_only=False)
emb = self.video_time_embed(t_emb) # b, n_channels
emb = emb[:, None, :]
x_mix = x_mix + emb
alpha = self.get_alpha()
x_mix = self.time_mix_block(x_mix, timesteps=timesteps)
x = alpha * x + (1.0 - alpha) * x_mix # alpha merge
x = rearrange(x, "b (h w) c -> b c h w", h=h, w=w)
x = self.proj_out(x)
return x_in + x
    def get_alpha(self):
if self.merge_strategy == "fixed":
return self.mix_factor
elif self.merge_strategy == "learned":
return torch.sigmoid(self.mix_factor)
else:
raise NotImplementedError(f"unknown merge strategy {self.merge_strategy}")
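# Factory that selects the spatio-temporal attention block for the requested
# backend ("vanilla" or "vanilla-xformers") and pre-binds its merge settings.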
def make_time_attn(
in_channels,
attn_type="vanilla",
attn_kwargs=None,
alpha: float = 0,
merge_strategy: str = "learned",
):
assert attn_type in [
"vanilla",
"vanilla-xformers",
], f"attn_type {attn_type} not supported for spatio-temporal attention"
print(
f"making spatial and temporal attention of type '{attn_type}' with {in_channels} in_channels"
)
if not XFORMERS_IS_AVAILABLE and attn_type == "vanilla-xformers":
print(
f"Attention mode '{attn_type}' is not available. Falling back to vanilla attention. "
f"This is not a problem in Pytorch >= 2.0. FYI, you are running with PyTorch version {torch.__version__}"
)
attn_type = "vanilla"
if attn_type == "vanilla":
assert attn_kwargs is None
return partialclass(
VideoBlock, in_channels, alpha=alpha, merge_strategy=merge_strategy
)
elif attn_type == "vanilla-xformers":
print(f"building MemoryEfficientAttnBlock with {in_channels} in_channels...")
return partialclass(
MemoryEfficientVideoBlock,
in_channels,
alpha=alpha,
merge_strategy=merge_strategy,
)
    else:
        raise NotImplementedError(f"unknown attention type {attn_type}")
class Conv2DWrapper(torch.nn.Conv2d):
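    """Plain Conv2d that ignores extra keyword arguments (e.g. ``timesteps``)
    so it can stand in where a time-aware convolution is expected."""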
def forward(self, input: torch.Tensor, **kwargs) -> torch.Tensor:
return super().forward(input)
class VideoDecoder(Decoder):
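    """Decoder whose convolutions, resnet blocks and/or attention blocks are
    swapped for their time-aware variants, selected via ``time_mode``."""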
available_time_modes = ["all", "conv-only", "attn-only"]
def __init__(
self,
*args,
video_kernel_size: Union[int, list] = 3,
alpha: float = 0.0,
merge_strategy: str = "learned",
time_mode: str = "conv-only",
**kwargs,
):
self.video_kernel_size = video_kernel_size
self.alpha = alpha
self.merge_strategy = merge_strategy
self.time_mode = time_mode
assert (
self.time_mode in self.available_time_modes
), f"time_mode parameter has to be in {self.available_time_modes}"
super().__init__(*args, **kwargs)
def get_last_layer(self, skip_time_mix=False, **kwargs):
if self.time_mode == "attn-only":
raise NotImplementedError("TODO")
else:
return (
self.conv_out.time_mix_conv.weight
if not skip_time_mix
else self.conv_out.weight
)
    def _make_attn(self) -> Callable:
        # NOTE: "only-last-conv" is checked here and in _make_resblock but is
        # not listed in available_time_modes, so it cannot pass the __init__
        # assertion.
        if self.time_mode not in ["conv-only", "only-last-conv"]:
return partialclass(
make_time_attn,
alpha=self.alpha,
merge_strategy=self.merge_strategy,
)
else:
return super()._make_attn()
def _make_conv(self) -> Callable:
if self.time_mode != "attn-only":
return partialclass(AE3DConv, video_kernel_size=self.video_kernel_size)
else:
return Conv2DWrapper
def _make_resblock(self) -> Callable:
if self.time_mode not in ["attn-only", "only-last-conv"]:
return partialclass(
VideoResBlock,
video_kernel_size=self.video_kernel_size,
alpha=self.alpha,
merge_strategy=self.merge_strategy,
)
else:
return super()._make_resblock()
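# Example usage (a sketch, not from this file: the constructor keywords follow
# the base sgm Decoder signature and the shapes are illustrative):
#
#   decoder = VideoDecoder(
#       ch=128, out_ch=3, ch_mult=(1, 2, 4, 4), num_res_blocks=2,
#       attn_resolutions=(), in_channels=3, resolution=256, z_channels=4,
#       video_kernel_size=3, time_mode="conv-only",
#   )
#   # z holds t frames per sample, flattened into the batch dimension:
#   # z.shape == (b * t, z_channels, h, w)
#   frames = decoder(z, timesteps=t)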