multimodalart's picture
Upload folder using huggingface_hub
5a510e7 verified
raw
history blame
40.1 kB
# pylint: disable=R0801
# pylint: disable=C0303
"""
This module contains various transformer blocks for different applications, such as BasicTransformerBlock,
TemporalBasicTransformerBlock, and AudioTemporalBasicTransformerBlock. These blocks are used in various models,
such as GLIGEN, UNet, and others. The transformer blocks implement self-attention, cross-attention, feed-forward
networks, and other related functions.
Functions and classes included in this module are:
- BasicTransformerBlock: A basic transformer block with self-attention, cross-attention, and feed-forward layers.
- TemporalBasicTransformerBlock: A transformer block with additional temporal attention mechanisms for video data.
- AudioTemporalBasicTransformerBlock: A transformer block with additional audio-specific mechanisms for audio data.
- zero_module: A function to zero out the parameters of a given module.
For more information on each specific class and function, please refer to the respective docstrings.
"""
from typing import Any, Dict, List, Optional
import torch
from diffusers.models.attention import (AdaLayerNorm, AdaLayerNormZero,
Attention, FeedForward)
from diffusers.models.embeddings import SinusoidalPositionalEmbedding
from einops import rearrange
from torch import nn
class GatedSelfAttentionDense(nn.Module):
"""
A gated self-attention dense layer that combines visual features and object features.
Parameters:
query_dim (`int`): The number of channels in the query.
context_dim (`int`): The number of channels in the context.
n_heads (`int`): The number of heads to use for attention.
d_head (`int`): The number of channels in each head.
"""
def __init__(self, query_dim: int, context_dim: int, n_heads: int, d_head: int):
super().__init__()
# we need a linear projection since we need cat visual feature and obj feature
self.linear = nn.Linear(context_dim, query_dim)
self.attn = Attention(query_dim=query_dim, heads=n_heads, dim_head=d_head)
self.ff = FeedForward(query_dim, activation_fn="geglu")
self.norm1 = nn.LayerNorm(query_dim)
self.norm2 = nn.LayerNorm(query_dim)
self.register_parameter("alpha_attn", nn.Parameter(torch.tensor(0.0)))
self.register_parameter("alpha_dense", nn.Parameter(torch.tensor(0.0)))
self.enabled = True
def forward(self, x: torch.Tensor, objs: torch.Tensor) -> torch.Tensor:
"""
Apply the Gated Self-Attention mechanism to the input tensor `x` and object tensor `objs`.
Args:
x (torch.Tensor): The input tensor.
objs (torch.Tensor): The object tensor.
Returns:
torch.Tensor: The output tensor after applying Gated Self-Attention.
"""
if not self.enabled:
return x
n_visual = x.shape[1]
objs = self.linear(objs)
x = x + self.alpha_attn.tanh() * self.attn(self.norm1(torch.cat([x, objs], dim=1)))[:, :n_visual, :]
x = x + self.alpha_dense.tanh() * self.ff(self.norm2(x))
return x
class BasicTransformerBlock(nn.Module):
r"""
A basic Transformer block.
Parameters:
dim (`int`): The number of channels in the input and output.
num_attention_heads (`int`): The number of heads to use for multi-head attention.
attention_head_dim (`int`): The number of channels in each head.
dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
num_embeds_ada_norm (:
obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`.
attention_bias (:
obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
only_cross_attention (`bool`, *optional*):
Whether to use only cross-attention layers. In this case two cross attention layers are used.
double_self_attention (`bool`, *optional*):
Whether to use two self-attention layers. In this case no cross attention layers are used.
upcast_attention (`bool`, *optional*):
Whether to upcast the attention computation to float32. This is useful for mixed precision training.
norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
Whether to use learnable elementwise affine parameters for normalization.
norm_type (`str`, *optional*, defaults to `"layer_norm"`):
The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`.
final_dropout (`bool` *optional*, defaults to False):
Whether to apply a final dropout after the last feed-forward layer.
attention_type (`str`, *optional*, defaults to `"default"`):
The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`.
positional_embeddings (`str`, *optional*, defaults to `None`):
The type of positional embeddings to apply to.
num_positional_embeddings (`int`, *optional*, defaults to `None`):
The maximum number of positional embeddings to apply.
"""
def __init__(
self,
dim: int,
num_attention_heads: int,
attention_head_dim: int,
dropout=0.0,
cross_attention_dim: Optional[int] = None,
activation_fn: str = "geglu",
num_embeds_ada_norm: Optional[int] = None,
attention_bias: bool = False,
only_cross_attention: bool = False,
double_self_attention: bool = False,
upcast_attention: bool = False,
norm_elementwise_affine: bool = True,
# 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single'
norm_type: str = "layer_norm",
norm_eps: float = 1e-5,
final_dropout: bool = False,
attention_type: str = "default",
positional_embeddings: Optional[str] = None,
num_positional_embeddings: Optional[int] = None,
):
super().__init__()
self.only_cross_attention = only_cross_attention
self.use_ada_layer_norm_zero = (
num_embeds_ada_norm is not None
) and norm_type == "ada_norm_zero"
self.use_ada_layer_norm = (
num_embeds_ada_norm is not None
) and norm_type == "ada_norm"
self.use_ada_layer_norm_single = norm_type == "ada_norm_single"
self.use_layer_norm = norm_type == "layer_norm"
if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
raise ValueError(
f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
)
if positional_embeddings and (num_positional_embeddings is None):
raise ValueError(
"If `positional_embedding` type is defined, `num_positition_embeddings` must also be defined."
)
if positional_embeddings == "sinusoidal":
self.pos_embed = SinusoidalPositionalEmbedding(
dim, max_seq_length=num_positional_embeddings
)
else:
self.pos_embed = None
# Define 3 blocks. Each block has its own normalization layer.
# 1. Self-Attn
if self.use_ada_layer_norm:
self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
elif self.use_ada_layer_norm_zero:
self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
else:
self.norm1 = nn.LayerNorm(
dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps
)
self.attn1 = Attention(
query_dim=dim,
heads=num_attention_heads,
dim_head=attention_head_dim,
dropout=dropout,
bias=attention_bias,
cross_attention_dim=cross_attention_dim if only_cross_attention else None,
upcast_attention=upcast_attention,
)
# 2. Cross-Attn
if cross_attention_dim is not None or double_self_attention:
# We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
# I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
# the second cross attention block.
self.norm2 = (
AdaLayerNorm(dim, num_embeds_ada_norm)
if self.use_ada_layer_norm
else nn.LayerNorm(
dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps
)
)
self.attn2 = Attention(
query_dim=dim,
cross_attention_dim=(
cross_attention_dim if not double_self_attention else None
),
heads=num_attention_heads,
dim_head=attention_head_dim,
dropout=dropout,
bias=attention_bias,
upcast_attention=upcast_attention,
) # is self-attn if encoder_hidden_states is none
else:
self.norm2 = None
self.attn2 = None
# 3. Feed-forward
if not self.use_ada_layer_norm_single:
self.norm3 = nn.LayerNorm(
dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps
)
self.ff = FeedForward(
dim,
dropout=dropout,
activation_fn=activation_fn,
final_dropout=final_dropout,
)
# 4. Fuser
if attention_type in {"gated", "gated-text-image"}: # Updated line
self.fuser = GatedSelfAttentionDense(
dim, cross_attention_dim, num_attention_heads, attention_head_dim
)
# 5. Scale-shift for PixArt-Alpha.
if self.use_ada_layer_norm_single:
self.scale_shift_table = nn.Parameter(
torch.randn(6, dim) / dim**0.5)
# let chunk size default to None
self._chunk_size = None
self._chunk_dim = 0
def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0):
"""
Sets the chunk size for feed-forward processing in the transformer block.
Args:
chunk_size (Optional[int]): The size of the chunks to process in feed-forward layers.
If None, the chunk size is set to the maximum possible value.
dim (int, optional): The dimension along which to split the input tensor into chunks. Defaults to 0.
Returns:
None.
"""
self._chunk_size = chunk_size
self._chunk_dim = dim
def forward(
self,
hidden_states: torch.FloatTensor,
attention_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
timestep: Optional[torch.LongTensor] = None,
cross_attention_kwargs: Dict[str, Any] = None,
class_labels: Optional[torch.LongTensor] = None,
) -> torch.FloatTensor:
"""
This function defines the forward pass of the BasicTransformerBlock.
Args:
self (BasicTransformerBlock):
An instance of the BasicTransformerBlock class.
hidden_states (torch.FloatTensor):
A tensor containing the hidden states.
attention_mask (Optional[torch.FloatTensor], optional):
A tensor containing the attention mask. Defaults to None.
encoder_hidden_states (Optional[torch.FloatTensor], optional):
A tensor containing the encoder hidden states. Defaults to None.
encoder_attention_mask (Optional[torch.FloatTensor], optional):
A tensor containing the encoder attention mask. Defaults to None.
timestep (Optional[torch.LongTensor], optional):
A tensor containing the timesteps. Defaults to None.
cross_attention_kwargs (Dict[str, Any], optional):
Additional cross-attention arguments. Defaults to None.
class_labels (Optional[torch.LongTensor], optional):
A tensor containing the class labels. Defaults to None.
Returns:
torch.FloatTensor:
A tensor containing the transformed hidden states.
"""
# Notice that normalization is always applied before the real computation in the following blocks.
# 0. Self-Attention
batch_size = hidden_states.shape[0]
gate_msa = None
scale_mlp = None
shift_mlp = None
gate_mlp = None
if self.use_ada_layer_norm:
norm_hidden_states = self.norm1(hidden_states, timestep)
elif self.use_ada_layer_norm_zero:
norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
)
elif self.use_layer_norm:
norm_hidden_states = self.norm1(hidden_states)
elif self.use_ada_layer_norm_single:
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
self.scale_shift_table[None] +
timestep.reshape(batch_size, 6, -1)
).chunk(6, dim=1)
norm_hidden_states = self.norm1(hidden_states)
norm_hidden_states = norm_hidden_states * \
(1 + scale_msa) + shift_msa
norm_hidden_states = norm_hidden_states.squeeze(1)
else:
raise ValueError("Incorrect norm used")
if self.pos_embed is not None:
norm_hidden_states = self.pos_embed(norm_hidden_states)
# 1. Retrieve lora scale.
lora_scale = (
cross_attention_kwargs.get("scale", 1.0)
if cross_attention_kwargs is not None
else 1.0
)
# 2. Prepare GLIGEN inputs
cross_attention_kwargs = (
cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
)
gligen_kwargs = cross_attention_kwargs.pop("gligen", None)
attn_output = self.attn1(
norm_hidden_states,
encoder_hidden_states=(
encoder_hidden_states if self.only_cross_attention else None
),
attention_mask=attention_mask,
**cross_attention_kwargs,
)
if self.use_ada_layer_norm_zero:
attn_output = gate_msa.unsqueeze(1) * attn_output
elif self.use_ada_layer_norm_single:
attn_output = gate_msa * attn_output
hidden_states = attn_output + hidden_states
if hidden_states.ndim == 4:
hidden_states = hidden_states.squeeze(1)
# 2.5 GLIGEN Control
if gligen_kwargs is not None:
hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"])
# 3. Cross-Attention
if self.attn2 is not None:
if self.use_ada_layer_norm:
norm_hidden_states = self.norm2(hidden_states, timestep)
elif self.use_ada_layer_norm_zero or self.use_layer_norm:
norm_hidden_states = self.norm2(hidden_states)
elif self.use_ada_layer_norm_single:
# For PixArt norm2 isn't applied here:
# https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
norm_hidden_states = hidden_states
else:
raise ValueError("Incorrect norm")
if self.pos_embed is not None and self.use_ada_layer_norm_single is False:
norm_hidden_states = self.pos_embed(norm_hidden_states)
attn_output = self.attn2(
norm_hidden_states,
encoder_hidden_states=encoder_hidden_states,
attention_mask=encoder_attention_mask,
**cross_attention_kwargs,
)
hidden_states = attn_output + hidden_states
# 4. Feed-forward
if not self.use_ada_layer_norm_single:
norm_hidden_states = self.norm3(hidden_states)
if self.use_ada_layer_norm_zero:
norm_hidden_states = (
norm_hidden_states *
(1 + scale_mlp[:, None]) + shift_mlp[:, None]
)
if self.use_ada_layer_norm_single:
norm_hidden_states = self.norm2(hidden_states)
norm_hidden_states = norm_hidden_states * \
(1 + scale_mlp) + shift_mlp
ff_output = self.ff(norm_hidden_states, scale=lora_scale)
if self.use_ada_layer_norm_zero:
ff_output = gate_mlp.unsqueeze(1) * ff_output
elif self.use_ada_layer_norm_single:
ff_output = gate_mlp * ff_output
hidden_states = ff_output + hidden_states
if hidden_states.ndim == 4:
hidden_states = hidden_states.squeeze(1)
return hidden_states
class TemporalBasicTransformerBlock(nn.Module):
"""
A PyTorch module that extends the BasicTransformerBlock to include temporal attention mechanisms.
This class is particularly useful for video-related tasks where capturing temporal information within the sequence of frames is necessary.
Attributes:
dim (int): The dimension of the input and output embeddings.
num_attention_heads (int): The number of attention heads in the multi-head self-attention mechanism.
attention_head_dim (int): The dimension of each attention head.
dropout (float): The dropout probability for the attention scores.
cross_attention_dim (Optional[int]): The dimension of the cross-attention mechanism.
activation_fn (str): The activation function used in the feed-forward layer.
num_embeds_ada_norm (Optional[int]): The number of embeddings for adaptive normalization.
attention_bias (bool): If True, uses bias in the attention mechanism.
only_cross_attention (bool): If True, only uses cross-attention.
upcast_attention (bool): If True, upcasts the attention mechanism for better performance.
unet_use_cross_frame_attention (Optional[bool]): If True, uses cross-frame attention in the UNet model.
unet_use_temporal_attention (Optional[bool]): If True, uses temporal attention in the UNet model.
"""
def __init__(
self,
dim: int,
num_attention_heads: int,
attention_head_dim: int,
dropout=0.0,
cross_attention_dim: Optional[int] = None,
activation_fn: str = "geglu",
num_embeds_ada_norm: Optional[int] = None,
attention_bias: bool = False,
only_cross_attention: bool = False,
upcast_attention: bool = False,
unet_use_cross_frame_attention=None,
unet_use_temporal_attention=None,
):
"""
The TemporalBasicTransformerBlock class is a PyTorch module that extends the BasicTransformerBlock to include temporal attention mechanisms.
This is particularly useful for video-related tasks, where the model needs to capture the temporal information within the sequence of frames.
The block consists of self-attention, cross-attention, feed-forward, and temporal attention mechanisms.
dim (int): The dimension of the input and output embeddings.
num_attention_heads (int): The number of attention heads in the multi-head self-attention mechanism.
attention_head_dim (int): The dimension of each attention head.
dropout (float, optional): The dropout probability for the attention scores. Defaults to 0.0.
cross_attention_dim (int, optional): The dimension of the cross-attention mechanism. Defaults to None.
activation_fn (str, optional): The activation function used in the feed-forward layer. Defaults to "geglu".
num_embeds_ada_norm (int, optional): The number of embeddings for adaptive normalization. Defaults to None.
attention_bias (bool, optional): If True, uses bias in the attention mechanism. Defaults to False.
only_cross_attention (bool, optional): If True, only uses cross-attention. Defaults to False.
upcast_attention (bool, optional): If True, upcasts the attention mechanism for better performance. Defaults to False.
unet_use_cross_frame_attention (bool, optional): If True, uses cross-frame attention in the UNet model. Defaults to None.
unet_use_temporal_attention (bool, optional): If True, uses temporal attention in the UNet model. Defaults to None.
Forward method:
hidden_states (torch.FloatTensor): The input hidden states.
encoder_hidden_states (torch.FloatTensor, optional): The encoder hidden states. Defaults to None.
timestep (torch.LongTensor, optional): The current timestep for the transformer model. Defaults to None.
attention_mask (torch.FloatTensor, optional): The attention mask for the self-attention mechanism. Defaults to None.
video_length (int, optional): The length of the video sequence. Defaults to None.
Returns:
torch.FloatTensor: The output hidden states after passing through the TemporalBasicTransformerBlock.
"""
super().__init__()
self.only_cross_attention = only_cross_attention
self.use_ada_layer_norm = num_embeds_ada_norm is not None
self.unet_use_cross_frame_attention = unet_use_cross_frame_attention
self.unet_use_temporal_attention = unet_use_temporal_attention
# SC-Attn
self.attn1 = Attention(
query_dim=dim,
heads=num_attention_heads,
dim_head=attention_head_dim,
dropout=dropout,
bias=attention_bias,
upcast_attention=upcast_attention,
)
self.norm1 = (
AdaLayerNorm(dim, num_embeds_ada_norm)
if self.use_ada_layer_norm
else nn.LayerNorm(dim)
)
# Cross-Attn
if cross_attention_dim is not None:
self.attn2 = Attention(
query_dim=dim,
cross_attention_dim=cross_attention_dim,
heads=num_attention_heads,
dim_head=attention_head_dim,
dropout=dropout,
bias=attention_bias,
upcast_attention=upcast_attention,
)
else:
self.attn2 = None
if cross_attention_dim is not None:
self.norm2 = (
AdaLayerNorm(dim, num_embeds_ada_norm)
if self.use_ada_layer_norm
else nn.LayerNorm(dim)
)
else:
self.norm2 = None
# Feed-forward
self.ff = FeedForward(dim, dropout=dropout,
activation_fn=activation_fn)
self.norm3 = nn.LayerNorm(dim)
self.use_ada_layer_norm_zero = False
# Temp-Attn
# assert unet_use_temporal_attention is not None
if unet_use_temporal_attention is None:
unet_use_temporal_attention = False
if unet_use_temporal_attention:
self.attn_temp = Attention(
query_dim=dim,
heads=num_attention_heads,
dim_head=attention_head_dim,
dropout=dropout,
bias=attention_bias,
upcast_attention=upcast_attention,
)
nn.init.zeros_(self.attn_temp.to_out[0].weight.data)
self.norm_temp = (
AdaLayerNorm(dim, num_embeds_ada_norm)
if self.use_ada_layer_norm
else nn.LayerNorm(dim)
)
def forward(
self,
hidden_states,
encoder_hidden_states=None,
timestep=None,
attention_mask=None,
video_length=None,
):
"""
Forward pass for the TemporalBasicTransformerBlock.
Args:
hidden_states (torch.FloatTensor): The input hidden states with shape (batch_size, seq_len, dim).
encoder_hidden_states (torch.FloatTensor, optional): The encoder hidden states with shape (batch_size, src_seq_len, dim).
timestep (torch.LongTensor, optional): The timestep for the transformer block.
attention_mask (torch.FloatTensor, optional): The attention mask with shape (batch_size, seq_len, seq_len).
video_length (int, optional): The length of the video sequence.
Returns:
torch.FloatTensor: The output tensor after passing through the transformer block with shape (batch_size, seq_len, dim).
"""
norm_hidden_states = (
self.norm1(hidden_states, timestep)
if self.use_ada_layer_norm
else self.norm1(hidden_states)
)
if self.unet_use_cross_frame_attention:
hidden_states = (
self.attn1(
norm_hidden_states,
attention_mask=attention_mask,
video_length=video_length,
)
+ hidden_states
)
else:
hidden_states = (
self.attn1(norm_hidden_states, attention_mask=attention_mask)
+ hidden_states
)
if self.attn2 is not None:
# Cross-Attention
norm_hidden_states = (
self.norm2(hidden_states, timestep)
if self.use_ada_layer_norm
else self.norm2(hidden_states)
)
hidden_states = (
self.attn2(
norm_hidden_states,
encoder_hidden_states=encoder_hidden_states,
attention_mask=attention_mask,
)
+ hidden_states
)
# Feed-forward
hidden_states = self.ff(self.norm3(hidden_states)) + hidden_states
# Temporal-Attention
if self.unet_use_temporal_attention:
d = hidden_states.shape[1]
hidden_states = rearrange(
hidden_states, "(b f) d c -> (b d) f c", f=video_length
)
norm_hidden_states = (
self.norm_temp(hidden_states, timestep)
if self.use_ada_layer_norm
else self.norm_temp(hidden_states)
)
hidden_states = self.attn_temp(norm_hidden_states) + hidden_states
hidden_states = rearrange(
hidden_states, "(b d) f c -> (b f) d c", d=d)
return hidden_states
class AudioTemporalBasicTransformerBlock(nn.Module):
"""
A PyTorch module designed to handle audio data within a transformer framework, including temporal attention mechanisms.
Attributes:
dim (int): The dimension of the input and output embeddings.
num_attention_heads (int): The number of attention heads.
attention_head_dim (int): The dimension of each attention head.
dropout (float): The dropout probability.
cross_attention_dim (Optional[int]): The dimension of the cross-attention mechanism.
activation_fn (str): The activation function for the feed-forward network.
num_embeds_ada_norm (Optional[int]): The number of embeddings for adaptive normalization.
attention_bias (bool): If True, uses bias in the attention mechanism.
only_cross_attention (bool): If True, only uses cross-attention.
upcast_attention (bool): If True, upcasts the attention mechanism to float32.
unet_use_cross_frame_attention (Optional[bool]): If True, uses cross-frame attention in UNet.
unet_use_temporal_attention (Optional[bool]): If True, uses temporal attention in UNet.
depth (int): The depth of the transformer block.
unet_block_name (Optional[str]): The name of the UNet block.
stack_enable_blocks_name (Optional[List[str]]): The list of enabled blocks in the stack.
stack_enable_blocks_depth (Optional[List[int]]): The list of depths for the enabled blocks in the stack.
"""
def __init__(
self,
dim: int,
num_attention_heads: int,
attention_head_dim: int,
dropout=0.0,
cross_attention_dim: Optional[int] = None,
activation_fn: str = "geglu",
num_embeds_ada_norm: Optional[int] = None,
attention_bias: bool = False,
only_cross_attention: bool = False,
upcast_attention: bool = False,
unet_use_cross_frame_attention=None,
unet_use_temporal_attention=None,
depth=0,
unet_block_name=None,
stack_enable_blocks_name: Optional[List[str]] = None,
stack_enable_blocks_depth: Optional[List[int]] = None,
):
"""
Initializes the AudioTemporalBasicTransformerBlock module.
Args:
dim (int): The dimension of the input and output embeddings.
num_attention_heads (int): The number of attention heads in the multi-head self-attention mechanism.
attention_head_dim (int): The dimension of each attention head.
dropout (float, optional): The dropout probability for the attention mechanism. Defaults to 0.0.
cross_attention_dim (Optional[int], optional): The dimension of the cross-attention mechanism. Defaults to None.
activation_fn (str, optional): The activation function to be used in the feed-forward network. Defaults to "geglu".
num_embeds_ada_norm (Optional[int], optional): The number of embeddings for adaptive normalization. Defaults to None.
attention_bias (bool, optional): If True, uses bias in the attention mechanism. Defaults to False.
only_cross_attention (bool, optional): If True, only uses cross-attention. Defaults to False.
upcast_attention (bool, optional): If True, upcasts the attention mechanism to float32. Defaults to False.
unet_use_cross_frame_attention (Optional[bool], optional): If True, uses cross-frame attention in UNet. Defaults to None.
unet_use_temporal_attention (Optional[bool], optional): If True, uses temporal attention in UNet. Defaults to None.
depth (int, optional): The depth of the transformer block. Defaults to 0.
unet_block_name (Optional[str], optional): The name of the UNet block. Defaults to None.
stack_enable_blocks_name (Optional[List[str]], optional): The list of enabled blocks in the stack. Defaults to None.
stack_enable_blocks_depth (Optional[List[int]], optional): The list of depths for the enabled blocks in the stack. Defaults to None.
"""
super().__init__()
self.only_cross_attention = only_cross_attention
self.use_ada_layer_norm = num_embeds_ada_norm is not None
self.unet_use_cross_frame_attention = unet_use_cross_frame_attention
self.unet_use_temporal_attention = unet_use_temporal_attention
self.unet_block_name = unet_block_name
self.depth = depth
zero_conv_full = nn.Conv2d(
dim, dim, kernel_size=1)
self.zero_conv_full = zero_module(zero_conv_full)
zero_conv_face = nn.Conv2d(
dim, dim, kernel_size=1)
self.zero_conv_face = zero_module(zero_conv_face)
zero_conv_lip = nn.Conv2d(
dim, dim, kernel_size=1)
self.zero_conv_lip = zero_module(zero_conv_lip)
# SC-Attn
self.attn1 = Attention(
query_dim=dim,
heads=num_attention_heads,
dim_head=attention_head_dim,
dropout=dropout,
bias=attention_bias,
upcast_attention=upcast_attention,
)
self.norm1 = (
AdaLayerNorm(dim, num_embeds_ada_norm)
if self.use_ada_layer_norm
else nn.LayerNorm(dim)
)
# Cross-Attn
if cross_attention_dim is not None:
if (stack_enable_blocks_name is not None and
stack_enable_blocks_depth is not None and
self.unet_block_name in stack_enable_blocks_name and
self.depth in stack_enable_blocks_depth):
self.attn2_0 = Attention(
query_dim=dim,
cross_attention_dim=cross_attention_dim,
heads=num_attention_heads,
dim_head=attention_head_dim,
dropout=dropout,
bias=attention_bias,
upcast_attention=upcast_attention,
)
self.attn2_1 = Attention(
query_dim=dim,
cross_attention_dim=cross_attention_dim,
heads=num_attention_heads,
dim_head=attention_head_dim,
dropout=dropout,
bias=attention_bias,
upcast_attention=upcast_attention,
)
self.attn2_2 = Attention(
query_dim=dim,
cross_attention_dim=cross_attention_dim,
heads=num_attention_heads,
dim_head=attention_head_dim,
dropout=dropout,
bias=attention_bias,
upcast_attention=upcast_attention,
)
self.attn2 = None
else:
self.attn2 = Attention(
query_dim=dim,
cross_attention_dim=cross_attention_dim,
heads=num_attention_heads,
dim_head=attention_head_dim,
dropout=dropout,
bias=attention_bias,
upcast_attention=upcast_attention,
)
self.attn2_0=None
else:
self.attn2 = None
self.attn2_0 = None
if cross_attention_dim is not None:
self.norm2 = (
AdaLayerNorm(dim, num_embeds_ada_norm)
if self.use_ada_layer_norm
else nn.LayerNorm(dim)
)
else:
self.norm2 = None
# Feed-forward
self.ff = FeedForward(dim, dropout=dropout,
activation_fn=activation_fn)
self.norm3 = nn.LayerNorm(dim)
self.use_ada_layer_norm_zero = False
def forward(
self,
hidden_states,
encoder_hidden_states=None,
timestep=None,
attention_mask=None,
full_mask=None,
face_mask=None,
lip_mask=None,
motion_scale=None,
video_length=None,
):
"""
Forward pass for the AudioTemporalBasicTransformerBlock.
Args:
hidden_states (torch.FloatTensor): The input hidden states.
encoder_hidden_states (torch.FloatTensor, optional): The encoder hidden states. Defaults to None.
timestep (torch.LongTensor, optional): The timestep for the transformer block. Defaults to None.
attention_mask (torch.FloatTensor, optional): The attention mask. Defaults to None.
full_mask (torch.FloatTensor, optional): The full mask. Defaults to None.
face_mask (torch.FloatTensor, optional): The face mask. Defaults to None.
lip_mask (torch.FloatTensor, optional): The lip mask. Defaults to None.
video_length (int, optional): The length of the video. Defaults to None.
Returns:
torch.FloatTensor: The output tensor after passing through the AudioTemporalBasicTransformerBlock.
"""
norm_hidden_states = (
self.norm1(hidden_states, timestep)
if self.use_ada_layer_norm
else self.norm1(hidden_states)
)
if self.unet_use_cross_frame_attention:
hidden_states = (
self.attn1(
norm_hidden_states,
attention_mask=attention_mask,
video_length=video_length,
)
+ hidden_states
)
else:
hidden_states = (
self.attn1(norm_hidden_states, attention_mask=attention_mask)
+ hidden_states
)
if self.attn2 is not None:
# Cross-Attention
norm_hidden_states = (
self.norm2(hidden_states, timestep)
if self.use_ada_layer_norm
else self.norm2(hidden_states)
)
hidden_states = self.attn2(
norm_hidden_states,
encoder_hidden_states=encoder_hidden_states,
attention_mask=attention_mask,
) + hidden_states
elif self.attn2_0 is not None:
norm_hidden_states = (
self.norm2(hidden_states, timestep)
if self.use_ada_layer_norm
else self.norm2(hidden_states)
)
level = self.depth
full_hidden_states = (
self.attn2_0(
norm_hidden_states,
encoder_hidden_states=encoder_hidden_states,
attention_mask=attention_mask,
) * full_mask[level][:, :, None]
)
bz, sz, c = full_hidden_states.shape
sz_sqrt = int(sz ** 0.5)
full_hidden_states = full_hidden_states.reshape(
bz, sz_sqrt, sz_sqrt, c).permute(0, 3, 1, 2)
full_hidden_states = self.zero_conv_full(full_hidden_states).permute(0, 2, 3, 1).reshape(bz, -1, c)
face_hidden_state = (
self.attn2_1(
norm_hidden_states,
encoder_hidden_states=encoder_hidden_states,
attention_mask=attention_mask,
) * face_mask[level][:, :, None]
)
face_hidden_state = face_hidden_state.reshape(
bz, sz_sqrt, sz_sqrt, c).permute(0, 3, 1, 2)
face_hidden_state = self.zero_conv_face(
face_hidden_state).permute(0, 2, 3, 1).reshape(bz, -1, c)
lip_hidden_state = (
self.attn2_2(
norm_hidden_states,
encoder_hidden_states=encoder_hidden_states,
attention_mask=attention_mask,
) * lip_mask[level][:, :, None]
) # [32, 4096, 320]
lip_hidden_state = lip_hidden_state.reshape(
bz, sz_sqrt, sz_sqrt, c).permute(0, 3, 1, 2)
lip_hidden_state = self.zero_conv_lip(
lip_hidden_state).permute(0, 2, 3, 1).reshape(bz, -1, c)
if motion_scale is not None:
hidden_states = (
motion_scale[0] * full_hidden_states +
motion_scale[1] * face_hidden_state +
motion_scale[2] * lip_hidden_state + hidden_states
)
else:
hidden_states = (
full_hidden_states +
face_hidden_state +
lip_hidden_state + hidden_states
)
# Feed-forward
hidden_states = self.ff(self.norm3(hidden_states)) + hidden_states
return hidden_states
def zero_module(module):
"""
Zeroes out the parameters of a given module.
Args:
module (nn.Module): The module whose parameters need to be zeroed out.
Returns:
None.
"""
for p in module.parameters():
nn.init.zeros_(p)
return module