Spaces:

LanguageBind
/

Open-Sora-Plan-v1.0.0

Runtime error

Open-Sora-Plan-v1.0.0 / opensora /models /diffusion /latte /modules.py

LinB203

a220803 6 months ago

71.7 kB

	from importlib import import_module

	import numpy as np
	import torch

	import os
	import json

	from dataclasses import dataclass
	from einops import rearrange, repeat
	from typing import Any, Dict, Optional, Tuple, Callable
	from diffusers.models import Transformer2DModel
	from diffusers.utils import USE_PEFT_BACKEND, BaseOutput, deprecate, is_xformers_available
	from diffusers.models.embeddings import get_1d_sincos_pos_embed_from_grid, ImagePositionalEmbeddings, CaptionProjection, \
	PatchEmbed, CombinedTimestepSizeEmbeddings
	from diffusers.configuration_utils import ConfigMixin, register_to_config
	from diffusers.models.modeling_utils import ModelMixin
	from diffusers.models.attention import BasicTransformerBlock
	from diffusers.models.lora import LoRACompatibleConv, LoRACompatibleLinear

	import torch
	import torch.nn.functional as F
	from torch import nn
	from diffusers.utils.torch_utils import maybe_allow_in_graph
	from diffusers.models.embeddings import SinusoidalPositionalEmbedding
	from diffusers.models.normalization import AdaLayerNorm, AdaLayerNormZero
	from diffusers.models.attention_processor import SpatialNorm, LORA_ATTENTION_PROCESSORS, \
	CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor, CustomDiffusionAttnProcessor2_0, \
	AttnAddedKVProcessor, AttnAddedKVProcessor2_0, SlicedAttnAddedKVProcessor, XFormersAttnAddedKVProcessor, \
	LoRAAttnAddedKVProcessor, LoRAXFormersAttnProcessor, XFormersAttnProcessor, LoRAAttnProcessor2_0, LoRAAttnProcessor, \
	AttnProcessor, SlicedAttnProcessor, logger
	from diffusers.models.activations import GEGLU, GELU, ApproximateGELU

	from dataclasses import dataclass

	from torch import nn

	from opensora.models.diffusion.utils.pos_embed import get_2d_sincos_pos_embed

	if is_xformers_available():
	import xformers
	import xformers.ops
	else:
	xformers = None


	class PatchEmbed(nn.Module):
	    """Project a 2D latent into patch tokens and add sin-cos positional embeddings.

	    A strided convolution (kernel and stride both equal to ``patch_size``) maps the
	    input to ``embed_dim`` channels. A positional table for the constructor's grid is
	    built once and kept as a non-persistent buffer; ``forward`` rebuilds it when the
	    incoming latent has a different patch grid.
	    """

	    def __init__(
	        self,
	        height=224,
	        width=224,
	        patch_size=16,
	        in_channels=3,
	        embed_dim=768,
	        layer_norm=False,
	        flatten=True,
	        bias=True,
	        interpolation_scale=1,
	    ):
	        super().__init__()

	        # Patch-grid extent along each axis for the reference resolution.
	        grid_h = height // patch_size
	        grid_w = width // patch_size

	        self.flatten = flatten
	        self.layer_norm = layer_norm
	        self.patch_size = patch_size
	        self.interpolation_scale = interpolation_scale

	        self.proj = nn.Conv2d(
	            in_channels,
	            embed_dim,
	            kernel_size=(patch_size, patch_size),
	            stride=patch_size,
	            bias=bias,
	        )
	        self.norm = nn.LayerNorm(embed_dim, elementwise_affine=False, eps=1e-6) if layer_norm else None

	        # See:
	        # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L161
	        self.height, self.width = grid_h, grid_w
	        self.base_size = grid_h

	        # The cached table uses a single integer grid size: the integer square root
	        # of the total patch count.
	        grid_side = int((grid_h * grid_w) ** 0.5)
	        table = get_2d_sincos_pos_embed(
	            embed_dim,
	            grid_side,
	            base_size=self.base_size,
	            interpolation_scale=self.interpolation_scale,
	        )
	        cached = torch.from_numpy(table).float().unsqueeze(0)
	        self.register_buffer("pos_embed", cached, persistent=False)

	    def forward(self, latent):
	        """Return the patch projection of ``latent`` plus its positional embedding."""
	        grid_h = latent.shape[-2] // self.patch_size
	        grid_w = latent.shape[-1] // self.patch_size

	        tokens = self.proj(latent)
	        if self.flatten:
	            tokens = tokens.flatten(2).transpose(1, 2)  # BCHW -> BNC
	        if self.layer_norm:
	            tokens = self.norm(tokens)

	        # Reuse the cached table when the grid matches; otherwise rebuild it for this input.
	        # (For PixArt-Alpha: https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L162C151-L162C160)
	        if self.height == grid_h and self.width == grid_w:
	            positional = self.pos_embed
	        else:
	            rebuilt = get_2d_sincos_pos_embed(
	                embed_dim=self.pos_embed.shape[-1],
	                grid_size=(grid_h, grid_w),
	                base_size=self.base_size,
	                interpolation_scale=self.interpolation_scale,
	            )
	            positional = torch.from_numpy(rebuilt).float().unsqueeze(0).to(tokens.device)

	        return (tokens + positional).to(tokens.dtype)


	@maybe_allow_in_graph
	class Attention(nn.Module):
	    r"""
	    A cross attention layer.

	    Parameters:
	        query_dim (`int`):
	            The number of channels in the query.
	        cross_attention_dim (`int`, optional):
	            The number of channels in the encoder_hidden_states. If not given, defaults to `query_dim`.
	        heads (`int`, optional, defaults to 8):
	            The number of heads to use for multi-head attention.
	        dim_head (`int`, optional, defaults to 64):
	            The number of channels in each head.
	        dropout (`float`, optional, defaults to 0.0):
	            The dropout probability to use.
	        bias (`bool`, optional, defaults to False):
	            Set to `True` for the query, key, and value linear layers to contain a bias parameter.
	        upcast_attention (`bool`, optional, defaults to False):
	            Set to `True` to upcast the attention computation to `float32`.
	        upcast_softmax (`bool`, optional, defaults to False):
	            Set to `True` to upcast the softmax computation to `float32`.
	        cross_attention_norm (`str`, optional, defaults to `None`):
	            The type of normalization to use for the cross attention. Can be `None`, `layer_norm`, or `group_norm`.
	        cross_attention_norm_num_groups (`int`, optional, defaults to 32):
	            The number of groups to use for the group norm in the cross attention.
	        added_kv_proj_dim (`int`, optional, defaults to `None`):
	            The number of channels to use for the added key and value projections. If `None`, no projection is used.
	        norm_num_groups (`int`, optional, defaults to `None`):
	            The number of groups to use for the group norm in the attention.
	        spatial_norm_dim (`int`, optional, defaults to `None`):
	            The number of channels to use for the spatial normalization.
	        out_bias (`bool`, optional, defaults to `True`):
	            Set to `True` to use a bias in the output linear layer.
	        scale_qk (`bool`, optional, defaults to `True`):
	            Set to `True` to scale the query and key by `1 / sqrt(dim_head)`.
	        only_cross_attention (`bool`, optional, defaults to `False`):
	            Set to `True` to only use cross attention and not added_kv_proj_dim. Can only be set to `True` if
	            `added_kv_proj_dim` is not `None`.
	        eps (`float`, optional, defaults to 1e-5):
	            An additional value added to the denominator in group normalization that is used for numerical stability.
	        rescale_output_factor (`float`, optional, defaults to 1.0):
	            A factor to rescale the output by dividing it with this value.
	        residual_connection (`bool`, optional, defaults to `False`):
	            Set to `True` to add the residual connection to the output.
	        _from_deprecated_attn_block (`bool`, optional, defaults to `False`):
	            Set to `True` if the attention block is loaded from a deprecated state dict.
	        processor (`AttnProcessor`, optional, defaults to `None`):
	            The attention processor to use. If `None`, defaults to `AttnProcessor2_0` if `torch 2.x` is used and
	            `AttnProcessor` otherwise.
	        attention_mode (`str`, optional, defaults to `'xformers'`):
	            Value handed to `AttnProcessor2_0` every time this layer builds its default processor (at
	            construction and when a setter falls back to the default). `AttnProcessor2_0` in this file
	            handles `'flash'`, `'xformers'` and `'math'`.
	    """

	    def __init__(
	        self,
	        query_dim: int,
	        cross_attention_dim: Optional[int] = None,
	        heads: int = 8,
	        dim_head: int = 64,
	        dropout: float = 0.0,
	        bias: bool = False,
	        upcast_attention: bool = False,
	        upcast_softmax: bool = False,
	        cross_attention_norm: Optional[str] = None,
	        cross_attention_norm_num_groups: int = 32,
	        added_kv_proj_dim: Optional[int] = None,
	        norm_num_groups: Optional[int] = None,
	        spatial_norm_dim: Optional[int] = None,
	        out_bias: bool = True,
	        scale_qk: bool = True,
	        only_cross_attention: bool = False,
	        eps: float = 1e-5,
	        rescale_output_factor: float = 1.0,
	        residual_connection: bool = False,
	        _from_deprecated_attn_block: bool = False,
	        processor: Optional["AttnProcessor"] = None,
	        attention_mode: str = 'xformers',
	    ):
	        super().__init__()
	        self.inner_dim = dim_head * heads
	        self.cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
	        self.upcast_attention = upcast_attention
	        self.upcast_softmax = upcast_softmax
	        self.rescale_output_factor = rescale_output_factor
	        self.residual_connection = residual_connection
	        self.dropout = dropout
	        # Remembered so that every later fallback to the default processor keeps the
	        # mode the layer was configured with instead of resetting it.
	        self.attention_mode = attention_mode

	        # we make use of this private variable to know whether this class is loaded
	        # with a deprecated state dict so that we can convert it on the fly
	        self._from_deprecated_attn_block = _from_deprecated_attn_block

	        self.scale_qk = scale_qk
	        self.scale = dim_head**-0.5 if self.scale_qk else 1.0

	        self.heads = heads
	        # for slice_size > 0 the attention score computation
	        # is split across the batch axis to save memory
	        # You can set slice_size with `set_attention_slice`
	        self.sliceable_head_dim = heads

	        self.added_kv_proj_dim = added_kv_proj_dim
	        self.only_cross_attention = only_cross_attention

	        if self.added_kv_proj_dim is None and self.only_cross_attention:
	            raise ValueError(
	                "`only_cross_attention` can only be set to True if `added_kv_proj_dim` is not None. Make sure to set either `only_cross_attention=False` or define `added_kv_proj_dim`."
	            )

	        if norm_num_groups is not None:
	            self.group_norm = nn.GroupNorm(num_channels=query_dim, num_groups=norm_num_groups, eps=eps, affine=True)
	        else:
	            self.group_norm = None

	        if spatial_norm_dim is not None:
	            self.spatial_norm = SpatialNorm(f_channels=query_dim, zq_channels=spatial_norm_dim)
	        else:
	            self.spatial_norm = None

	        if cross_attention_norm is None:
	            self.norm_cross = None
	        elif cross_attention_norm == "layer_norm":
	            self.norm_cross = nn.LayerNorm(self.cross_attention_dim)
	        elif cross_attention_norm == "group_norm":
	            if self.added_kv_proj_dim is not None:
	                # The given `encoder_hidden_states` are initially of shape
	                # (batch_size, seq_len, added_kv_proj_dim) before being projected
	                # to (batch_size, seq_len, cross_attention_dim). The norm is applied
	                # before the projection, so we need to use `added_kv_proj_dim` as
	                # the number of channels for the group norm.
	                norm_cross_num_channels = added_kv_proj_dim
	            else:
	                norm_cross_num_channels = self.cross_attention_dim

	            self.norm_cross = nn.GroupNorm(
	                num_channels=norm_cross_num_channels, num_groups=cross_attention_norm_num_groups, eps=1e-5, affine=True
	            )
	        else:
	            raise ValueError(
	                f"unknown cross_attention_norm: {cross_attention_norm}. Should be None, 'layer_norm' or 'group_norm'"
	            )

	        if USE_PEFT_BACKEND:
	            linear_cls = nn.Linear
	        else:
	            linear_cls = LoRACompatibleLinear

	        self.to_q = linear_cls(query_dim, self.inner_dim, bias=bias)

	        if not self.only_cross_attention:
	            # only relevant for the `AddedKVProcessor` classes
	            self.to_k = linear_cls(self.cross_attention_dim, self.inner_dim, bias=bias)
	            self.to_v = linear_cls(self.cross_attention_dim, self.inner_dim, bias=bias)
	        else:
	            self.to_k = None
	            self.to_v = None

	        if self.added_kv_proj_dim is not None:
	            self.add_k_proj = linear_cls(added_kv_proj_dim, self.inner_dim)
	            self.add_v_proj = linear_cls(added_kv_proj_dim, self.inner_dim)

	        self.to_out = nn.ModuleList([])
	        self.to_out.append(linear_cls(self.inner_dim, query_dim, bias=out_bias))
	        self.to_out.append(nn.Dropout(dropout))

	        # set attention processor
	        if processor is None:
	            processor = self._default_processor()
	        self.set_processor(processor)

	    def _default_processor(self):
	        r"""
	        Build the processor used when none is explicitly requested.

	        We use the AttnProcessor2_0 by default when torch 2.x is used which uses
	        torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
	        but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1

	        The `attention_mode` given at construction is always forwarded so that resetting
	        the processor does not silently change the configured mode.
	        """
	        if hasattr(F, "scaled_dot_product_attention") and self.scale_qk:
	            return AttnProcessor2_0(self.attention_mode)
	        return AttnProcessor()

	    def set_use_memory_efficient_attention_xformers(
	        self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[Callable] = None
	    ) -> None:
	        r"""
	        Set whether to use memory efficient attention from `xformers` or not.

	        Args:
	            use_memory_efficient_attention_xformers (`bool`):
	                Whether to use memory efficient attention from `xformers` or not.
	            attention_op (`Callable`, optional):
	                The attention operation to use. Defaults to `None` which uses the default attention operation from
	                `xformers`.

	        Raises:
	            NotImplementedError: If xformers is requested for an added-KV processor that is also a LoRA or
	                custom diffusion processor.
	            ModuleNotFoundError: If xformers is requested but not installed.
	            ValueError: If xformers is requested but CUDA is not available.
	        """
	        is_lora = hasattr(self, "processor") and isinstance(
	            self.processor,
	            LORA_ATTENTION_PROCESSORS,
	        )
	        is_custom_diffusion = hasattr(self, "processor") and isinstance(
	            self.processor,
	            (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor, CustomDiffusionAttnProcessor2_0),
	        )
	        is_added_kv_processor = hasattr(self, "processor") and isinstance(
	            self.processor,
	            (
	                AttnAddedKVProcessor,
	                AttnAddedKVProcessor2_0,
	                SlicedAttnAddedKVProcessor,
	                XFormersAttnAddedKVProcessor,
	                LoRAAttnAddedKVProcessor,
	            ),
	        )

	        if use_memory_efficient_attention_xformers:
	            if is_added_kv_processor and (is_lora or is_custom_diffusion):
	                raise NotImplementedError(
	                    f"Memory efficient attention is currently not supported for LoRA or custom diffusion for attention processor type {self.processor}"
	                )
	            if not is_xformers_available():
	                raise ModuleNotFoundError(
	                    (
	                        "Refer to https://github.com/facebookresearch/xformers for more information on how to install"
	                        " xformers"
	                    ),
	                    name="xformers",
	                )
	            elif not torch.cuda.is_available():
	                raise ValueError(
	                    "torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is"
	                    " only available for GPU "
	                )
	            else:
	                # Make sure we can run the memory efficient attention; any failure
	                # propagates to the caller unchanged.
	                _ = xformers.ops.memory_efficient_attention(
	                    torch.randn((1, 2, 40), device="cuda"),
	                    torch.randn((1, 2, 40), device="cuda"),
	                    torch.randn((1, 2, 40), device="cuda"),
	                )

	            if is_lora:
	                # TODO (sayakpaul): should we throw a warning if someone wants to use the xformers
	                # variant when using PT 2.0 now that we have LoRAAttnProcessor2_0?
	                processor = LoRAXFormersAttnProcessor(
	                    hidden_size=self.processor.hidden_size,
	                    cross_attention_dim=self.processor.cross_attention_dim,
	                    rank=self.processor.rank,
	                    attention_op=attention_op,
	                )
	                processor.load_state_dict(self.processor.state_dict())
	                processor.to(self.processor.to_q_lora.up.weight.device)
	            elif is_custom_diffusion:
	                processor = CustomDiffusionXFormersAttnProcessor(
	                    train_kv=self.processor.train_kv,
	                    train_q_out=self.processor.train_q_out,
	                    hidden_size=self.processor.hidden_size,
	                    cross_attention_dim=self.processor.cross_attention_dim,
	                    attention_op=attention_op,
	                )
	                processor.load_state_dict(self.processor.state_dict())
	                if hasattr(self.processor, "to_k_custom_diffusion"):
	                    processor.to(self.processor.to_k_custom_diffusion.weight.device)
	            elif is_added_kv_processor:
	                # TODO(Patrick, Suraj, William) - currently xformers doesn't work for UnCLIP
	                # which uses this type of cross attention ONLY because the attention mask of format
	                # [0, ..., -10.000, ..., 0, ...,] is not supported
	                # throw warning
	                logger.info(
	                    "Memory efficient attention with `xformers` might currently not work correctly if an attention mask is required for the attention operation."
	                )
	                processor = XFormersAttnAddedKVProcessor(attention_op=attention_op)
	            else:
	                processor = XFormersAttnProcessor(attention_op=attention_op)
	        else:
	            if is_lora:
	                attn_processor_class = (
	                    LoRAAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else LoRAAttnProcessor
	                )
	                processor = attn_processor_class(
	                    hidden_size=self.processor.hidden_size,
	                    cross_attention_dim=self.processor.cross_attention_dim,
	                    rank=self.processor.rank,
	                )
	                processor.load_state_dict(self.processor.state_dict())
	                processor.to(self.processor.to_q_lora.up.weight.device)
	            elif is_custom_diffusion:
	                attn_processor_class = (
	                    CustomDiffusionAttnProcessor2_0
	                    if hasattr(F, "scaled_dot_product_attention")
	                    else CustomDiffusionAttnProcessor
	                )
	                processor = attn_processor_class(
	                    train_kv=self.processor.train_kv,
	                    train_q_out=self.processor.train_q_out,
	                    hidden_size=self.processor.hidden_size,
	                    cross_attention_dim=self.processor.cross_attention_dim,
	                )
	                processor.load_state_dict(self.processor.state_dict())
	                if hasattr(self.processor, "to_k_custom_diffusion"):
	                    processor.to(self.processor.to_k_custom_diffusion.weight.device)
	            else:
	                # Fall back to the default processor, keeping the configured attention_mode.
	                processor = self._default_processor()

	        self.set_processor(processor)

	    def set_attention_slice(self, slice_size: int) -> None:
	        r"""
	        Set the slice size for attention computation.

	        Args:
	            slice_size (`int`):
	                The slice size for attention computation. `None` switches back to a non-sliced processor.

	        Raises:
	            ValueError: If `slice_size` is larger than `self.sliceable_head_dim`.
	        """
	        if slice_size is not None and slice_size > self.sliceable_head_dim:
	            raise ValueError(f"slice_size {slice_size} has to be smaller or equal to {self.sliceable_head_dim}.")

	        if slice_size is not None and self.added_kv_proj_dim is not None:
	            processor = SlicedAttnAddedKVProcessor(slice_size)
	        elif slice_size is not None:
	            processor = SlicedAttnProcessor(slice_size)
	        elif self.added_kv_proj_dim is not None:
	            processor = AttnAddedKVProcessor()
	        else:
	            # Fall back to the default processor, keeping the configured attention_mode.
	            processor = self._default_processor()

	        self.set_processor(processor)

	    def set_processor(self, processor: "AttnProcessor", _remove_lora: bool = False) -> None:
	        r"""
	        Set the attention processor to use.

	        Args:
	            processor (`AttnProcessor`):
	                The attention processor to use.
	            _remove_lora (`bool`, optional, defaults to `False`):
	                Set to `True` to remove LoRA layers from the model.
	        """
	        if not USE_PEFT_BACKEND and hasattr(self, "processor") and _remove_lora and self.to_q.lora_layer is not None:
	            deprecate(
	                "set_processor to offload LoRA",
	                "0.26.0",
	                "In detail, removing LoRA layers via calling `set_default_attn_processor` is deprecated. Please make sure to call `pipe.unload_lora_weights()` instead.",
	            )
	            # TODO(Patrick, Sayak) - this can be deprecated once PEFT LoRA integration is complete
	            # We need to remove all LoRA layers
	            # Don't forget to remove ALL `_remove_lora` from the codebase
	            for module in self.modules():
	                if hasattr(module, "set_lora_layer"):
	                    module.set_lora_layer(None)

	        # if current processor is in `self._modules` and if passed `processor` is not, we need to
	        # pop `processor` from `self._modules`
	        if (
	            hasattr(self, "processor")
	            and isinstance(self.processor, torch.nn.Module)
	            and not isinstance(processor, torch.nn.Module)
	        ):
	            logger.info(f"You are removing possibly trained weights of {self.processor} with {processor}")
	            self._modules.pop("processor")

	        self.processor = processor

	    def get_processor(self, return_deprecated_lora: bool = False) -> "AttentionProcessor":
	        r"""
	        Get the attention processor in use.

	        Args:
	            return_deprecated_lora (`bool`, optional, defaults to `False`):
	                Set to `True` to return the deprecated LoRA attention processor.

	        Returns:
	            "AttentionProcessor": The attention processor in use.

	        Raises:
	            ValueError: If only some layers have LoRA activated, or if no matching LoRA processor class is
	                handled for the current processor.
	        """
	        if not return_deprecated_lora:
	            return self.processor

	        # TODO(Sayak, Patrick). The rest of the function is needed to ensure backwards compatible
	        # serialization format for LoRA Attention Processors. It should be deleted once the integration
	        # with PEFT is completed.
	        is_lora_activated = {
	            name: module.lora_layer is not None
	            for name, module in self.named_modules()
	            if hasattr(module, "lora_layer")
	        }

	        # 1. if no layer has a LoRA activated we can return the processor as usual
	        if not any(is_lora_activated.values()):
	            return self.processor

	        # `add_k_proj` and `add_v_proj` are excluded from the all-or-nothing check below
	        is_lora_activated.pop("add_k_proj", None)
	        is_lora_activated.pop("add_v_proj", None)
	        # 2. else it is not possible that only some layers have LoRA activated
	        if not all(is_lora_activated.values()):
	            raise ValueError(
	                f"Make sure that either all layers or no layers have LoRA activated, but have {is_lora_activated}"
	            )

	        # 3. And we need to merge the current LoRA layers into the corresponding LoRA attention processor
	        non_lora_processor_cls_name = self.processor.__class__.__name__
	        lora_processor_cls = getattr(import_module(__name__), "LoRA" + non_lora_processor_cls_name)

	        hidden_size = self.inner_dim

	        # now create a LoRA attention processor from the LoRA layers
	        if lora_processor_cls in [LoRAAttnProcessor, LoRAAttnProcessor2_0, LoRAXFormersAttnProcessor]:
	            kwargs = {
	                "cross_attention_dim": self.cross_attention_dim,
	                "rank": self.to_q.lora_layer.rank,
	                "network_alpha": self.to_q.lora_layer.network_alpha,
	                "q_rank": self.to_q.lora_layer.rank,
	                "q_hidden_size": self.to_q.lora_layer.out_features,
	                "k_rank": self.to_k.lora_layer.rank,
	                "k_hidden_size": self.to_k.lora_layer.out_features,
	                "v_rank": self.to_v.lora_layer.rank,
	                "v_hidden_size": self.to_v.lora_layer.out_features,
	                "out_rank": self.to_out[0].lora_layer.rank,
	                "out_hidden_size": self.to_out[0].lora_layer.out_features,
	            }

	            if hasattr(self.processor, "attention_op"):
	                kwargs["attention_op"] = self.processor.attention_op

	            lora_processor = lora_processor_cls(hidden_size, **kwargs)
	            lora_processor.to_q_lora.load_state_dict(self.to_q.lora_layer.state_dict())
	            lora_processor.to_k_lora.load_state_dict(self.to_k.lora_layer.state_dict())
	            lora_processor.to_v_lora.load_state_dict(self.to_v.lora_layer.state_dict())
	            lora_processor.to_out_lora.load_state_dict(self.to_out[0].lora_layer.state_dict())
	        elif lora_processor_cls == LoRAAttnAddedKVProcessor:
	            lora_processor = lora_processor_cls(
	                hidden_size,
	                cross_attention_dim=self.add_k_proj.weight.shape[0],
	                rank=self.to_q.lora_layer.rank,
	                network_alpha=self.to_q.lora_layer.network_alpha,
	            )
	            lora_processor.to_q_lora.load_state_dict(self.to_q.lora_layer.state_dict())
	            lora_processor.to_k_lora.load_state_dict(self.to_k.lora_layer.state_dict())
	            lora_processor.to_v_lora.load_state_dict(self.to_v.lora_layer.state_dict())
	            lora_processor.to_out_lora.load_state_dict(self.to_out[0].lora_layer.state_dict())

	            # only save if used
	            if self.add_k_proj.lora_layer is not None:
	                lora_processor.add_k_proj_lora.load_state_dict(self.add_k_proj.lora_layer.state_dict())
	                lora_processor.add_v_proj_lora.load_state_dict(self.add_v_proj.lora_layer.state_dict())
	            else:
	                lora_processor.add_k_proj_lora = None
	                lora_processor.add_v_proj_lora = None
	        else:
	            raise ValueError(f"{lora_processor_cls} does not exist.")

	        return lora_processor

	    def forward(
	        self,
	        hidden_states: torch.FloatTensor,
	        encoder_hidden_states: Optional[torch.FloatTensor] = None,
	        attention_mask: Optional[torch.FloatTensor] = None,
	        **cross_attention_kwargs,
	    ) -> torch.Tensor:
	        r"""
	        The forward method of the `Attention` class.

	        Args:
	            hidden_states (`torch.Tensor`):
	                The hidden states of the query.
	            encoder_hidden_states (`torch.Tensor`, optional):
	                The hidden states of the encoder.
	            attention_mask (`torch.Tensor`, optional):
	                The attention mask to use. If `None`, no mask is applied.
	            **cross_attention_kwargs:
	                Additional keyword arguments to pass along to the cross attention.

	        Returns:
	            `torch.Tensor`: The output of the attention layer.
	        """
	        # The `Attention` class can call different attention processors / attention functions
	        # here we simply pass along all tensors to the selected processor class
	        # For standard processors that are defined here, `**cross_attention_kwargs` is empty
	        return self.processor(
	            self,
	            hidden_states,
	            encoder_hidden_states=encoder_hidden_states,
	            attention_mask=attention_mask,
	            **cross_attention_kwargs,
	        )

	    def batch_to_head_dim(self, tensor: torch.Tensor) -> torch.Tensor:
	        r"""
	        Reshape the tensor from `[batch_size, seq_len, dim]` to `[batch_size // heads, seq_len, dim * heads]`. `heads`
	        is the number of heads initialized while constructing the `Attention` class.

	        Args:
	            tensor (`torch.Tensor`): The tensor to reshape.

	        Returns:
	            `torch.Tensor`: The reshaped tensor.
	        """
	        head_size = self.heads
	        batch_size, seq_len, dim = tensor.shape
	        tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
	        tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size // head_size, seq_len, dim * head_size)
	        return tensor

	    def head_to_batch_dim(self, tensor: torch.Tensor, out_dim: int = 3) -> torch.Tensor:
	        r"""
	        Reshape the tensor from `[batch_size, seq_len, dim]` to `[batch_size, heads, seq_len, dim // heads]`. `heads`
	        is the number of heads initialized while constructing the `Attention` class.

	        Args:
	            tensor (`torch.Tensor`): The tensor to reshape.
	            out_dim (`int`, optional, defaults to `3`): The output dimension of the tensor. If `3`, the tensor is
	                reshaped to `[batch_size * heads, seq_len, dim // heads]`.

	        Returns:
	            `torch.Tensor`: The reshaped tensor.
	        """
	        head_size = self.heads
	        batch_size, seq_len, dim = tensor.shape
	        tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size)
	        tensor = tensor.permute(0, 2, 1, 3)

	        if out_dim == 3:
	            tensor = tensor.reshape(batch_size * head_size, seq_len, dim // head_size)

	        return tensor

	    def get_attention_scores(
	        self, query: torch.Tensor, key: torch.Tensor, attention_mask: torch.Tensor = None
	    ) -> torch.Tensor:
	        r"""
	        Compute the attention scores.

	        Args:
	            query (`torch.Tensor`): The query tensor.
	            key (`torch.Tensor`): The key tensor.
	            attention_mask (`torch.Tensor`, optional): The attention mask to use. If `None`, no mask is applied.

	        Returns:
	            `torch.Tensor`: The attention probabilities/scores.
	        """
	        dtype = query.dtype
	        if self.upcast_attention:
	            query = query.float()
	            key = key.float()

	        if attention_mask is None:
	            # With beta=0 the contents of this buffer are ignored by baddbmm,
	            # so an uninitialized tensor of the right shape is sufficient.
	            baddbmm_input = torch.empty(
	                query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device
	            )
	            beta = 0
	        else:
	            # With beta=1 the mask is added to the scaled query-key product.
	            baddbmm_input = attention_mask
	            beta = 1

	        attention_scores = torch.baddbmm(
	            baddbmm_input,
	            query,
	            key.transpose(-1, -2),
	            beta=beta,
	            alpha=self.scale,
	        )
	        del baddbmm_input

	        if self.upcast_softmax:
	            attention_scores = attention_scores.float()

	        attention_probs = attention_scores.softmax(dim=-1)
	        del attention_scores

	        # Cast back to the dtype the query arrived in.
	        attention_probs = attention_probs.to(dtype)

	        return attention_probs

	    def prepare_attention_mask(
	        self, attention_mask: torch.Tensor, target_length: int, batch_size: int, out_dim: int = 3
	    ) -> torch.Tensor:
	        r"""
	        Prepare the attention mask for the attention computation.

	        Args:
	            attention_mask (`torch.Tensor`):
	                The attention mask to prepare.
	            target_length (`int`):
	                The target length of the attention mask. This is the length of the attention mask after padding.
	            batch_size (`int`):
	                The batch size, which is used to repeat the attention mask.
	            out_dim (`int`, optional, defaults to `3`):
	                The output dimension of the attention mask. Can be either `3` or `4`.

	        Returns:
	            `torch.Tensor`: The prepared attention mask.
	        """
	        head_size = self.heads
	        if attention_mask is None:
	            return attention_mask

	        current_length: int = attention_mask.shape[-1]
	        if current_length != target_length:
	            if attention_mask.device.type == "mps":
	                # HACK: MPS: Does not support padding by greater than dimension of input tensor.
	                # Instead, we can manually construct the padding tensor.
	                padding_shape = (attention_mask.shape[0], attention_mask.shape[1], target_length)
	                padding = torch.zeros(padding_shape, dtype=attention_mask.dtype, device=attention_mask.device)
	                attention_mask = torch.cat([attention_mask, padding], dim=2)
	            else:
	                # TODO: for pipelines such as stable-diffusion, padding cross-attn mask:
	                #       we want to instead pad by (0, remaining_length), where remaining_length is:
	                #       remaining_length: int = target_length - current_length
	                # TODO: re-enable tests/models/test_models_unet_2d_condition.py#test_model_xattn_padding
	                attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)

	        if out_dim == 3:
	            if attention_mask.shape[0] < batch_size * head_size:
	                attention_mask = attention_mask.repeat_interleave(head_size, dim=0)
	        elif out_dim == 4:
	            attention_mask = attention_mask.unsqueeze(1)
	            attention_mask = attention_mask.repeat_interleave(head_size, dim=1)

	        return attention_mask

	    def norm_encoder_hidden_states(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
	        r"""
	        Normalize the encoder hidden states. Requires `self.norm_cross` to be specified when constructing the
	        `Attention` class.

	        Args:
	            encoder_hidden_states (`torch.Tensor`): Hidden states of the encoder.

	        Returns:
	            `torch.Tensor`: The normalized encoder hidden states.
	        """
	        assert self.norm_cross is not None, "self.norm_cross must be defined to call self.norm_encoder_hidden_states"

	        if isinstance(self.norm_cross, nn.LayerNorm):
	            encoder_hidden_states = self.norm_cross(encoder_hidden_states)
	        elif isinstance(self.norm_cross, nn.GroupNorm):
	            # Group norm norms along the channels dimension and expects
	            # input to be in the shape of (N, C, *). In this case, we want
	            # to norm along the hidden dimension, so we need to move
	            # (batch_size, sequence_length, hidden_size) ->
	            # (batch_size, hidden_size, sequence_length)
	            encoder_hidden_states = encoder_hidden_states.transpose(1, 2)
	            encoder_hidden_states = self.norm_cross(encoder_hidden_states)
	            encoder_hidden_states = encoder_hidden_states.transpose(1, 2)
	        else:
	            assert False

	        return encoder_hidden_states

	class AttnProcessor2_0:
	    r"""
	    Attention processor built on `torch.nn.functional.scaled_dot_product_attention` (PyTorch 2.0+).

	    `attention_mode` picks the call path in `__call__`: `'flash'` and `'xformers'` run inside a
	    `torch.backends.cuda.sdp_kernel` context with a single backend flag enabled, `'math'` calls the
	    function without such a context, and any other value raises `NotImplementedError` at call time.
	    """

	    def __init__(self, attention_mode='xformers'):
	        self.attention_mode = attention_mode
	        if not hasattr(F, "scaled_dot_product_attention"):
	            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")

	    def __call__(
	        self,
	        attn: Attention,
	        hidden_states: torch.FloatTensor,
	        encoder_hidden_states: Optional[torch.FloatTensor] = None,
	        attention_mask: Optional[torch.FloatTensor] = None,
	        temb: Optional[torch.FloatTensor] = None,
	        scale: float = 1.0,
	    ) -> torch.FloatTensor:
	        skip = hidden_states

	        # `scale` is only passed to the projection layers when the PEFT backend is off.
	        proj_args = () if USE_PEFT_BACKEND else (scale,)

	        if attn.spatial_norm is not None:
	            hidden_states = attn.spatial_norm(hidden_states, temb)

	        input_ndim = hidden_states.ndim
	        if input_ndim == 4:
	            # Flatten the two trailing axes into one sequence axis and move channels last.
	            batch_size, channel, height, width = hidden_states.shape
	            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

	        # Batch size and sequence length come from the encoder states when they are given.
	        if encoder_hidden_states is None:
	            batch_size, sequence_length, _ = hidden_states.shape
	        else:
	            batch_size, sequence_length, _ = encoder_hidden_states.shape

	        if attention_mask is not None:
	            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
	            # scaled_dot_product_attention expects attention_mask shape to be
	            # (batch, heads, source_length, target_length)
	            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])

	        if attn.group_norm is not None:
	            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

	        query = attn.to_q(hidden_states, *proj_args)

	        # Without encoder states the layer attends over its own (normalized) input.
	        if encoder_hidden_states is None:
	            encoder_hidden_states = hidden_states
	        elif attn.norm_cross:
	            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

	        key = attn.to_k(encoder_hidden_states, *proj_args)
	        value = attn.to_v(encoder_hidden_states, *proj_args)

	        head_dim = key.shape[-1] // attn.heads

	        def split_heads(projected):
	            # (batch, seq, heads * head_dim) -> (batch, heads, seq, head_dim)
	            return projected.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

	        query = split_heads(query)
	        key = split_heads(key)
	        value = split_heads(value)

	        # the output of sdp = (batch, num_heads, seq_len, head_dim)
	        # TODO: add support for attn.scale when we move to Torch 2.1
	        mode = self.attention_mode
	        if mode == 'flash':
	            # This path never forwards a mask, so only an absent or all-true mask is accepted.
	            assert attention_mask is None or torch.all(attention_mask.bool()), 'flash-attn do not support attention_mask'
	            with torch.backends.cuda.sdp_kernel(enable_math=False, enable_flash=True, enable_mem_efficient=False):
	                attended = F.scaled_dot_product_attention(
	                    query, key, value, dropout_p=0.0, is_causal=False
	                )
	        elif mode == 'xformers':
	            with torch.backends.cuda.sdp_kernel(enable_math=False, enable_flash=False, enable_mem_efficient=True):
	                attended = F.scaled_dot_product_attention(
	                    query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
	                )
	        elif mode == 'math':
	            attended = F.scaled_dot_product_attention(
	                query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
	            )
	        else:
	            raise NotImplementedError(f'Found attention_mode: {self.attention_mode}')

	        # Merge the heads back: (batch, heads, seq, head_dim) -> (batch, seq, heads * head_dim)
	        attended = attended.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
	        attended = attended.to(query.dtype)

	        # linear proj followed by dropout
	        out = attn.to_out[0](attended, *proj_args)
	        out = attn.to_out[1](out)

	        if input_ndim == 4:
	            # Restore the original 4D layout.
	            out = out.transpose(-1, -2).reshape(batch_size, channel, height, width)

	        if attn.residual_connection:
	            out = out + skip

	        return out / attn.rescale_output_factor

	@maybe_allow_in_graph
	class GatedSelfAttentionDense(nn.Module):
	    r"""
	    Gated self-attention dense layer that fuses visual tokens with object tokens.

	    Object features are projected to the visual width, appended to the visual tokens along the sequence axis and
	    passed through self-attention; only the visual part of the result is added back, scaled by a learnable
	    `tanh` gate. A second gated residual applies a GEGLU feed-forward.

	    Parameters:
	        query_dim (`int`): The number of channels in the query.
	        context_dim (`int`): The number of channels in the context.
	        n_heads (`int`): The number of heads to use for attention.
	        d_head (`int`): The number of channels in each head.
	    """

	    def __init__(self, query_dim: int, context_dim: int, n_heads: int, d_head: int):
	        super().__init__()

	        # Object features have to share the visual width before they can be concatenated with it.
	        self.linear = nn.Linear(context_dim, query_dim)

	        self.attn = Attention(query_dim=query_dim, heads=n_heads, dim_head=d_head)
	        self.ff = FeedForward(query_dim, activation_fn="geglu")

	        self.norm1 = nn.LayerNorm(query_dim)
	        self.norm2 = nn.LayerNorm(query_dim)

	        # Scalar gates start at 0.0, so tanh(gate) is 0 and both residual branches are scaled by zero at first.
	        self.alpha_attn = nn.Parameter(torch.tensor(0.0))
	        self.alpha_dense = nn.Parameter(torch.tensor(0.0))

	        self.enabled = True

	    def forward(self, x: torch.Tensor, objs: torch.Tensor) -> torch.Tensor:
	        """
	        Fuse `objs` into `x`; returns `x` untouched when the layer is disabled.

	        Args:
	            x (`torch.Tensor`): Visual tokens; axis 1 is treated as the sequence axis.
	            objs (`torch.Tensor`): Object tokens whose last axis has `context_dim` channels.

	        Returns:
	            `torch.Tensor`: Tensor with the same sequence length as `x`.
	        """
	        if self.enabled:
	            visual_len = x.shape[1]
	            projected_objs = self.linear(objs)

	            # Attend over visual + object tokens, then keep only the leading visual positions.
	            joint_tokens = torch.cat([x, projected_objs], dim=1)
	            attended = self.attn(self.norm1(joint_tokens))
	            x = x + self.alpha_attn.tanh() * attended[:, :visual_len, :]

	            dense_out = self.ff(self.norm2(x))
	            x = x + self.alpha_dense.tanh() * dense_out

	        return x


	class FeedForward(nn.Module):
	    r"""
	    A feed-forward layer: activation projection, dropout, output projection and an optional final dropout.

	    Parameters:
	        dim (`int`): The number of channels in the input.
	        dim_out (`int`, optional): The number of channels in the output. If not given, defaults to `dim`.
	        mult (`int`, optional, defaults to 4): The multiplier to use for the hidden dimension.
	        dropout (`float`, optional, defaults to 0.0): The dropout probability to use.
	        activation_fn (`str`, optional, defaults to `"geglu"`): Activation function to be used in feed-forward.
	            One of `"gelu"`, `"gelu-approximate"`, `"geglu"` or `"geglu-approximate"`.
	        final_dropout (`bool` optional, defaults to False): Apply a final dropout.

	    Raises:
	        ValueError: If `activation_fn` is not one of the supported names.
	    """

	    def __init__(
	        self,
	        dim: int,
	        dim_out: Optional[int] = None,
	        mult: int = 4,
	        dropout: float = 0.0,
	        activation_fn: str = "geglu",
	        final_dropout: bool = False,
	    ):
	        super().__init__()

	        # Reject unknown names up front; previously they surfaced later as an UnboundLocalError on `act_fn`.
	        supported_activations = ("gelu", "gelu-approximate", "geglu", "geglu-approximate")
	        if activation_fn not in supported_activations:
	            raise ValueError(
	                f"Unsupported `activation_fn`: {activation_fn!r}. Expected one of {supported_activations}."
	            )

	        inner_dim = int(dim * mult)
	        dim_out = dim_out if dim_out is not None else dim
	        linear_cls = LoRACompatibleLinear if not USE_PEFT_BACKEND else nn.Linear

	        # Each activation module also performs the input projection from `dim` to `inner_dim`.
	        if activation_fn == "gelu":
	            act_fn = GELU(dim, inner_dim)
	        elif activation_fn == "gelu-approximate":
	            act_fn = GELU(dim, inner_dim, approximate="tanh")
	        elif activation_fn == "geglu":
	            act_fn = GEGLU(dim, inner_dim)
	        else:  # "geglu-approximate"
	            act_fn = ApproximateGELU(dim, inner_dim)

	        self.net = nn.ModuleList([])
	        # project in
	        self.net.append(act_fn)
	        # project dropout
	        self.net.append(nn.Dropout(dropout))
	        # project out
	        self.net.append(linear_cls(inner_dim, dim_out))
	        # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
	        if final_dropout:
	            self.net.append(nn.Dropout(dropout))

	    def forward(self, hidden_states: torch.Tensor, scale: float = 1.0) -> torch.Tensor:
	        """
	        Run `hidden_states` through every module of `self.net` in order.

	        Args:
	            hidden_states (`torch.Tensor`): Input tensor.
	            scale (`float`, defaults to 1.0): Extra argument handed to the scale-aware modules only: `GEGLU`
	                always, and `LoRACompatibleLinear` when the PEFT backend is not in use.

	        Returns:
	            `torch.Tensor`: Output of the last module in `self.net`.
	        """
	        compatible_cls = (GEGLU,) if USE_PEFT_BACKEND else (GEGLU, LoRACompatibleLinear)
	        for module in self.net:
	            if isinstance(module, compatible_cls):
	                hidden_states = module(hidden_states, scale)
	            else:
	                hidden_states = module(hidden_states)
	        return hidden_states


	@maybe_allow_in_graph
	class BasicTransformerBlock_(nn.Module):
	    r"""
	    A basic Transformer block with a single attention sub-layer followed by a feed-forward sub-layer.

	    Unlike `BasicTransformerBlock`, this variant builds no second (cross-)attention layer. `double_self_attention`
	    and `encoder_attention_mask` are accepted so both classes share a signature, but they are not used here, and
	    `encoder_hidden_states` only reaches `attn1` when `only_cross_attention` is set.

	    Parameters:
	        dim (`int`): The number of channels in the input and output.
	        num_attention_heads (`int`): The number of heads to use for multi-head attention.
	        attention_head_dim (`int`): The number of channels in each head.
	        dropout (`float`, optional, defaults to 0.0): The dropout probability to use.
	        cross_attention_dim (`int`, optional): The size of the encoder_hidden_states vector. Used for `attn1` when
	            `only_cross_attention` is set, and as the context width of the GLIGEN fuser.
	        activation_fn (`str`, optional, defaults to `"geglu"`): Activation function to be used in feed-forward.
	        num_embeds_ada_norm (`int`, optional): The number of diffusion steps used during training. Required when
	            `norm_type` is `"ada_norm"` or `"ada_norm_zero"`.
	        attention_bias (`bool`, optional, defaults to `False`):
	            Configure if the attentions should contain a bias parameter.
	        only_cross_attention (`bool`, optional):
	            When set, `attn1` is built with `cross_attention_dim` and attends over `encoder_hidden_states`.
	        double_self_attention (`bool`, optional):
	            Unused in this variant (no second attention layer is built).
	        upcast_attention (`bool`, optional):
	            Whether to upcast the attention computation to float32. This is useful for mixed precision training.
	        norm_elementwise_affine (`bool`, optional, defaults to `True`):
	            Whether to use learnable elementwise affine parameters for normalization.
	        norm_type (`str`, optional, defaults to `"layer_norm"`):
	            The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"`, `"ada_norm_zero"` or
	            `"ada_norm_single"`.
	        norm_eps (`float`, optional, defaults to 1e-5): Epsilon passed to the `nn.LayerNorm` layers.
	        final_dropout (`bool` optional, defaults to False):
	            Whether to apply a final dropout after the last feed-forward layer.
	        attention_type (`str`, optional, defaults to `"default"`):
	            The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`.
	        positional_embeddings (`str`, optional, defaults to `None`):
	            The type of positional embeddings to apply to.
	        num_positional_embeddings (`int`, optional, defaults to `None`):
	            The maximum number of positional embeddings to apply.
	        attention_mode (`str`, optional, defaults to `"xformers"`):
	            Forwarded to `Attention` as `attention_mode` for `attn1`.
	    """

	    def __init__(
	        self,
	        dim: int,
	        num_attention_heads: int,
	        attention_head_dim: int,
	        dropout=0.0,
	        cross_attention_dim: Optional[int] = None,
	        activation_fn: str = "geglu",
	        num_embeds_ada_norm: Optional[int] = None,
	        attention_bias: bool = False,
	        only_cross_attention: bool = False,
	        double_self_attention: bool = False,
	        upcast_attention: bool = False,
	        norm_elementwise_affine: bool = True,
	        norm_type: str = "layer_norm",  # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single'
	        norm_eps: float = 1e-5,
	        final_dropout: bool = False,
	        attention_type: str = "default",
	        positional_embeddings: Optional[str] = None,
	        num_positional_embeddings: Optional[int] = None,
	        attention_mode: str = "xformers",
	    ):
	        super().__init__()
	        self.only_cross_attention = only_cross_attention

	        self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
	        self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
	        self.use_ada_layer_norm_single = norm_type == "ada_norm_single"
	        self.use_layer_norm = norm_type == "layer_norm"

	        if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
	            raise ValueError(
	                f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
	                f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
	            )

	        if positional_embeddings and (num_positional_embeddings is None):
	            raise ValueError(
	                "If `positional_embedding` type is defined, `num_positition_embeddings` must also be defined."
	            )

	        if positional_embeddings == "sinusoidal":
	            self.pos_embed = SinusoidalPositionalEmbedding(dim, max_seq_length=num_positional_embeddings)
	        else:
	            self.pos_embed = None

	        # Each sub-layer has its own normalization layer.
	        # 1. Self-Attn
	        if self.use_ada_layer_norm:
	            self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
	        elif self.use_ada_layer_norm_zero:
	            self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
	        else:
	            self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)

	        self.attn1 = Attention(
	            query_dim=dim,
	            heads=num_attention_heads,
	            dim_head=attention_head_dim,
	            dropout=dropout,
	            bias=attention_bias,
	            cross_attention_dim=cross_attention_dim if only_cross_attention else None,
	            upcast_attention=upcast_attention,
	            attention_mode=attention_mode,
	        )

	        # 2. Cross-Attn: intentionally absent in this variant (no `norm2` / `attn2`).

	        # 3. Feed-forward. `norm3` is created for every `norm_type`, including "ada_norm_single".
	        self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)

	        self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout)

	        # 4. Fuser (only exists for the gated attention types)
	        if attention_type == "gated" or attention_type == "gated-text-image":
	            self.fuser = GatedSelfAttentionDense(dim, cross_attention_dim, num_attention_heads, attention_head_dim)

	        # 5. Scale-shift for PixArt-Alpha.
	        if self.use_ada_layer_norm_single:
	            self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim ** 0.5)

	        # let chunk size default to None
	        self._chunk_size = None
	        self._chunk_dim = 0

	    def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0):
	        """
	        Configure chunked feed-forward evaluation.

	        Args:
	            chunk_size (`int`, optional): Size of each chunk along `dim`; `None` disables chunking.
	            dim (`int`, defaults to 0): Axis along which the feed-forward input is split.
	        """
	        self._chunk_size = chunk_size
	        self._chunk_dim = dim

	    def forward(
	        self,
	        hidden_states: torch.FloatTensor,
	        attention_mask: Optional[torch.FloatTensor] = None,
	        encoder_hidden_states: Optional[torch.FloatTensor] = None,
	        encoder_attention_mask: Optional[torch.FloatTensor] = None,
	        timestep: Optional[torch.LongTensor] = None,
	        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
	        class_labels: Optional[torch.LongTensor] = None,
	    ) -> torch.FloatTensor:
	        """
	        Apply attention and feed-forward, each wrapped in a residual connection.

	        Args:
	            hidden_states: Input tokens; axis 0 is read as the batch size.
	            attention_mask: Mask forwarded to `attn1`.
	            encoder_hidden_states: Passed to `attn1` only when `only_cross_attention` is set.
	            encoder_attention_mask: Unused in this variant.
	            timestep: Conditioning for the adaptive norms. For `"ada_norm_single"` it is reshaped to
	                `(batch_size, 6, -1)` and added to `scale_shift_table`, so it is presumably an already projected
	                modulation tensor rather than a raw step index (verify against the caller).
	            cross_attention_kwargs: Extra kwargs for `attn1`. `"scale"` is also used as the LoRA scale of the
	                feed-forward and `"gligen"` is popped and routed to the fuser.
	            class_labels: Only used by the `"ada_norm_zero"` norm.

	        Returns:
	            The transformed hidden states.

	        Raises:
	            ValueError: On an unsupported norm configuration, on `gligen` inputs without a fuser, or when the
	                chunked axis is not divisible by the chunk size.
	        """
	        # Notice that normalization is always applied before the real computation in the following blocks.
	        # 0. Self-Attention
	        batch_size = hidden_states.shape[0]

	        if self.use_ada_layer_norm:
	            norm_hidden_states = self.norm1(hidden_states, timestep)
	        elif self.use_ada_layer_norm_zero:
	            norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
	                hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
	            )
	        elif self.use_layer_norm:
	            norm_hidden_states = self.norm1(hidden_states)
	        elif self.use_ada_layer_norm_single:
	            # (1, 6, dim) table + per-sample modulation, split into six tensors along axis 1.
	            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
	                self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1)
	            ).chunk(6, dim=1)
	            norm_hidden_states = self.norm1(hidden_states)
	            norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
	            # Drops axis 1 only if it has size 1; otherwise this is a no-op.
	            norm_hidden_states = norm_hidden_states.squeeze(1)
	        else:
	            raise ValueError("Incorrect norm used")

	        if self.pos_embed is not None:
	            norm_hidden_states = self.pos_embed(norm_hidden_states)

	        # 1. Retrieve lora scale.
	        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0

	        # 2. Prepare GLIGEN inputs (copy first so the caller's dict is not mutated by `pop`)
	        cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
	        gligen_kwargs = cross_attention_kwargs.pop("gligen", None)

	        attn_output = self.attn1(
	            norm_hidden_states,
	            encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
	            attention_mask=attention_mask,
	            **cross_attention_kwargs,
	        )
	        if self.use_ada_layer_norm_zero:
	            attn_output = gate_msa.unsqueeze(1) * attn_output
	        elif self.use_ada_layer_norm_single:
	            attn_output = gate_msa * attn_output

	        hidden_states = attn_output + hidden_states
	        if hidden_states.ndim == 4:
	            hidden_states = hidden_states.squeeze(1)

	        # 2.5 GLIGEN Control
	        if gligen_kwargs is not None:
	            # The fuser only exists for gated attention types; fail with a clear message instead of an
	            # AttributeError raised from inside nn.Module.
	            if getattr(self, "fuser", None) is None:
	                raise ValueError(
	                    "`gligen` inputs were passed through `cross_attention_kwargs`, but this block was built "
	                    "without a fuser. Set `attention_type` to 'gated' or 'gated-text-image'."
	                )
	            hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"])

	        # 3. Cross-Attention: not present in this variant.

	        # 4. Feed-forward
	        # NOTE(review): only the "ada_norm_single" path re-normalizes the post-attention states (via `norm3`).
	        # For every other `norm_type` the feed-forward consumes the `norm_hidden_states` computed *before*
	        # attention. This is kept as-is to preserve numerical behaviour; confirm whether it is intended.
	        if self.use_ada_layer_norm_zero:
	            norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]

	        if self.use_ada_layer_norm_single:
	            norm_hidden_states = self.norm3(hidden_states)
	            norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp

	        if self._chunk_size is not None:
	            # "feed_forward_chunk_size" can be used to save memory
	            if norm_hidden_states.shape[self._chunk_dim] % self._chunk_size != 0:
	                raise ValueError(
	                    f"`hidden_states` dimension to be chunked: {norm_hidden_states.shape[self._chunk_dim]} has to be divisible by chunk size: {self._chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`."
	                )

	            num_chunks = norm_hidden_states.shape[self._chunk_dim] // self._chunk_size
	            ff_output = torch.cat(
	                [
	                    self.ff(hid_slice, scale=lora_scale)
	                    for hid_slice in norm_hidden_states.chunk(num_chunks, dim=self._chunk_dim)
	                ],
	                dim=self._chunk_dim,
	            )
	        else:
	            ff_output = self.ff(norm_hidden_states, scale=lora_scale)

	        if self.use_ada_layer_norm_zero:
	            ff_output = gate_mlp.unsqueeze(1) * ff_output
	        elif self.use_ada_layer_norm_single:
	            ff_output = gate_mlp * ff_output

	        hidden_states = ff_output + hidden_states
	        if hidden_states.ndim == 4:
	            hidden_states = hidden_states.squeeze(1)

	        return hidden_states


	@maybe_allow_in_graph
	class BasicTransformerBlock(nn.Module):
	    r"""
	    A basic Transformer block: self-attention, optional cross-attention, then feed-forward.

	    Parameters:
	        dim (`int`): The number of channels in the input and output.
	        num_attention_heads (`int`): The number of heads to use for multi-head attention.
	        attention_head_dim (`int`): The number of channels in each head.
	        dropout (`float`, optional, defaults to 0.0): The dropout probability to use.
	        cross_attention_dim (`int`, optional): The size of the encoder_hidden_states vector for cross attention.
	        activation_fn (`str`, optional, defaults to `"geglu"`): Activation function to be used in feed-forward.
	        num_embeds_ada_norm (`int`, optional): The number of diffusion steps used during training. Required when
	            `norm_type` is `"ada_norm"` or `"ada_norm_zero"`.
	        attention_bias (`bool`, optional, defaults to `False`):
	            Configure if the attentions should contain a bias parameter.
	        only_cross_attention (`bool`, optional):
	            Whether to use only cross-attention layers. In this case two cross attention layers are used.
	        double_self_attention (`bool`, optional):
	            Whether to use two self-attention layers. In this case no cross attention layers are used.
	        upcast_attention (`bool`, optional):
	            Whether to upcast the attention computation to float32. This is useful for mixed precision training.
	        norm_elementwise_affine (`bool`, optional, defaults to `True`):
	            Whether to use learnable elementwise affine parameters for normalization.
	        norm_type (`str`, optional, defaults to `"layer_norm"`):
	            The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"`, `"ada_norm_zero"` or
	            `"ada_norm_single"`. With `"ada_norm_single"` no `norm3` is created and the feed-forward input is
	            normalized by `norm2`, so a second attention layer must be configured.
	        norm_eps (`float`, optional, defaults to 1e-5): Epsilon passed to the `nn.LayerNorm` layers.
	        final_dropout (`bool` optional, defaults to False):
	            Whether to apply a final dropout after the last feed-forward layer.
	        attention_type (`str`, optional, defaults to `"default"`):
	            The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`.
	        positional_embeddings (`str`, optional, defaults to `None`):
	            The type of positional embeddings to apply to.
	        num_positional_embeddings (`int`, optional, defaults to `None`):
	            The maximum number of positional embeddings to apply.
	        attention_mode (`str`, optional, defaults to `"xformers"`):
	            Forwarded to `Attention` as `attention_mode` for `attn1`; `attn2` always uses `'xformers'`.
	    """

	    def __init__(
	        self,
	        dim: int,
	        num_attention_heads: int,
	        attention_head_dim: int,
	        dropout=0.0,
	        cross_attention_dim: Optional[int] = None,
	        activation_fn: str = "geglu",
	        num_embeds_ada_norm: Optional[int] = None,
	        attention_bias: bool = False,
	        only_cross_attention: bool = False,
	        double_self_attention: bool = False,
	        upcast_attention: bool = False,
	        norm_elementwise_affine: bool = True,
	        norm_type: str = "layer_norm",  # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single'
	        norm_eps: float = 1e-5,
	        final_dropout: bool = False,
	        attention_type: str = "default",
	        positional_embeddings: Optional[str] = None,
	        num_positional_embeddings: Optional[int] = None,
	        attention_mode: str = "xformers"
	    ):
	        super().__init__()
	        self.only_cross_attention = only_cross_attention

	        self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
	        self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
	        self.use_ada_layer_norm_single = norm_type == "ada_norm_single"
	        self.use_layer_norm = norm_type == "layer_norm"

	        if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
	            raise ValueError(
	                f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
	                f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
	            )

	        if positional_embeddings and (num_positional_embeddings is None):
	            raise ValueError(
	                "If `positional_embedding` type is defined, `num_positition_embeddings` must also be defined."
	            )

	        if positional_embeddings == "sinusoidal":
	            self.pos_embed = SinusoidalPositionalEmbedding(dim, max_seq_length=num_positional_embeddings)
	        else:
	            self.pos_embed = None

	        # Define 3 blocks. Each block has its own normalization layer.
	        # 1. Self-Attn
	        if self.use_ada_layer_norm:
	            self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
	        elif self.use_ada_layer_norm_zero:
	            self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
	        else:
	            self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)

	        self.attn1 = Attention(
	            query_dim=dim,
	            heads=num_attention_heads,
	            dim_head=attention_head_dim,
	            dropout=dropout,
	            bias=attention_bias,
	            cross_attention_dim=cross_attention_dim if only_cross_attention else None,
	            upcast_attention=upcast_attention,
	            attention_mode=attention_mode
	        )

	        # 2. Cross-Attn
	        if cross_attention_dim is not None or double_self_attention:
	            # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
	            # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
	            # the second cross attention block.
	            self.norm2 = (
	                AdaLayerNorm(dim, num_embeds_ada_norm)
	                if self.use_ada_layer_norm
	                else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
	            )
	            self.attn2 = Attention(
	                query_dim=dim,
	                cross_attention_dim=cross_attention_dim if not double_self_attention else None,
	                heads=num_attention_heads,
	                dim_head=attention_head_dim,
	                dropout=dropout,
	                bias=attention_bias,
	                upcast_attention=upcast_attention,
	                attention_mode='xformers',  # only xformers support attention_mask
	            )  # is self-attn if encoder_hidden_states is none
	        else:
	            self.norm2 = None
	            self.attn2 = None

	        # 3. Feed-forward (the "ada_norm_single" path reuses `norm2` in `forward`, so it gets no `norm3`)
	        if not self.use_ada_layer_norm_single:
	            self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)

	        self.ff = FeedForward(
	            dim,
	            dropout=dropout,
	            activation_fn=activation_fn,
	            final_dropout=final_dropout,
	        )

	        # 4. Fuser (only exists for the gated attention types)
	        if attention_type == "gated" or attention_type == "gated-text-image":
	            self.fuser = GatedSelfAttentionDense(dim, cross_attention_dim, num_attention_heads, attention_head_dim)

	        # 5. Scale-shift for PixArt-Alpha.
	        if self.use_ada_layer_norm_single:
	            self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5)

	        # let chunk size default to None
	        self._chunk_size = None
	        self._chunk_dim = 0

	    def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0):
	        """
	        Configure chunked feed-forward evaluation.

	        Args:
	            chunk_size (`int`, optional): Size of each chunk along `dim`; `None` disables chunking.
	            dim (`int`, defaults to 0): Axis along which the feed-forward input is split.
	        """
	        self._chunk_size = chunk_size
	        self._chunk_dim = dim

	    def forward(
	        self,
	        hidden_states: torch.FloatTensor,
	        attention_mask: Optional[torch.FloatTensor] = None,
	        encoder_hidden_states: Optional[torch.FloatTensor] = None,
	        encoder_attention_mask: Optional[torch.FloatTensor] = None,
	        timestep: Optional[torch.LongTensor] = None,
	        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
	        class_labels: Optional[torch.LongTensor] = None,
	    ) -> torch.FloatTensor:
	        """
	        Apply self-attention, optional cross-attention and feed-forward, each with a residual connection.

	        Args:
	            hidden_states: Input tokens; axis 0 is read as the batch size.
	            attention_mask: Mask forwarded to `attn1`.
	            encoder_hidden_states: Context for `attn2` (and for `attn1` when `only_cross_attention` is set).
	            encoder_attention_mask: Mask forwarded to `attn2`.
	            timestep: Conditioning for the adaptive norms. For `"ada_norm_single"` it is reshaped to
	                `(batch_size, 6, -1)` and added to `scale_shift_table`, so it is presumably an already projected
	                modulation tensor rather than a raw step index (verify against the caller).
	            cross_attention_kwargs: Extra kwargs for both attention layers. `"scale"` is also used as the LoRA
	                scale of the feed-forward and `"gligen"` is popped and routed to the fuser.
	            class_labels: Only used by the `"ada_norm_zero"` norm.

	        Returns:
	            The transformed hidden states.

	        Raises:
	            ValueError: On an unsupported norm configuration, on `gligen` inputs without a fuser, or when
	                `"ada_norm_single"` is used without a second attention layer (no `norm2`).
	        """
	        # Notice that normalization is always applied before the real computation in the following blocks.
	        # 0. Self-Attention
	        batch_size = hidden_states.shape[0]

	        if self.use_ada_layer_norm:
	            norm_hidden_states = self.norm1(hidden_states, timestep)
	        elif self.use_ada_layer_norm_zero:
	            norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
	                hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
	            )
	        elif self.use_layer_norm:
	            norm_hidden_states = self.norm1(hidden_states)
	        elif self.use_ada_layer_norm_single:
	            # (1, 6, dim) table + per-sample modulation, split into six tensors along axis 1.
	            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
	                self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1)
	            ).chunk(6, dim=1)
	            norm_hidden_states = self.norm1(hidden_states)
	            norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
	            # Drops axis 1 only if it has size 1; otherwise this is a no-op.
	            norm_hidden_states = norm_hidden_states.squeeze(1)
	        else:
	            raise ValueError("Incorrect norm used")

	        if self.pos_embed is not None:
	            norm_hidden_states = self.pos_embed(norm_hidden_states)

	        # 1. Retrieve lora scale.
	        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0

	        # 2. Prepare GLIGEN inputs (copy first so the caller's dict is not mutated by `pop`)
	        cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
	        gligen_kwargs = cross_attention_kwargs.pop("gligen", None)

	        attn_output = self.attn1(
	            norm_hidden_states,
	            encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
	            attention_mask=attention_mask,
	            **cross_attention_kwargs,
	        )
	        if self.use_ada_layer_norm_zero:
	            attn_output = gate_msa.unsqueeze(1) * attn_output
	        elif self.use_ada_layer_norm_single:
	            attn_output = gate_msa * attn_output

	        hidden_states = attn_output + hidden_states
	        if hidden_states.ndim == 4:
	            hidden_states = hidden_states.squeeze(1)

	        # 2.5 GLIGEN Control
	        if gligen_kwargs is not None:
	            # The fuser only exists for gated attention types; fail with a clear message instead of an
	            # AttributeError raised from inside nn.Module.
	            if getattr(self, "fuser", None) is None:
	                raise ValueError(
	                    "`gligen` inputs were passed through `cross_attention_kwargs`, but this block was built "
	                    "without a fuser. Set `attention_type` to 'gated' or 'gated-text-image'."
	                )
	            hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"])

	        # 3. Cross-Attention
	        if self.attn2 is not None:
	            if self.use_ada_layer_norm:
	                norm_hidden_states = self.norm2(hidden_states, timestep)
	            elif self.use_ada_layer_norm_zero or self.use_layer_norm:
	                norm_hidden_states = self.norm2(hidden_states)
	            elif self.use_ada_layer_norm_single:
	                # For PixArt norm2 isn't applied here:
	                # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
	                norm_hidden_states = hidden_states
	            else:
	                raise ValueError("Incorrect norm")

	            if self.pos_embed is not None and self.use_ada_layer_norm_single is False:
	                norm_hidden_states = self.pos_embed(norm_hidden_states)

	            attn_output = self.attn2(
	                norm_hidden_states,
	                encoder_hidden_states=encoder_hidden_states,
	                attention_mask=encoder_attention_mask,
	                **cross_attention_kwargs,
	            )
	            hidden_states = attn_output + hidden_states

	        # 4. Feed-forward
	        if not self.use_ada_layer_norm_single:
	            norm_hidden_states = self.norm3(hidden_states)

	        if self.use_ada_layer_norm_zero:
	            norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]

	        if self.use_ada_layer_norm_single:
	            # `norm2` is None when no second attention layer was configured; calling it would fail with an
	            # opaque "'NoneType' object is not callable".
	            if self.norm2 is None:
	                raise ValueError(
	                    "`norm_type='ada_norm_single'` normalizes the feed-forward input with `norm2`, which is only "
	                    "created when `cross_attention_dim` is set or `double_self_attention=True`."
	                )
	            norm_hidden_states = self.norm2(hidden_states)
	            norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp

	        if self._chunk_size is not None:
	            # "feed_forward_chunk_size" can be used to save memory
	            # NOTE(review): `_chunked_feed_forward` is not defined in this class; presumably it is defined or
	            # imported elsewhere in this module - confirm, otherwise this branch raises NameError.
	            ff_output = _chunked_feed_forward(
	                self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size, lora_scale=lora_scale
	            )
	        else:
	            ff_output = self.ff(norm_hidden_states, scale=lora_scale)

	        if self.use_ada_layer_norm_zero:
	            ff_output = gate_mlp.unsqueeze(1) * ff_output
	        elif self.use_ada_layer_norm_single:
	            ff_output = gate_mlp * ff_output

	        hidden_states = ff_output + hidden_states
	        if hidden_states.ndim == 4:
	            hidden_states = hidden_states.squeeze(1)

	        return hidden_states

	class AdaLayerNormSingle(nn.Module):
	    r"""
	    Norm layer adaptive layer norm single (adaLN-single).

	    As proposed in PixArt-Alpha (see: https://arxiv.org/abs/2310.00426; Section 2.3).

	    Parameters:
	        embedding_dim (`int`): The size of each embedding vector.
	        use_additional_conditions (`bool`): To use additional conditions for normalization or not.
	    """

	    def __init__(self, embedding_dim: int, use_additional_conditions: bool = False):
	        super().__init__()

	        self.emb = CombinedTimestepSizeEmbeddings(
	            embedding_dim, size_emb_dim=embedding_dim // 3, use_additional_conditions=use_additional_conditions
	        )

	        self.silu = nn.SiLU()
	        # One projection yields all six modulation vectors at once (6 * embedding_dim outputs).
	        self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=True)

	    def forward(
	        self,
	        timestep: torch.Tensor,
	        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
	        batch_size: Optional[int] = None,
	        hidden_dtype: Optional[torch.dtype] = None,
	    ) -> Tuple[torch.Tensor, torch.Tensor]:
	        """
	        Embed `timestep` and project it to the modulation parameters.

	        Args:
	            timestep (`torch.Tensor`): Timesteps handed to the embedding module.
	            added_cond_kwargs (`dict`, optional): Accepted for interface compatibility but not used; `resolution`
	                and `aspect_ratio` are always passed to the embedding as `None`.
	            batch_size (`int`, optional): Forwarded to the embedding module.
	            hidden_dtype (`torch.dtype`, optional): Forwarded to the embedding module.

	        Returns:
	            A 2-tuple `(modulation, embedded_timestep)`: the `linear(silu(embedded_timestep))` projection and
	            the embedding it was computed from.
	        """
	        # No modulation happening here.
	        embedded_timestep = self.emb(
	            timestep, batch_size=batch_size, hidden_dtype=hidden_dtype, resolution=None, aspect_ratio=None
	        )
	        return self.linear(self.silu(embedded_timestep)), embedded_timestep


	@dataclass
	class Transformer3DModelOutput(BaseOutput):
	"""
	Output container exposing the `sample` tensor returned by a transformer model.

	NOTE(review): the description below was written for [`Transformer2DModel`] (image-shaped `sample`). The class
	name suggests it is used by a 3D transformer, so the documented shape may not match - confirm against the
	model that returns this object.

	Args:
	sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete):
	The hidden states output conditioned on the `encoder_hidden_states` input. If discrete, returns probability
	distributions for the unnoised latent pixels.
	"""

	# Output tensor of the model; see the shape caveat in the class docstring.
	sample: torch.FloatTensor