Spaces:

gokaygokay
/

Hunyuan3D-1.0

Running on Zero

File size: 15,982 Bytes
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import torch
from transformers import CLIPTextModelWithProjection, CLIPTokenizer

from ...image_processor import VaeImageProcessor
from ...models import UVit2DModel, VQModel
from ...schedulers import AmusedScheduler
from ...utils import replace_example_docstring
from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput


EXAMPLE_DOC_STRING = """

    Examples:

        ```py

        >>> import torch

        >>> from diffusers import AmusedPipeline



        >>> pipe = AmusedPipeline.from_pretrained("amused/amused-512", variant="fp16", torch_dtype=torch.float16)

        >>> pipe = pipe.to("cuda")



        >>> prompt = "a photo of an astronaut riding a horse on mars"

        >>> image = pipe(prompt).images[0]

        ```

"""


class AmusedPipeline(DiffusionPipeline):
    image_processor: VaeImageProcessor
    vqvae: VQModel
    tokenizer: CLIPTokenizer
    text_encoder: CLIPTextModelWithProjection
    transformer: UVit2DModel
    scheduler: AmusedScheduler

    model_cpu_offload_seq = "text_encoder->transformer->vqvae"

    def __init__(

        self,

        vqvae: VQModel,

        tokenizer: CLIPTokenizer,

        text_encoder: CLIPTextModelWithProjection,

        transformer: UVit2DModel,

        scheduler: AmusedScheduler,

    ):
        super().__init__()

        self.register_modules(
            vqvae=vqvae,
            tokenizer=tokenizer,
            text_encoder=text_encoder,
            transformer=transformer,
            scheduler=scheduler,
        )
        self.vae_scale_factor = 2 ** (len(self.vqvae.config.block_out_channels) - 1)
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_normalize=False)

    @torch.no_grad()
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(

        self,

        prompt: Optional[Union[List[str], str]] = None,

        height: Optional[int] = None,

        width: Optional[int] = None,

        num_inference_steps: int = 12,

        guidance_scale: float = 10.0,

        negative_prompt: Optional[Union[str, List[str]]] = None,

        num_images_per_prompt: Optional[int] = 1,

        generator: Optional[torch.Generator] = None,

        latents: Optional[torch.IntTensor] = None,

        prompt_embeds: Optional[torch.Tensor] = None,

        encoder_hidden_states: Optional[torch.Tensor] = None,

        negative_prompt_embeds: Optional[torch.Tensor] = None,

        negative_encoder_hidden_states: Optional[torch.Tensor] = None,

        output_type="pil",

        return_dict: bool = True,

        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,

        callback_steps: int = 1,

        cross_attention_kwargs: Optional[Dict[str, Any]] = None,

        micro_conditioning_aesthetic_score: int = 6,

        micro_conditioning_crop_coord: Tuple[int, int] = (0, 0),

        temperature: Union[int, Tuple[int, int], List[int]] = (2, 0),

    ):
        """

        The call function to the pipeline for generation.



        Args:

            prompt (`str` or `List[str]`, *optional*):

                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.

            height (`int`, *optional*, defaults to `self.transformer.config.sample_size * self.vae_scale_factor`):

                The height in pixels of the generated image.

            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):

                The width in pixels of the generated image.

            num_inference_steps (`int`, *optional*, defaults to 16):

                The number of denoising steps. More denoising steps usually lead to a higher quality image at the

                expense of slower inference.

            guidance_scale (`float`, *optional*, defaults to 10.0):

                A higher guidance scale value encourages the model to generate images closely linked to the text

                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.

            negative_prompt (`str` or `List[str]`, *optional*):

                The prompt or prompts to guide what to not include in image generation. If not defined, you need to

                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).

            num_images_per_prompt (`int`, *optional*, defaults to 1):

                The number of images to generate per prompt.

            generator (`torch.Generator`, *optional*):

                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make

                generation deterministic.

            latents (`torch.IntTensor`, *optional*):

                Pre-generated tokens representing latent vectors in `self.vqvae`, to be used as inputs for image

                gneration. If not provided, the starting latents will be completely masked.

            prompt_embeds (`torch.FloatTensor`, *optional*):

                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not

                provided, text embeddings are generated from the `prompt` input argument. A single vector from the

                pooled and projected final hidden states.

            encoder_hidden_states (`torch.FloatTensor`, *optional*):

                Pre-generated penultimate hidden states from the text encoder providing additional text conditioning.

            negative_prompt_embeds (`torch.FloatTensor`, *optional*):

                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If

                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.

            negative_encoder_hidden_states (`torch.FloatTensor`, *optional*):

                Analogous to `encoder_hidden_states` for the positive prompt.

            output_type (`str`, *optional*, defaults to `"pil"`):

                The output format of the generated image. Choose between `PIL.Image` or `np.array`.

            return_dict (`bool`, *optional*, defaults to `True`):

                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a

                plain tuple.

            callback (`Callable`, *optional*):

                A function that calls every `callback_steps` steps during inference. The function is called with the

                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.

            callback_steps (`int`, *optional*, defaults to 1):

                The frequency at which the `callback` function is called. If not specified, the callback is called at

                every step.

            cross_attention_kwargs (`dict`, *optional*):

                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in

                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).

            micro_conditioning_aesthetic_score (`int`, *optional*, defaults to 6):

                The targeted aesthetic score according to the laion aesthetic classifier. See

                https://laion.ai/blog/laion-aesthetics/ and the micro-conditioning section of

                https://arxiv.org/abs/2307.01952.

            micro_conditioning_crop_coord (`Tuple[int]`, *optional*, defaults to (0, 0)):

                The targeted height, width crop coordinates. See the micro-conditioning section of

                https://arxiv.org/abs/2307.01952.

            temperature (`Union[int, Tuple[int, int], List[int]]`, *optional*, defaults to (2, 0)):

                Configures the temperature scheduler on `self.scheduler` see `AmusedScheduler#set_timesteps`.



        Examples:



        Returns:

            [`~pipelines.pipeline_utils.ImagePipelineOutput`] or `tuple`:

                If `return_dict` is `True`, [`~pipelines.pipeline_utils.ImagePipelineOutput`] is returned, otherwise a

                `tuple` is returned where the first element is a list with the generated images.

        """
        if (prompt_embeds is not None and encoder_hidden_states is None) or (
            prompt_embeds is None and encoder_hidden_states is not None
        ):
            raise ValueError("pass either both `prompt_embeds` and `encoder_hidden_states` or neither")

        if (negative_prompt_embeds is not None and negative_encoder_hidden_states is None) or (
            negative_prompt_embeds is None and negative_encoder_hidden_states is not None
        ):
            raise ValueError(
                "pass either both `negatve_prompt_embeds` and `negative_encoder_hidden_states` or neither"
            )

        if (prompt is None and prompt_embeds is None) or (prompt is not None and prompt_embeds is not None):
            raise ValueError("pass only one of `prompt` or `prompt_embeds`")

        if isinstance(prompt, str):
            prompt = [prompt]

        if prompt is not None:
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        batch_size = batch_size * num_images_per_prompt

        if height is None:
            height = self.transformer.config.sample_size * self.vae_scale_factor

        if width is None:
            width = self.transformer.config.sample_size * self.vae_scale_factor

        if prompt_embeds is None:
            input_ids = self.tokenizer(
                prompt,
                return_tensors="pt",
                padding="max_length",
                truncation=True,
                max_length=self.tokenizer.model_max_length,
            ).input_ids.to(self._execution_device)

            outputs = self.text_encoder(input_ids, return_dict=True, output_hidden_states=True)
            prompt_embeds = outputs.text_embeds
            encoder_hidden_states = outputs.hidden_states[-2]

        prompt_embeds = prompt_embeds.repeat(num_images_per_prompt, 1)
        encoder_hidden_states = encoder_hidden_states.repeat(num_images_per_prompt, 1, 1)

        if guidance_scale > 1.0:
            if negative_prompt_embeds is None:
                if negative_prompt is None:
                    negative_prompt = [""] * len(prompt)

                if isinstance(negative_prompt, str):
                    negative_prompt = [negative_prompt]

                input_ids = self.tokenizer(
                    negative_prompt,
                    return_tensors="pt",
                    padding="max_length",
                    truncation=True,
                    max_length=self.tokenizer.model_max_length,
                ).input_ids.to(self._execution_device)

                outputs = self.text_encoder(input_ids, return_dict=True, output_hidden_states=True)
                negative_prompt_embeds = outputs.text_embeds
                negative_encoder_hidden_states = outputs.hidden_states[-2]

            negative_prompt_embeds = negative_prompt_embeds.repeat(num_images_per_prompt, 1)
            negative_encoder_hidden_states = negative_encoder_hidden_states.repeat(num_images_per_prompt, 1, 1)

            prompt_embeds = torch.concat([negative_prompt_embeds, prompt_embeds])
            encoder_hidden_states = torch.concat([negative_encoder_hidden_states, encoder_hidden_states])

        # Note that the micro conditionings _do_ flip the order of width, height for the original size
        # and the crop coordinates. This is how it was done in the original code base
        micro_conds = torch.tensor(
            [
                width,
                height,
                micro_conditioning_crop_coord[0],
                micro_conditioning_crop_coord[1],
                micro_conditioning_aesthetic_score,
            ],
            device=self._execution_device,
            dtype=encoder_hidden_states.dtype,
        )
        micro_conds = micro_conds.unsqueeze(0)
        micro_conds = micro_conds.expand(2 * batch_size if guidance_scale > 1.0 else batch_size, -1)

        shape = (batch_size, height // self.vae_scale_factor, width // self.vae_scale_factor)

        if latents is None:
            latents = torch.full(
                shape, self.scheduler.config.mask_token_id, dtype=torch.long, device=self._execution_device
            )

        self.scheduler.set_timesteps(num_inference_steps, temperature, self._execution_device)

        num_warmup_steps = len(self.scheduler.timesteps) - num_inference_steps * self.scheduler.order
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, timestep in enumerate(self.scheduler.timesteps):
                if guidance_scale > 1.0:
                    model_input = torch.cat([latents] * 2)
                else:
                    model_input = latents

                model_output = self.transformer(
                    model_input,
                    micro_conds=micro_conds,
                    pooled_text_emb=prompt_embeds,
                    encoder_hidden_states=encoder_hidden_states,
                    cross_attention_kwargs=cross_attention_kwargs,
                )

                if guidance_scale > 1.0:
                    uncond_logits, cond_logits = model_output.chunk(2)
                    model_output = uncond_logits + guidance_scale * (cond_logits - uncond_logits)

                latents = self.scheduler.step(
                    model_output=model_output,
                    timestep=timestep,
                    sample=latents,
                    generator=generator,
                ).prev_sample

                if i == len(self.scheduler.timesteps) - 1 or (
                    (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
                ):
                    progress_bar.update()
                    if callback is not None and i % callback_steps == 0:
                        step_idx = i // getattr(self.scheduler, "order", 1)
                        callback(step_idx, timestep, latents)

        if output_type == "latent":
            output = latents
        else:
            needs_upcasting = self.vqvae.dtype == torch.float16 and self.vqvae.config.force_upcast

            if needs_upcasting:
                self.vqvae.float()

            output = self.vqvae.decode(
                latents,
                force_not_quantize=True,
                shape=(
                    batch_size,
                    height // self.vae_scale_factor,
                    width // self.vae_scale_factor,
                    self.vqvae.config.latent_channels,
                ),
            ).sample.clip(0, 1)
            output = self.image_processor.postprocess(output, output_type)

            if needs_upcasting:
                self.vqvae.half()

        self.maybe_free_model_hooks()

        if not return_dict:
            return (output,)

        return ImagePipelineOutput(output)