jbilcke-hf committed
Commit 69f3483
1 Parent(s): e08f02e
streamv2v/__init__.py ADDED
@@ -0,0 +1 @@
+ from .pipeline import StreamV2V
streamv2v/acceleration/__init__.py ADDED
File without changes
streamv2v/acceleration/sfast/__init__.py ADDED
@@ -0,0 +1,33 @@
+ from typing import Optional
+
+ from sfast.compilers.stable_diffusion_pipeline_compiler import CompilationConfig, compile
+
+ from ...pipeline import StreamV2V
+
+
+ def accelerate_with_stable_fast(
+     stream: StreamV2V,
+     config: Optional[CompilationConfig] = None,
+ ):
+     if config is None:
+         config = CompilationConfig.Default()
+     # xformers and Triton are suggested for achieving best performance.
+     try:
+         import xformers
+
+         config.enable_xformers = True
+     except ImportError:
+         print("xformers not installed, skip")
+     try:
+         import triton
+
+         config.enable_triton = True
+     except ImportError:
+         print("Triton not installed, skip")
+     # CUDA Graph is suggested for small batch sizes and small resolutions to reduce CPU overhead.
+     config.enable_cuda_graph = True
+     stream.pipe = compile(stream.pipe, config)
+     stream.unet = stream.pipe.unet
+     stream.vae = stream.pipe.vae
+     stream.text_encoder = stream.pipe.text_encoder
+     return stream
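
A minimal usage sketch for the helper above, assuming a `StreamV2V` wrapper named `stream` has already been built elsewhere (that variable is not part of this commit):

    from streamv2v.acceleration.sfast import accelerate_with_stable_fast

    # `stream` is an already-built StreamV2V wrapper (assumed); config=None picks CompilationConfig.Default()
    stream = accelerate_with_stable_fast(stream)
    # stream.pipe is now compiled; stream.unet / stream.vae / stream.text_encoder point into it.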
streamv2v/acceleration/tensorrt/__init__.py ADDED
@@ -0,0 +1,188 @@
+ import gc
+ import os
+
+ import torch
+ from diffusers import AutoencoderKL, UNet2DConditionModel
+ from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img import (
+     retrieve_latents,
+ )
+ from polygraphy import cuda
+
+ from ...pipeline import StreamV2V
+ from .builder import EngineBuilder, create_onnx_path
+ from .engine import AutoencoderKLEngine, UNet2DConditionModelEngine
+ from .models import VAE, BaseModel, UNet, VAEEncoder
+
+
+ class TorchVAEEncoder(torch.nn.Module):
+     def __init__(self, vae: AutoencoderKL):
+         super().__init__()
+         self.vae = vae
+
+     def forward(self, x: torch.Tensor):
+         return retrieve_latents(self.vae.encode(x))
+
+
+ def compile_vae_encoder(
+     vae: TorchVAEEncoder,
+     model_data: BaseModel,
+     onnx_path: str,
+     onnx_opt_path: str,
+     engine_path: str,
+     opt_batch_size: int = 1,
+     engine_build_options: dict = {},
+ ):
+     builder = EngineBuilder(model_data, vae, device=torch.device("cuda"))
+     builder.build(
+         onnx_path,
+         onnx_opt_path,
+         engine_path,
+         opt_batch_size=opt_batch_size,
+         **engine_build_options,
+     )
+
+
+ def compile_vae_decoder(
+     vae: AutoencoderKL,
+     model_data: BaseModel,
+     onnx_path: str,
+     onnx_opt_path: str,
+     engine_path: str,
+     opt_batch_size: int = 1,
+     engine_build_options: dict = {},
+ ):
+     vae = vae.to(torch.device("cuda"))
+     builder = EngineBuilder(model_data, vae, device=torch.device("cuda"))
+     builder.build(
+         onnx_path,
+         onnx_opt_path,
+         engine_path,
+         opt_batch_size=opt_batch_size,
+         **engine_build_options,
+     )
+
+
+ def compile_unet(
+     unet: UNet2DConditionModel,
+     model_data: BaseModel,
+     onnx_path: str,
+     onnx_opt_path: str,
+     engine_path: str,
+     opt_batch_size: int = 1,
+     engine_build_options: dict = {},
+ ):
+     unet = unet.to(torch.device("cuda"), dtype=torch.float16)
+     builder = EngineBuilder(model_data, unet, device=torch.device("cuda"))
+     builder.build(
+         onnx_path,
+         onnx_opt_path,
+         engine_path,
+         opt_batch_size=opt_batch_size,
+         **engine_build_options,
+     )
+
+
+ def accelerate_with_tensorrt(
+     stream: StreamV2V,
+     engine_dir: str,
+     max_batch_size: int = 2,
+     min_batch_size: int = 1,
+     use_cuda_graph: bool = False,
+     engine_build_options: dict = {},
+ ):
+     if "opt_batch_size" not in engine_build_options or engine_build_options["opt_batch_size"] is None:
+         engine_build_options["opt_batch_size"] = max_batch_size
+     text_encoder = stream.text_encoder
+     unet = stream.unet
+     vae = stream.vae
+
+     del stream.unet, stream.vae, stream.pipe.unet, stream.pipe.vae
+
+     vae_config = vae.config
+     vae_dtype = vae.dtype
+
+     unet.to(torch.device("cpu"))
+     vae.to(torch.device("cpu"))
+
+     gc.collect()
+     torch.cuda.empty_cache()
+
+     onnx_dir = os.path.join(engine_dir, "onnx")
+     os.makedirs(onnx_dir, exist_ok=True)
+
+     unet_engine_path = f"{engine_dir}/unet.engine"
+     vae_encoder_engine_path = f"{engine_dir}/vae_encoder.engine"
+     vae_decoder_engine_path = f"{engine_dir}/vae_decoder.engine"
+
+     unet_model = UNet(
+         fp16=True,
+         device=stream.device,
+         max_batch_size=max_batch_size,
+         min_batch_size=min_batch_size,
+         embedding_dim=text_encoder.config.hidden_size,
+         unet_dim=unet.config.in_channels,
+     )
+     vae_decoder_model = VAE(
+         device=stream.device,
+         max_batch_size=max_batch_size,
+         min_batch_size=min_batch_size,
+     )
+     vae_encoder_model = VAEEncoder(
+         device=stream.device,
+         max_batch_size=max_batch_size,
+         min_batch_size=min_batch_size,
+     )
+
+     if not os.path.exists(unet_engine_path):
+         compile_unet(
+             unet,
+             unet_model,
+             create_onnx_path("unet", onnx_dir, opt=False),
+             create_onnx_path("unet", onnx_dir, opt=True),
+             unet_engine_path,
+             **engine_build_options,
+         )
+     else:
+         del unet
+
+     if not os.path.exists(vae_decoder_engine_path):
+         vae.forward = vae.decode
+         compile_vae_decoder(
+             vae,
+             vae_decoder_model,
+             create_onnx_path("vae_decoder", onnx_dir, opt=False),
+             create_onnx_path("vae_decoder", onnx_dir, opt=True),
+             vae_decoder_engine_path,
+             **engine_build_options,
+         )
+
+     if not os.path.exists(vae_encoder_engine_path):
+         vae_encoder = TorchVAEEncoder(vae).to(torch.device("cuda"))
+         compile_vae_encoder(
+             vae_encoder,
+             vae_encoder_model,
+             create_onnx_path("vae_encoder", onnx_dir, opt=False),
+             create_onnx_path("vae_encoder", onnx_dir, opt=True),
+             vae_encoder_engine_path,
+             **engine_build_options,
+         )
+
+     del vae
+
+     cuda_stream = cuda.Stream()
+
+     stream.unet = UNet2DConditionModelEngine(unet_engine_path, cuda_stream, use_cuda_graph=use_cuda_graph)
+     stream.vae = AutoencoderKLEngine(
+         vae_encoder_engine_path,
+         vae_decoder_engine_path,
+         cuda_stream,
+         stream.pipe.vae_scale_factor,
+         use_cuda_graph=use_cuda_graph,
+     )
+     setattr(stream.vae, "config", vae_config)
+     setattr(stream.vae, "dtype", vae_dtype)
+
+     gc.collect()
+     torch.cuda.empty_cache()
+
+     return stream
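
A minimal usage sketch for `accelerate_with_tensorrt`, assuming an already-built `StreamV2V` wrapper named `stream` and a writable engine directory (both assumptions, not part of this commit). The first call exports ONNX and builds the TensorRT engines; later calls reuse the cached `.engine` files found in `engine_dir`:

    from streamv2v.acceleration.tensorrt import accelerate_with_tensorrt

    stream = accelerate_with_tensorrt(
        stream,                 # existing StreamV2V wrapper (assumed)
        engine_dir="engines",   # ONNX files land in engines/onnx, engines in engines/*.engine
        max_batch_size=2,
        use_cuda_graph=False,
    )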
streamv2v/acceleration/tensorrt/builder.py ADDED
@@ -0,0 +1,94 @@
+ import gc
+ import os
+ from typing import *
+
+ import torch
+
+ from .models import BaseModel
+ from .utilities import (
+     build_engine,
+     export_onnx,
+     optimize_onnx,
+ )
+
+
+ def create_onnx_path(name, onnx_dir, opt=True):
+     return os.path.join(onnx_dir, name + (".opt" if opt else "") + ".onnx")
+
+
+ class EngineBuilder:
+     def __init__(
+         self,
+         model: BaseModel,
+         network: Any,
+         device=torch.device("cuda"),
+     ):
+         self.device = device
+
+         self.model = model
+         self.network = network
+
+     def build(
+         self,
+         onnx_path: str,
+         onnx_opt_path: str,
+         engine_path: str,
+         opt_image_height: int = 512,
+         opt_image_width: int = 512,
+         opt_batch_size: int = 1,
+         min_image_resolution: int = 256,
+         max_image_resolution: int = 1024,
+         build_enable_refit: bool = False,
+         build_static_batch: bool = False,
+         build_dynamic_shape: bool = False,
+         build_all_tactics: bool = False,
+         onnx_opset: int = 17,
+         force_engine_build: bool = False,
+         force_onnx_export: bool = False,
+         force_onnx_optimize: bool = False,
+     ):
+         if not force_onnx_export and os.path.exists(onnx_path):
+             print(f"Found cached model: {onnx_path}")
+         else:
+             print(f"Exporting model: {onnx_path}")
+             export_onnx(
+                 self.network,
+                 onnx_path=onnx_path,
+                 model_data=self.model,
+                 opt_image_height=opt_image_height,
+                 opt_image_width=opt_image_width,
+                 opt_batch_size=opt_batch_size,
+                 onnx_opset=onnx_opset,
+             )
+             del self.network
+             gc.collect()
+             torch.cuda.empty_cache()
+         if not force_onnx_optimize and os.path.exists(onnx_opt_path):
+             print(f"Found cached model: {onnx_opt_path}")
+         else:
+             print(f"Generating optimized model: {onnx_opt_path}")
+             optimize_onnx(
+                 onnx_path=onnx_path,
+                 onnx_opt_path=onnx_opt_path,
+                 model_data=self.model,
+             )
+         self.model.min_latent_shape = min_image_resolution // 8
+         self.model.max_latent_shape = max_image_resolution // 8
+         if not force_engine_build and os.path.exists(engine_path):
+             print(f"Found cached engine: {engine_path}")
+         else:
+             build_engine(
+                 engine_path=engine_path,
+                 onnx_opt_path=onnx_opt_path,
+                 model_data=self.model,
+                 opt_image_height=opt_image_height,
+                 opt_image_width=opt_image_width,
+                 opt_batch_size=opt_batch_size,
+                 build_static_batch=build_static_batch,
+                 build_dynamic_shape=build_dynamic_shape,
+                 build_all_tactics=build_all_tactics,
+                 build_enable_refit=build_enable_refit,
+             )
+
+         gc.collect()
+         torch.cuda.empty_cache()
streamv2v/acceleration/tensorrt/engine.py ADDED
@@ -0,0 +1,123 @@
+ from typing import *
+
+ import torch
+ from diffusers.models.autoencoder_tiny import AutoencoderTinyOutput
+ from diffusers.models.unet_2d_condition import UNet2DConditionOutput
+ from diffusers.models.vae import DecoderOutput
+ from polygraphy import cuda
+
+ from .utilities import Engine
+
+
+ class UNet2DConditionModelEngine:
+     def __init__(self, filepath: str, stream: cuda.Stream, use_cuda_graph: bool = False):
+         self.engine = Engine(filepath)
+         self.stream = stream
+         self.use_cuda_graph = use_cuda_graph
+
+         self.engine.load()
+         self.engine.activate()
+
+     def __call__(
+         self,
+         latent_model_input: torch.Tensor,
+         timestep: torch.Tensor,
+         encoder_hidden_states: torch.Tensor,
+         **kwargs,
+     ) -> Any:
+         if timestep.dtype != torch.float32:
+             timestep = timestep.float()
+
+         self.engine.allocate_buffers(
+             shape_dict={
+                 "sample": latent_model_input.shape,
+                 "timestep": timestep.shape,
+                 "encoder_hidden_states": encoder_hidden_states.shape,
+                 "latent": latent_model_input.shape,
+             },
+             device=latent_model_input.device,
+         )
+
+         noise_pred = self.engine.infer(
+             {
+                 "sample": latent_model_input,
+                 "timestep": timestep,
+                 "encoder_hidden_states": encoder_hidden_states,
+             },
+             self.stream,
+             use_cuda_graph=self.use_cuda_graph,
+         )["latent"]
+         return UNet2DConditionOutput(sample=noise_pred)
+
+     def to(self, *args, **kwargs):
+         pass
+
+     def forward(self, *args, **kwargs):
+         pass
+
+
+ class AutoencoderKLEngine:
+     def __init__(
+         self,
+         encoder_path: str,
+         decoder_path: str,
+         stream: cuda.Stream,
+         scaling_factor: int,
+         use_cuda_graph: bool = False,
+     ):
+         self.encoder = Engine(encoder_path)
+         self.decoder = Engine(decoder_path)
+         self.stream = stream
+         self.vae_scale_factor = scaling_factor
+         self.use_cuda_graph = use_cuda_graph
+
+         self.encoder.load()
+         self.decoder.load()
+         self.encoder.activate()
+         self.decoder.activate()
+
+     def encode(self, images: torch.Tensor, **kwargs):
+         self.encoder.allocate_buffers(
+             shape_dict={
+                 "images": images.shape,
+                 "latent": (
+                     images.shape[0],
+                     4,
+                     images.shape[2] // self.vae_scale_factor,
+                     images.shape[3] // self.vae_scale_factor,
+                 ),
+             },
+             device=images.device,
+         )
+         latents = self.encoder.infer(
+             {"images": images},
+             self.stream,
+             use_cuda_graph=self.use_cuda_graph,
+         )["latent"]
+         return AutoencoderTinyOutput(latents=latents)
+
+     def decode(self, latent: torch.Tensor, **kwargs):
+         self.decoder.allocate_buffers(
+             shape_dict={
+                 "latent": latent.shape,
+                 "images": (
+                     latent.shape[0],
+                     3,
+                     latent.shape[2] * self.vae_scale_factor,
+                     latent.shape[3] * self.vae_scale_factor,
+                 ),
+             },
+             device=latent.device,
+         )
+         images = self.decoder.infer(
+             {"latent": latent},
+             self.stream,
+             use_cuda_graph=self.use_cuda_graph,
+         )["images"]
+         return DecoderOutput(sample=images)
+
+     def to(self, *args, **kwargs):
+         pass
+
+     def forward(self, *args, **kwargs):
+         pass
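
These engine classes stand in for the diffusers modules they replace: `UNet2DConditionModelEngine` keeps the UNet's `(sample, timestep, encoder_hidden_states)` call shape and wraps the TensorRT result in `UNet2DConditionOutput`. A rough call sketch, with the tensor shapes (batch 2, 4 latent channels, 64x64 latents for 512x512 images, 77 tokens, 768-dim embeddings) and a pre-built `trt_unet` instance assumed for illustration:

    import torch

    latents = torch.randn(2, 4, 64, 64, dtype=torch.float16, device="cuda")
    timestep = torch.tensor([999.0, 999.0], device="cuda")
    text_emb = torch.randn(2, 77, 768, dtype=torch.float16, device="cuda")

    # trt_unet: a UNet2DConditionModelEngine built from an existing unet.engine (assumed)
    noise_pred = trt_unet(latents, timestep, text_emb).sample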
streamv2v/acceleration/tensorrt/models.py ADDED
@@ -0,0 +1,434 @@
+ #! fork: https://github.com/NVIDIA/TensorRT/blob/main/demo/Diffusion/models.py
+
+ #
+ # SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ import onnx_graphsurgeon as gs
+ import torch
+ from onnx import shape_inference
+ from polygraphy.backend.onnx.loader import fold_constants
+
+
+ class Optimizer:
+     def __init__(self, onnx_graph, verbose=False):
+         self.graph = gs.import_onnx(onnx_graph)
+         self.verbose = verbose
+
+     def info(self, prefix):
+         if self.verbose:
+             print(
+                 f"{prefix} .. {len(self.graph.nodes)} nodes, {len(self.graph.tensors().keys())} tensors, {len(self.graph.inputs)} inputs, {len(self.graph.outputs)} outputs"
+             )
+
+     def cleanup(self, return_onnx=False):
+         self.graph.cleanup().toposort()
+         if return_onnx:
+             return gs.export_onnx(self.graph)
+
+     def select_outputs(self, keep, names=None):
+         self.graph.outputs = [self.graph.outputs[o] for o in keep]
+         if names:
+             for i, name in enumerate(names):
+                 self.graph.outputs[i].name = name
+
+     def fold_constants(self, return_onnx=False):
+         onnx_graph = fold_constants(gs.export_onnx(self.graph), allow_onnxruntime_shape_inference=True)
+         self.graph = gs.import_onnx(onnx_graph)
+         if return_onnx:
+             return onnx_graph
+
+     def infer_shapes(self, return_onnx=False):
+         onnx_graph = gs.export_onnx(self.graph)
+         if onnx_graph.ByteSize() > 2147483648:
+             raise TypeError("ERROR: model size exceeds supported 2GB limit")
+         else:
+             onnx_graph = shape_inference.infer_shapes(onnx_graph)
+
+         self.graph = gs.import_onnx(onnx_graph)
+         if return_onnx:
+             return onnx_graph
+
+
+ class BaseModel:
+     def __init__(
+         self,
+         fp16=False,
+         device="cuda",
+         verbose=True,
+         max_batch_size=16,
+         min_batch_size=1,
+         embedding_dim=768,
+         text_maxlen=77,
+     ):
+         self.name = "SD Model"
+         self.fp16 = fp16
+         self.device = device
+         self.verbose = verbose
+
+         self.min_batch = min_batch_size
+         self.max_batch = max_batch_size
+         self.min_image_shape = 256  # min image resolution: 256x256
+         self.max_image_shape = 1024  # max image resolution: 1024x1024
+         self.min_latent_shape = self.min_image_shape // 8
+         self.max_latent_shape = self.max_image_shape // 8
+
+         self.embedding_dim = embedding_dim
+         self.text_maxlen = text_maxlen
+
+     def get_model(self):
+         pass
+
+     def get_input_names(self):
+         pass
+
+     def get_output_names(self):
+         pass
+
+     def get_dynamic_axes(self):
+         return None
+
+     def get_sample_input(self, batch_size, image_height, image_width):
+         pass
+
+     def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape):
+         return None
+
+     def get_shape_dict(self, batch_size, image_height, image_width):
+         return None
+
+     def optimize(self, onnx_graph):
+         opt = Optimizer(onnx_graph, verbose=self.verbose)
+         opt.info(self.name + ": original")
+         opt.cleanup()
+         opt.info(self.name + ": cleanup")
+         opt.fold_constants()
+         opt.info(self.name + ": fold constants")
+         opt.infer_shapes()
+         opt.info(self.name + ": shape inference")
+         onnx_opt_graph = opt.cleanup(return_onnx=True)
+         opt.info(self.name + ": finished")
+         return onnx_opt_graph
+
+     def check_dims(self, batch_size, image_height, image_width):
+         assert batch_size >= self.min_batch and batch_size <= self.max_batch
+         assert image_height % 8 == 0 or image_width % 8 == 0
+         latent_height = image_height // 8
+         latent_width = image_width // 8
+         assert latent_height >= self.min_latent_shape and latent_height <= self.max_latent_shape
+         assert latent_width >= self.min_latent_shape and latent_width <= self.max_latent_shape
+         return (latent_height, latent_width)
+
+     def get_minmax_dims(self, batch_size, image_height, image_width, static_batch, static_shape):
+         min_batch = batch_size if static_batch else self.min_batch
+         max_batch = batch_size if static_batch else self.max_batch
+         latent_height = image_height // 8
+         latent_width = image_width // 8
+         min_image_height = image_height if static_shape else self.min_image_shape
+         max_image_height = image_height if static_shape else self.max_image_shape
+         min_image_width = image_width if static_shape else self.min_image_shape
+         max_image_width = image_width if static_shape else self.max_image_shape
+         min_latent_height = latent_height if static_shape else self.min_latent_shape
+         max_latent_height = latent_height if static_shape else self.max_latent_shape
+         min_latent_width = latent_width if static_shape else self.min_latent_shape
+         max_latent_width = latent_width if static_shape else self.max_latent_shape
+         return (
+             min_batch,
+             max_batch,
+             min_image_height,
+             max_image_height,
+             min_image_width,
+             max_image_width,
+             min_latent_height,
+             max_latent_height,
+             min_latent_width,
+             max_latent_width,
+         )
+
+
+ class CLIP(BaseModel):
+     def __init__(self, device, max_batch_size, embedding_dim, min_batch_size=1):
+         super(CLIP, self).__init__(
+             device=device,
+             max_batch_size=max_batch_size,
+             min_batch_size=min_batch_size,
+             embedding_dim=embedding_dim,
+         )
+         self.name = "CLIP"
+
+     def get_input_names(self):
+         return ["input_ids"]
+
+     def get_output_names(self):
+         return ["text_embeddings", "pooler_output"]
+
+     def get_dynamic_axes(self):
+         return {"input_ids": {0: "B"}, "text_embeddings": {0: "B"}}
+
+     def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape):
+         self.check_dims(batch_size, image_height, image_width)
+         min_batch, max_batch, _, _, _, _, _, _, _, _ = self.get_minmax_dims(
+             batch_size, image_height, image_width, static_batch, static_shape
+         )
+         return {
+             "input_ids": [
+                 (min_batch, self.text_maxlen),
+                 (batch_size, self.text_maxlen),
+                 (max_batch, self.text_maxlen),
+             ]
+         }
+
+     def get_shape_dict(self, batch_size, image_height, image_width):
+         self.check_dims(batch_size, image_height, image_width)
+         return {
+             "input_ids": (batch_size, self.text_maxlen),
+             "text_embeddings": (batch_size, self.text_maxlen, self.embedding_dim),
+         }
+
+     def get_sample_input(self, batch_size, image_height, image_width):
+         self.check_dims(batch_size, image_height, image_width)
+         return torch.zeros(batch_size, self.text_maxlen, dtype=torch.int32, device=self.device)
+
+     def optimize(self, onnx_graph):
+         opt = Optimizer(onnx_graph)
+         opt.info(self.name + ": original")
+         opt.select_outputs([0])  # delete graph output#1
+         opt.cleanup()
+         opt.info(self.name + ": remove output[1]")
+         opt.fold_constants()
+         opt.info(self.name + ": fold constants")
+         opt.infer_shapes()
+         opt.info(self.name + ": shape inference")
+         opt.select_outputs([0], names=["text_embeddings"])  # rename network output
+         opt.info(self.name + ": remove output[0]")
+         opt_onnx_graph = opt.cleanup(return_onnx=True)
+         opt.info(self.name + ": finished")
+         return opt_onnx_graph
+
+
+ class UNet(BaseModel):
+     def __init__(
+         self,
+         fp16=False,
+         device="cuda",
+         max_batch_size=16,
+         min_batch_size=1,
+         embedding_dim=768,
+         text_maxlen=77,
+         unet_dim=4,
+     ):
+         super(UNet, self).__init__(
+             fp16=fp16,
+             device=device,
+             max_batch_size=max_batch_size,
+             min_batch_size=min_batch_size,
+             embedding_dim=embedding_dim,
+             text_maxlen=text_maxlen,
+         )
+         self.unet_dim = unet_dim
+         self.name = "UNet"
+
+     def get_input_names(self):
+         return ["sample", "timestep", "encoder_hidden_states"]
+
+     def get_output_names(self):
+         return ["latent"]
+
+     def get_dynamic_axes(self):
+         return {
+             "sample": {0: "2B", 2: "H", 3: "W"},
+             "timestep": {0: "2B"},
+             "encoder_hidden_states": {0: "2B"},
+             "latent": {0: "2B", 2: "H", 3: "W"},
+         }
+
+     def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape):
+         latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+         (
+             min_batch,
+             max_batch,
+             _,
+             _,
+             _,
+             _,
+             min_latent_height,
+             max_latent_height,
+             min_latent_width,
+             max_latent_width,
+         ) = self.get_minmax_dims(batch_size, image_height, image_width, static_batch, static_shape)
+         return {
+             "sample": [
+                 (min_batch, self.unet_dim, min_latent_height, min_latent_width),
+                 (batch_size, self.unet_dim, latent_height, latent_width),
+                 (max_batch, self.unet_dim, max_latent_height, max_latent_width),
+             ],
+             "timestep": [(min_batch,), (batch_size,), (max_batch,)],
+             "encoder_hidden_states": [
+                 (min_batch, self.text_maxlen, self.embedding_dim),
+                 (batch_size, self.text_maxlen, self.embedding_dim),
+                 (max_batch, self.text_maxlen, self.embedding_dim),
+             ],
+         }
+
+     def get_shape_dict(self, batch_size, image_height, image_width):
+         latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+         return {
+             "sample": (2 * batch_size, self.unet_dim, latent_height, latent_width),
+             "timestep": (2 * batch_size,),
+             "encoder_hidden_states": (2 * batch_size, self.text_maxlen, self.embedding_dim),
+             "latent": (2 * batch_size, 4, latent_height, latent_width),
+         }
+
+     def get_sample_input(self, batch_size, image_height, image_width):
+         latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+         dtype = torch.float16 if self.fp16 else torch.float32
+         return (
+             torch.randn(
+                 2 * batch_size, self.unet_dim, latent_height, latent_width, dtype=torch.float32, device=self.device
+             ),
+             torch.ones((2 * batch_size,), dtype=torch.float32, device=self.device),
+             torch.randn(2 * batch_size, self.text_maxlen, self.embedding_dim, dtype=dtype, device=self.device),
+         )
+
+
+ class VAE(BaseModel):
+     def __init__(self, device, max_batch_size, min_batch_size=1):
+         super(VAE, self).__init__(
+             device=device,
+             max_batch_size=max_batch_size,
+             min_batch_size=min_batch_size,
+             embedding_dim=None,
+         )
+         self.name = "VAE decoder"
+
+     def get_input_names(self):
+         return ["latent"]
+
+     def get_output_names(self):
+         return ["images"]
+
+     def get_dynamic_axes(self):
+         return {
+             "latent": {0: "B", 2: "H", 3: "W"},
+             "images": {0: "B", 2: "8H", 3: "8W"},
+         }
+
+     def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape):
+         latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+         (
+             min_batch,
+             max_batch,
+             _,
+             _,
+             _,
+             _,
+             min_latent_height,
+             max_latent_height,
+             min_latent_width,
+             max_latent_width,
+         ) = self.get_minmax_dims(batch_size, image_height, image_width, static_batch, static_shape)
+         return {
+             "latent": [
+                 (min_batch, 4, min_latent_height, min_latent_width),
+                 (batch_size, 4, latent_height, latent_width),
+                 (max_batch, 4, max_latent_height, max_latent_width),
+             ]
+         }
+
+     def get_shape_dict(self, batch_size, image_height, image_width):
+         latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+         return {
+             "latent": (batch_size, 4, latent_height, latent_width),
+             "images": (batch_size, 3, image_height, image_width),
+         }
+
+     def get_sample_input(self, batch_size, image_height, image_width):
+         latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+         return torch.randn(
+             batch_size,
+             4,
+             latent_height,
+             latent_width,
+             dtype=torch.float32,
+             device=self.device,
+         )
+
+
+ class VAEEncoder(BaseModel):
+     def __init__(self, device, max_batch_size, min_batch_size=1):
+         super(VAEEncoder, self).__init__(
+             device=device,
+             max_batch_size=max_batch_size,
+             min_batch_size=min_batch_size,
+             embedding_dim=None,
+         )
+         self.name = "VAE encoder"
+
+     def get_input_names(self):
+         return ["images"]
+
+     def get_output_names(self):
+         return ["latent"]
+
+     def get_dynamic_axes(self):
+         return {
+             "images": {0: "B", 2: "8H", 3: "8W"},
+             "latent": {0: "B", 2: "H", 3: "W"},
+         }
+
+     def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape):
+         assert batch_size >= self.min_batch and batch_size <= self.max_batch
+         min_batch = batch_size if static_batch else self.min_batch
+         max_batch = batch_size if static_batch else self.max_batch
+         self.check_dims(batch_size, image_height, image_width)
+         (
+             min_batch,
+             max_batch,
+             min_image_height,
+             max_image_height,
+             min_image_width,
+             max_image_width,
+             _,
+             _,
+             _,
+             _,
+         ) = self.get_minmax_dims(batch_size, image_height, image_width, static_batch, static_shape)
+
+         return {
+             "images": [
+                 (min_batch, 3, min_image_height, min_image_width),
+                 (batch_size, 3, image_height, image_width),
+                 (max_batch, 3, max_image_height, max_image_width),
+             ],
+         }
+
+     def get_shape_dict(self, batch_size, image_height, image_width):
+         latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+         return {
+             "images": (batch_size, 3, image_height, image_width),
+             "latent": (batch_size, 4, latent_height, latent_width),
+         }
+
+     def get_sample_input(self, batch_size, image_height, image_width):
+         self.check_dims(batch_size, image_height, image_width)
+         return torch.randn(
+             batch_size,
+             3,
+             image_height,
+             image_width,
+             dtype=torch.float32,
+             device=self.device,
+         )
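
These wrappers mainly describe ONNX I/O names, dynamic axes, and the min/opt/max TensorRT optimization profile. A short sketch of inspecting the UNet profile (the parameter values are illustrative; `embedding_dim=768` matches SD 1.x text encoders):

    # Build a UNet spec and ask for its TensorRT input profile at 512x512.
    unet_spec = UNet(fp16=True, device="cuda", max_batch_size=2, min_batch_size=1, embedding_dim=768, unet_dim=4)
    profile = unet_spec.get_input_profile(
        batch_size=1, image_height=512, image_width=512, static_batch=False, static_shape=True
    )
    # profile["sample"] -> [(1, 4, 64, 64), (1, 4, 64, 64), (2, 4, 64, 64)]  (min / opt / max)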
streamv2v/acceleration/tensorrt/utilities.py ADDED
@@ -0,0 +1,441 @@
+ #! fork: https://github.com/NVIDIA/TensorRT/blob/main/demo/Diffusion/utilities.py
+
+ #
+ # Copyright 2022 The HuggingFace Inc. team.
+ # SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ import gc
+ from collections import OrderedDict
+ from typing import *
+
+ import numpy as np
+ import onnx
+ import onnx_graphsurgeon as gs
+ import tensorrt as trt
+ import torch
+ from cuda import cudart
+ from PIL import Image
+ from polygraphy import cuda
+ from polygraphy.backend.common import bytes_from_path
+ from polygraphy.backend.trt import (
+     CreateConfig,
+     Profile,
+     engine_from_bytes,
+     engine_from_network,
+     network_from_onnx_path,
+     save_engine,
+ )
+ from polygraphy.backend.trt import util as trt_util
+
+ from .models import CLIP, VAE, BaseModel, UNet, VAEEncoder
+
+
+ TRT_LOGGER = trt.Logger(trt.Logger.ERROR)
+
+ # Map of numpy dtype -> torch dtype
+ numpy_to_torch_dtype_dict = {
+     np.uint8: torch.uint8,
+     np.int8: torch.int8,
+     np.int16: torch.int16,
+     np.int32: torch.int32,
+     np.int64: torch.int64,
+     np.float16: torch.float16,
+     np.float32: torch.float32,
+     np.float64: torch.float64,
+     np.complex64: torch.complex64,
+     np.complex128: torch.complex128,
+ }
+ if np.version.full_version >= "1.24.0":
+     numpy_to_torch_dtype_dict[np.bool_] = torch.bool
+ else:
+     numpy_to_torch_dtype_dict[np.bool] = torch.bool
+
+ # Map of torch dtype -> numpy dtype
+ torch_to_numpy_dtype_dict = {value: key for (key, value) in numpy_to_torch_dtype_dict.items()}
+
+
+ def CUASSERT(cuda_ret):
+     err = cuda_ret[0]
+     if err != cudart.cudaError_t.cudaSuccess:
+         raise RuntimeError(
+             f"CUDA ERROR: {err}, error code reference: https://nvidia.github.io/cuda-python/module/cudart.html#cuda.cudart.cudaError_t"
+         )
+     if len(cuda_ret) > 1:
+         return cuda_ret[1]
+     return None
+
+
+ class Engine:
+     def __init__(
+         self,
+         engine_path,
+     ):
+         self.engine_path = engine_path
+         self.engine = None
+         self.context = None
+         self.buffers = OrderedDict()
+         self.tensors = OrderedDict()
+         self.cuda_graph_instance = None  # cuda graph
+
+     def __del__(self):
+         [buf.free() for buf in self.buffers.values() if isinstance(buf, cuda.DeviceArray)]
+         del self.engine
+         del self.context
+         del self.buffers
+         del self.tensors
+
+     def refit(self, onnx_path, onnx_refit_path):
+         def convert_int64(arr):
+             # TODO: smarter conversion
+             if len(arr.shape) == 0:
+                 return np.int32(arr)
+             return arr
+
+         def add_to_map(refit_dict, name, values):
+             if name in refit_dict:
+                 assert refit_dict[name] is None
+                 if values.dtype == np.int64:
+                     values = convert_int64(values)
+                 refit_dict[name] = values
+
+         print(f"Refitting TensorRT engine with {onnx_refit_path} weights")
+         refit_nodes = gs.import_onnx(onnx.load(onnx_refit_path)).toposort().nodes
+
+         # Construct mapping from weight names in refit model -> original model
+         name_map = {}
+         for n, node in enumerate(gs.import_onnx(onnx.load(onnx_path)).toposort().nodes):
+             refit_node = refit_nodes[n]
+             assert node.op == refit_node.op
+             # Constant nodes in ONNX do not have inputs but have a constant output
+             if node.op == "Constant":
+                 name_map[refit_node.outputs[0].name] = node.outputs[0].name
+             # Handle scale and bias weights
+             elif node.op == "Conv":
+                 if node.inputs[1].__class__ == gs.Constant:
+                     name_map[refit_node.name + "_TRTKERNEL"] = node.name + "_TRTKERNEL"
+                 if node.inputs[2].__class__ == gs.Constant:
+                     name_map[refit_node.name + "_TRTBIAS"] = node.name + "_TRTBIAS"
+             # For all other nodes: find node inputs that are initializers (gs.Constant)
+             else:
+                 for i, inp in enumerate(node.inputs):
+                     if inp.__class__ == gs.Constant:
+                         name_map[refit_node.inputs[i].name] = inp.name
+
+         def map_name(name):
+             if name in name_map:
+                 return name_map[name]
+             return name
+
+         # Construct refit dictionary
+         refit_dict = {}
+         refitter = trt.Refitter(self.engine, TRT_LOGGER)
+         all_weights = refitter.get_all()
+         for layer_name, role in zip(all_weights[0], all_weights[1]):
+             # for specialized roles, use a unique name in the map:
+             if role == trt.WeightsRole.KERNEL:
+                 name = layer_name + "_TRTKERNEL"
+             elif role == trt.WeightsRole.BIAS:
+                 name = layer_name + "_TRTBIAS"
+             else:
+                 name = layer_name
+
+             assert name not in refit_dict, "Found duplicate layer: " + name
+             refit_dict[name] = None
+
+         for n in refit_nodes:
+             # Constant nodes in ONNX do not have inputs but have a constant output
+             if n.op == "Constant":
+                 name = map_name(n.outputs[0].name)
+                 print(f"Add Constant {name}\n")
+                 add_to_map(refit_dict, name, n.outputs[0].values)
+
+             # Handle scale and bias weights
+             elif n.op == "Conv":
+                 if n.inputs[1].__class__ == gs.Constant:
+                     name = map_name(n.name + "_TRTKERNEL")
+                     add_to_map(refit_dict, name, n.inputs[1].values)
+
+                 if n.inputs[2].__class__ == gs.Constant:
+                     name = map_name(n.name + "_TRTBIAS")
+                     add_to_map(refit_dict, name, n.inputs[2].values)
+
+             # For all other nodes: find node inputs that are initializers (AKA gs.Constant)
+             else:
+                 for inp in n.inputs:
+                     name = map_name(inp.name)
+                     if inp.__class__ == gs.Constant:
+                         add_to_map(refit_dict, name, inp.values)
+
+         for layer_name, weights_role in zip(all_weights[0], all_weights[1]):
+             if weights_role == trt.WeightsRole.KERNEL:
+                 custom_name = layer_name + "_TRTKERNEL"
+             elif weights_role == trt.WeightsRole.BIAS:
+                 custom_name = layer_name + "_TRTBIAS"
+             else:
+                 custom_name = layer_name
+
+             # Skip refitting Trilu for now; scalar weights of type int64 value 1 - for clip model
+             if layer_name.startswith("onnx::Trilu"):
+                 continue
+
+             if refit_dict[custom_name] is not None:
+                 refitter.set_weights(layer_name, weights_role, refit_dict[custom_name])
+             else:
+                 print(f"[W] No refit weights for layer: {layer_name}")
+
+         if not refitter.refit_cuda_engine():
+             print("Failed to refit!")
+             exit(0)
+
+     def build(
+         self,
+         onnx_path,
+         fp16,
+         input_profile=None,
+         enable_refit=False,
+         enable_all_tactics=False,
+         timing_cache=None,
+         workspace_size=0,
+     ):
+         print(f"Building TensorRT engine for {onnx_path}: {self.engine_path}")
+         p = Profile()
+         if input_profile:
+             for name, dims in input_profile.items():
+                 assert len(dims) == 3
+                 p.add(name, min=dims[0], opt=dims[1], max=dims[2])
+
+         config_kwargs = {}
+
+         if workspace_size > 0:
+             config_kwargs["memory_pool_limits"] = {trt.MemoryPoolType.WORKSPACE: workspace_size}
+         if not enable_all_tactics:
+             config_kwargs["tactic_sources"] = []
+
+         engine = engine_from_network(
+             network_from_onnx_path(onnx_path, flags=[trt.OnnxParserFlag.NATIVE_INSTANCENORM]),
+             config=CreateConfig(
+                 fp16=fp16, refittable=enable_refit, profiles=[p], load_timing_cache=timing_cache, **config_kwargs
+             ),
+             save_timing_cache=timing_cache,
+         )
+         save_engine(engine, path=self.engine_path)
+
+     def load(self):
+         print(f"Loading TensorRT engine: {self.engine_path}")
+         self.engine = engine_from_bytes(bytes_from_path(self.engine_path))
+
+     def activate(self, reuse_device_memory=None):
+         if reuse_device_memory:
+             self.context = self.engine.create_execution_context_without_device_memory()
+             self.context.device_memory = reuse_device_memory
+         else:
+             self.context = self.engine.create_execution_context()
+
+     def allocate_buffers(self, shape_dict=None, device="cuda"):
+         for idx in range(trt_util.get_bindings_per_profile(self.engine)):
+             binding = self.engine[idx]
+             if shape_dict and binding in shape_dict:
+                 shape = shape_dict[binding]
+             else:
+                 shape = self.engine.get_binding_shape(binding)
+             dtype = trt.nptype(self.engine.get_binding_dtype(binding))
+             if self.engine.binding_is_input(binding):
+                 self.context.set_binding_shape(idx, shape)
+             tensor = torch.empty(tuple(shape), dtype=numpy_to_torch_dtype_dict[dtype]).to(device=device)
+             self.tensors[binding] = tensor
+
+     def infer(self, feed_dict, stream, use_cuda_graph=False):
+         for name, buf in feed_dict.items():
+             self.tensors[name].copy_(buf)
+
+         for name, tensor in self.tensors.items():
+             self.context.set_tensor_address(name, tensor.data_ptr())
+
+         if use_cuda_graph:
+             if self.cuda_graph_instance is not None:
+                 CUASSERT(cudart.cudaGraphLaunch(self.cuda_graph_instance, stream.ptr))
+                 CUASSERT(cudart.cudaStreamSynchronize(stream.ptr))
+             else:
+                 # do inference before CUDA graph capture
+                 noerror = self.context.execute_async_v3(stream.ptr)
+                 if not noerror:
+                     raise ValueError("ERROR: inference failed.")
+                 # capture cuda graph
+                 CUASSERT(
+                     cudart.cudaStreamBeginCapture(stream.ptr, cudart.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal)
+                 )
+                 self.context.execute_async_v3(stream.ptr)
+                 self.graph = CUASSERT(cudart.cudaStreamEndCapture(stream.ptr))
+                 self.cuda_graph_instance = CUASSERT(cudart.cudaGraphInstantiate(self.graph, 0))
+         else:
+             noerror = self.context.execute_async_v3(stream.ptr)
+             if not noerror:
+                 raise ValueError("ERROR: inference failed.")
+
+         return self.tensors
+
+
+ def decode_images(images: torch.Tensor):
+     images = (
+         ((images + 1) * 255 / 2).clamp(0, 255).detach().permute(0, 2, 3, 1).round().type(torch.uint8).cpu().numpy()
+     )
+     return [Image.fromarray(x) for x in images]
+
+
+ def preprocess_image(image: Image.Image):
+     w, h = image.size
+     w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
+     image = image.resize((w, h))
+     init_image = np.array(image).astype(np.float32) / 255.0
+     init_image = init_image[None].transpose(0, 3, 1, 2)
+     init_image = torch.from_numpy(init_image).contiguous()
+     return 2.0 * init_image - 1.0
+
+
+ def prepare_mask_and_masked_image(image: Image.Image, mask: Image.Image):
+     if isinstance(image, Image.Image):
+         image = np.array(image.convert("RGB"))
+         image = image[None].transpose(0, 3, 1, 2)
+         image = torch.from_numpy(image).to(dtype=torch.float32).contiguous() / 127.5 - 1.0
+     if isinstance(mask, Image.Image):
+         mask = np.array(mask.convert("L"))
+         mask = mask.astype(np.float32) / 255.0
+         mask = mask[None, None]
+         mask[mask < 0.5] = 0
+         mask[mask >= 0.5] = 1
+         mask = torch.from_numpy(mask).to(dtype=torch.float32).contiguous()
+
+     masked_image = image * (mask < 0.5)
+
+     return mask, masked_image
+
+
+ def create_models(
+     model_id: str,
+     use_auth_token: Optional[str],
+     device: Union[str, torch.device],
+     max_batch_size: int,
+     unet_in_channels: int = 4,
+     embedding_dim: int = 768,
+ ):
+     models = {
+         "clip": CLIP(
+             hf_token=use_auth_token,
+             device=device,
+             max_batch_size=max_batch_size,
+             embedding_dim=embedding_dim,
+         ),
+         "unet": UNet(
+             hf_token=use_auth_token,
+             fp16=True,
+             device=device,
+             max_batch_size=max_batch_size,
+             embedding_dim=embedding_dim,
+             unet_dim=unet_in_channels,
+         ),
+         "vae": VAE(
+             hf_token=use_auth_token,
+             device=device,
+             max_batch_size=max_batch_size,
+             embedding_dim=embedding_dim,
+         ),
+         "vae_encoder": VAEEncoder(
+             hf_token=use_auth_token,
+             device=device,
+             max_batch_size=max_batch_size,
+             embedding_dim=embedding_dim,
+         ),
+     }
+     return models
+
+
+ def build_engine(
+     engine_path: str,
+     onnx_opt_path: str,
+     model_data: BaseModel,
+     opt_image_height: int,
+     opt_image_width: int,
+     opt_batch_size: int,
+     build_static_batch: bool = False,
+     build_dynamic_shape: bool = False,
+     build_all_tactics: bool = False,
+     build_enable_refit: bool = False,
+ ):
+     _, free_mem, _ = cudart.cudaMemGetInfo()
+     GiB = 2**30
+     if free_mem > 6 * GiB:
+         activation_carveout = 4 * GiB
+         max_workspace_size = free_mem - activation_carveout
+     else:
+         max_workspace_size = 0
+     engine = Engine(engine_path)
+     input_profile = model_data.get_input_profile(
+         opt_batch_size,
+         opt_image_height,
+         opt_image_width,
+         static_batch=build_static_batch,
+         static_shape=not build_dynamic_shape,
+     )
+     engine.build(
+         onnx_opt_path,
+         fp16=True,
+         input_profile=input_profile,
+         enable_refit=build_enable_refit,
+         enable_all_tactics=build_all_tactics,
+         workspace_size=max_workspace_size,
+     )
+
+     return engine
+
+
+ def export_onnx(
+     model,
+     onnx_path: str,
+     model_data: BaseModel,
+     opt_image_height: int,
+     opt_image_width: int,
+     opt_batch_size: int,
+     onnx_opset: int,
+ ):
+     with torch.inference_mode(), torch.autocast("cuda"):
+         inputs = model_data.get_sample_input(opt_batch_size, opt_image_height, opt_image_width)
+         torch.onnx.export(
+             model,
+             inputs,
+             onnx_path,
+             export_params=True,
+             opset_version=onnx_opset,
+             do_constant_folding=True,
+             input_names=model_data.get_input_names(),
+             output_names=model_data.get_output_names(),
+             dynamic_axes=model_data.get_dynamic_axes(),
+         )
+     del model
+     gc.collect()
+     torch.cuda.empty_cache()
+
+
+ def optimize_onnx(
+     onnx_path: str,
+     onnx_opt_path: str,
+     model_data: BaseModel,
+ ):
+     onnx_opt_graph = model_data.optimize(onnx.load(onnx_path))
+     onnx.save(onnx_opt_graph, onnx_opt_path)
+     del onnx_opt_graph
+     gc.collect()
+     torch.cuda.empty_cache()
streamv2v/image_filter.py ADDED
@@ -0,0 +1,45 @@
+ from typing import Optional
+ import random
+
+ import torch
+
+
+ class SimilarImageFilter:
+     def __init__(self, threshold: float = 0.98, max_skip_frame: float = 10) -> None:
+         self.threshold = threshold
+         self.prev_tensor = None
+         self.cos = torch.nn.CosineSimilarity(dim=0, eps=1e-6)
+         self.max_skip_frame = max_skip_frame
+         self.skip_count = 0
+
+     def __call__(self, x: torch.Tensor) -> Optional[torch.Tensor]:
+         if self.prev_tensor is None:
+             self.prev_tensor = x.detach().clone()
+             return x
+         else:
+             cos_sim = self.cos(self.prev_tensor.reshape(-1), x.reshape(-1)).item()
+             sample = random.uniform(0, 1)
+             if self.threshold >= 1:
+                 skip_prob = 0
+             else:
+                 skip_prob = max(0, 1 - (1 - cos_sim) / (1 - self.threshold))
+
+             # not skip frame
+             if skip_prob < sample:
+                 self.prev_tensor = x.detach().clone()
+                 return x
+             # skip frame
+             else:
+                 if self.skip_count > self.max_skip_frame:
+                     self.skip_count = 0
+                     self.prev_tensor = x.detach().clone()
+                     return x
+                 else:
+                     self.skip_count += 1
+                     return None
+
+     def set_threshold(self, threshold: float) -> None:
+         self.threshold = threshold
+
+     def set_max_skip_frame(self, max_skip_frame: float) -> None:
+         self.max_skip_frame = max_skip_frame
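
A short usage sketch, assuming `frames` is an iterable of image tensors and `process` stands in for the downstream pipeline step (both hypothetical): near-duplicate frames are skipped probabilistically (the call returns `None`), and a frame is always passed through once `max_skip_frame` consecutive skips have accumulated.

    sim_filter = SimilarImageFilter(threshold=0.98, max_skip_frame=10)
    for frame in frames:            # `frames`: iterable of torch.Tensor images (assumed)
        kept = sim_filter(frame)
        if kept is None:
            continue                # frame judged too similar to the previous one; reuse the last output
        process(kept)               # `process`: placeholder for the downstream step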
streamv2v/image_utils.py ADDED
@@ -0,0 +1,173 @@
+ from typing import List, Optional, Tuple, Union
+
+ import numpy as np
+ import PIL.Image
+ import torch
+ import torchvision
+
+
+ def denormalize(images: Union[torch.Tensor, np.ndarray]) -> torch.Tensor:
+     """
+     Denormalize an image array to [0,1].
+     """
+     return (images / 2 + 0.5).clamp(0, 1)
+
+
+ def pt_to_numpy(images: torch.Tensor) -> np.ndarray:
+     """
+     Convert a PyTorch tensor to a NumPy image.
+     """
+     images = images.cpu().permute(0, 2, 3, 1).float().numpy()
+     return images
+
+
+ def numpy_to_pil(images: np.ndarray) -> PIL.Image.Image:
+     """
+     Convert a NumPy image or a batch of images to a PIL image.
+     """
+     if images.ndim == 3:
+         images = images[None, ...]
+     images = (images * 255).round().astype("uint8")
+     if images.shape[-1] == 1:
+         # special case for grayscale (single channel) images
+         pil_images = [
+             PIL.Image.fromarray(image.squeeze(), mode="L") for image in images
+         ]
+     else:
+         pil_images = [PIL.Image.fromarray(image) for image in images]
+
+     return pil_images
+
+
+ def postprocess_image(
+     image: torch.Tensor,
+     output_type: str = "pil",
+     do_denormalize: Optional[List[bool]] = None,
+ ) -> Union[torch.Tensor, np.ndarray, PIL.Image.Image]:
+     if not isinstance(image, torch.Tensor):
+         raise ValueError(
+             f"Input for postprocessing is in incorrect format: {type(image)}. We only support pytorch tensor"
+         )
+
+     if output_type == "latent":
+         return image
+
+     do_normalize_flg = True
+     if do_denormalize is None:
+         do_denormalize = [do_normalize_flg] * image.shape[0]
+
+     image = torch.stack(
+         [
+             denormalize(image[i]) if do_denormalize[i] else image[i]
+             for i in range(image.shape[0])
+         ]
+     )
+
+     if output_type == "pt":
+         return image
+
+     image = pt_to_numpy(image)
+
+     if output_type == "np":
+         return image
+
+     if output_type == "pil":
+         return numpy_to_pil(image)
+
+
+ def process_image(
+     image_pil: PIL.Image.Image, range: Tuple[int, int] = (-1, 1)
+ ) -> Tuple[torch.Tensor, PIL.Image.Image]:
+     image = torchvision.transforms.ToTensor()(image_pil)
+     r_min, r_max = range[0], range[1]
+     image = image * (r_max - r_min) + r_min
+     return image[None, ...], image_pil
+
+
+ def pil2tensor(image_pil: PIL.Image.Image) -> torch.Tensor:
+     height = image_pil.height
+     width = image_pil.width
+     imgs = []
+     img, _ = process_image(image_pil)
+     imgs.append(img)
+     imgs = torch.vstack(imgs)
+     images = torch.nn.functional.interpolate(
+         imgs, size=(height, width), mode="bilinear"
+     )
+     image_tensors = images.to(torch.float16)
+     return image_tensors
+
+
+ ### Optical flow utils
+
+ def coords_grid(b, h, w, homogeneous=False, device=None):
+     y, x = torch.meshgrid(torch.arange(h), torch.arange(w))  # [H, W]
+
+     stacks = [x, y]
+
+     if homogeneous:
+         ones = torch.ones_like(x)  # [H, W]
+         stacks.append(ones)
+
+     grid = torch.stack(stacks, dim=0).float()  # [2, H, W] or [3, H, W]
+
+     grid = grid[None].repeat(b, 1, 1, 1)  # [B, 2, H, W] or [B, 3, H, W]
+
+     if device is not None:
+         grid = grid.to(device)
+
+     return grid
+
+
+ def flow_warp(feature, flow, mask=False, padding_mode='zeros'):
+     b, c, h, w = feature.size()
+     assert flow.size(1) == 2
+
+     grid = coords_grid(b, h, w).to(flow.device) + flow  # [B, 2, H, W]
+
+     return bilinear_sample(feature, grid, padding_mode=padding_mode,
+                            return_mask=mask)
+
+
+ def bilinear_sample(img, sample_coords, mode='bilinear', padding_mode='zeros', return_mask=False):
+     # img: [B, C, H, W]
+     # sample_coords: [B, 2, H, W] in image scale
+     if sample_coords.size(1) != 2:  # [B, H, W, 2]
+         sample_coords = sample_coords.permute(0, 3, 1, 2)
+
+     b, _, h, w = sample_coords.shape
+
+     # Normalize to [-1, 1]
+     x_grid = 2 * sample_coords[:, 0] / (w - 1) - 1
+     y_grid = 2 * sample_coords[:, 1] / (h - 1) - 1
+
+     grid = torch.stack([x_grid, y_grid], dim=-1)  # [B, H, W, 2]
+
+     img = torch.nn.functional.grid_sample(img, grid, mode=mode, padding_mode=padding_mode, align_corners=True)
+
+     if return_mask:
+         mask = (x_grid >= -1) & (y_grid >= -1) & (x_grid <= 1) & (y_grid <= 1)  # [B, H, W]
+
+         return img, mask
+
+     return img
+
+
+ def forward_backward_consistency_check(fwd_flow, bwd_flow,
+                                        alpha=0.1,
+                                        beta=0.5
+                                        ):
+     # fwd_flow, bwd_flow: [B, 2, H, W]
+     # alpha and beta values are following UnFlow (https://arxiv.org/abs/1711.07837)
+     assert fwd_flow.dim() == 4 and bwd_flow.dim() == 4
+     assert fwd_flow.size(1) == 2 and bwd_flow.size(1) == 2
+     flow_mag = torch.norm(fwd_flow, dim=1) + torch.norm(bwd_flow, dim=1)  # [B, H, W]
+
+     warped_bwd_flow = flow_warp(bwd_flow, fwd_flow)  # [B, 2, H, W]
+     warped_fwd_flow = flow_warp(fwd_flow, bwd_flow)  # [B, 2, H, W]
+
+     diff_fwd = torch.norm(fwd_flow + warped_bwd_flow, dim=1)  # [B, H, W]
+     diff_bwd = torch.norm(bwd_flow + warped_fwd_flow, dim=1)
+
+     threshold = alpha * flow_mag + beta
+
+     fwd_occ = (diff_fwd > threshold).float()  # [B, H, W]
+     bwd_occ = (diff_bwd > threshold).float()
+
+     return fwd_occ, bwd_occ
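
A sketch of the optical-flow helpers above, with random tensors standing in for real flow estimates (shapes are illustrative): `flow_warp` resamples a feature map along a flow field, and `forward_backward_consistency_check` marks pixels whose forward and backward flows disagree as occluded.

    import torch

    b, h, w = 1, 64, 64
    feature = torch.randn(b, 3, h, w)
    fwd_flow = torch.randn(b, 2, h, w)   # stand-in for a real forward flow estimate
    bwd_flow = torch.randn(b, 2, h, w)   # stand-in for a real backward flow estimate

    warped = flow_warp(feature, fwd_flow)                                       # [B, 3, H, W]
    fwd_occ, bwd_occ = forward_backward_consistency_check(fwd_flow, bwd_flow)   # [B, H, W] masks in {0, 1}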
streamv2v/models/__init__.py ADDED
File without changes
streamv2v/models/attention_processor.py ADDED
@@ -0,0 +1,352 @@
1
+ from importlib import import_module
2
+ from typing import Callable, Optional, Union
3
+ from collections import deque
4
+
5
+ import torch
6
+ import torch.nn.functional as F
7
+ from torch import nn
8
+
9
+ from diffusers.models.attention_processor import Attention
10
+ from diffusers.utils import USE_PEFT_BACKEND, deprecate, logging
11
+ from diffusers.utils.import_utils import is_xformers_available
12
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
13
+ from diffusers.models.lora import LoRACompatibleLinear, LoRALinearLayer
14
+
15
+ from .utils import get_nn_feats, random_bipartite_soft_matching
16
+
17
+ if is_xformers_available():
18
+ import xformers
19
+ import xformers.ops
20
+ else:
21
+ xformers = None
22
+
23
+ class CachedSTAttnProcessor2_0:
24
+ r"""
25
+ Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
26
+ """
27
+
28
+ def __init__(self, name=None, use_feature_injection=False,
29
+ feature_injection_strength=0.8,
30
+ feature_similarity_threshold=0.98,
31
+ interval=4,
32
+ max_frames=1,
33
+ use_tome_cache=False,
34
+ tome_metric="keys",
35
+ use_grid=False,
36
+ tome_ratio=0.5):
37
+ if not hasattr(F, "scaled_dot_product_attention"):
38
+ raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
39
+ self.name = name
40
+ self.use_feature_injection = use_feature_injection
41
+ self.fi_strength = feature_injection_strength
42
+ self.threshold = feature_similarity_threshold
43
+ self.zero_tensor = torch.tensor(0)
44
+ self.frame_id = torch.tensor(0)
45
+ self.interval = torch.tensor(interval)
46
+ self.max_frames = max_frames
47
+ self.cached_key = None
48
+ self.cached_value = None
49
+ self.cached_output = None
50
+ self.use_tome_cache = use_tome_cache
51
+ self.tome_metric = tome_metric
52
+ self.use_grid = use_grid
53
+ self.tome_ratio = tome_ratio
54
+
55
+ def _tome_step_kvout(self, keys, values, outputs):
56
+ keys = torch.cat([self.cached_key, keys], dim=1)
57
+ values = torch.cat([self.cached_value, values], dim=1)
58
+ outputs = torch.cat([self.cached_output, outputs], dim=1)
59
+ m_kv_out, _, _= random_bipartite_soft_matching(metric=keys, use_grid=self.use_grid, ratio=self.tome_ratio)
60
+ compact_keys, compact_values, compact_outputs = m_kv_out(keys, values, outputs)
61
+ self.cached_key = compact_keys
62
+ self.cached_value = compact_values
63
+ self.cached_output = compact_outputs
64
+
65
+ def __call__(
66
+ self,
67
+ attn: Attention,
68
+ hidden_states: torch.FloatTensor,
69
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
70
+ attention_mask: Optional[torch.FloatTensor] = None,
71
+ temb: Optional[torch.FloatTensor] = None,
72
+ scale: float = 1.0,
73
+ ) -> torch.FloatTensor:
74
+ residual = hidden_states
75
+ if attn.spatial_norm is not None:
76
+ hidden_states = attn.spatial_norm(hidden_states, temb)
77
+
78
+ input_ndim = hidden_states.ndim
79
+
80
+ if input_ndim == 4:
81
+ batch_size, channel, height, width = hidden_states.shape
82
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
83
+
84
+ batch_size, sequence_length, _ = (
85
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
86
+ )
87
+
88
+ if attention_mask is not None:
89
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
90
+ # scaled_dot_product_attention expects attention_mask shape to be
91
+ # (batch, heads, source_length, target_length)
92
+ attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
93
+
94
+ if attn.group_norm is not None:
95
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
96
+
97
+ args = () if USE_PEFT_BACKEND else (scale,)
98
+ query = attn.to_q(hidden_states, *args)
99
+
100
+ is_selfattn = False
101
+ if encoder_hidden_states is None:
102
+ is_selfattn = True
103
+ encoder_hidden_states = hidden_states
104
+ elif attn.norm_cross:
105
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
106
+
107
+ key = attn.to_k(encoder_hidden_states, *args)
108
+ value = attn.to_v(encoder_hidden_states, *args)
109
+
110
+ if is_selfattn:
111
+ cached_key = key.clone()
112
+ cached_value = value.clone()
113
+
114
+ # Avoid data-dependent Python branching where possible: keep frame_id/interval as tensors so the dynamic graph can be replaced with a static one (e.g. for ONNX export).
115
+ if torch.equal(self.frame_id, self.zero_tensor):
116
+ # ONNX
117
+ self.cached_key = cached_key
118
+ self.cached_value = cached_value
119
+
120
+ key = torch.cat([key, self.cached_key], dim=1)
121
+ value = torch.cat([value, self.cached_value], dim=1)
122
+
123
+ inner_dim = key.shape[-1]
124
+ head_dim = inner_dim // attn.heads
125
+
126
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
127
+
128
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
129
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
130
+
131
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
132
+ # TODO: add support for attn.scale when we move to Torch 2.1
133
+ hidden_states = F.scaled_dot_product_attention(
134
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
135
+ )
136
+
137
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
138
+ hidden_states = hidden_states.to(query.dtype)
139
+
140
+ # linear proj
141
+ hidden_states = attn.to_out[0](hidden_states, *args)
142
+ # dropout
143
+ hidden_states = attn.to_out[1](hidden_states)
144
+
145
+ if input_ndim == 4:
146
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
147
+
148
+ if attn.residual_connection:
149
+ hidden_states = hidden_states + residual
150
+
151
+ hidden_states = hidden_states / attn.rescale_output_factor
152
+
153
+ if is_selfattn:
154
+ cached_output = hidden_states.clone()
155
+
156
+ if torch.equal(self.frame_id, self.zero_tensor):
157
+ self.cached_output = cached_output
158
+
159
+ if self.use_feature_injection and ("up_blocks.0" in self.name or "up_blocks.1" in self.name or 'mid_block' in self.name):
160
+ nn_hidden_states = get_nn_feats(hidden_states, self.cached_output, threshold=self.threshold)
161
+ hidden_states = hidden_states * (1-self.fi_strength) + self.fi_strength * nn_hidden_states
162
+
163
+ mod_result = torch.remainder(self.frame_id, self.interval)
164
+ if torch.equal(mod_result, self.zero_tensor) and is_selfattn:
165
+ self._tome_step_kvout(cached_key, cached_value, cached_output)
166
+
167
+ self.frame_id = self.frame_id + 1
168
+
169
+ return hidden_states
170
+
171
+
172
+
173
+ class CachedSTXFormersAttnProcessor:
174
+ r"""
175
+ Processor for memory-efficient attention using xFormers that caches self-attention keys, values, and outputs across frames (optionally compacted via token merging) for temporally consistent video-to-video generation.
176
+
177
+ Args:
178
+ attention_op (`Callable`, *optional*, defaults to `None`):
179
+ The base
180
+ [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to
181
+ use as the attention operator. It is recommended to set to `None`, and allow xFormers to choose the best
182
+ operator.
183
+ """
184
+
185
+ def __init__(self, attention_op: Optional[Callable] = None, name=None,
186
+ use_feature_injection=False, feature_injection_strength=0.8, feature_similarity_threshold=0.98,
187
+ interval=4, max_frames=4, use_tome_cache=False, tome_metric="keys", use_grid=False, tome_ratio=0.5):
188
+ self.attention_op = attention_op
189
+ self.name = name
190
+ self.use_feature_injection = use_feature_injection
191
+ self.fi_strength = feature_injection_strength
192
+ self.threshold = feature_similarity_threshold
193
+ self.frame_id = 0
194
+ self.interval = interval
195
+ self.cached_key = deque(maxlen=max_frames)
196
+ self.cached_value = deque(maxlen=max_frames)
197
+ self.cached_output = deque(maxlen=max_frames)
198
+ self.use_tome_cache = use_tome_cache
199
+ self.tome_metric = tome_metric
200
+ self.use_grid = use_grid
201
+ self.tome_ratio = tome_ratio
202
+
203
+ def _tome_step_kvout(self, keys, values, outputs):
204
+ if len(self.cached_value) == 1:
205
+ keys = torch.cat(list(self.cached_key) + [keys], dim=1)
206
+ values = torch.cat(list(self.cached_value) + [values], dim=1)
207
+ outputs = torch.cat(list(self.cached_output) + [outputs], dim=1)
208
+ m_kv_out, _, _ = random_bipartite_soft_matching(metric=eval(self.tome_metric), use_grid=self.use_grid, ratio=self.tome_ratio)
209
+ compact_keys, compact_values, compact_outputs = m_kv_out(keys, values, outputs)
210
+ self.cached_key.append(compact_keys)
211
+ self.cached_value.append(compact_values)
212
+ self.cached_output.append(compact_outputs)
213
+ else:
214
+ self.cached_key.append(keys)
215
+ self.cached_value.append(values)
216
+ self.cached_output.append(outputs)
217
+
218
+ def _tome_step_kv(self, keys, values):
219
+ if len(self.cached_value) == 1:
220
+ keys = torch.cat(list(self.cached_key) + [keys], dim=1)
221
+ values = torch.cat(list(self.cached_value) + [values], dim=1)
222
+ _, m_kv, _ = random_bipartite_soft_matching(metric=eval(self.tome_metric), use_grid=self.use_grid, ratio=self.tome_ratio)
223
+ compact_keys, compact_values = m_kv(keys, values)
224
+ self.cached_key.append(compact_keys)
225
+ self.cached_value.append(compact_values)
226
+ else:
227
+ self.cached_key.append(keys)
228
+ self.cached_value.append(values)
229
+
230
+ def _tome_step_out(self, outputs):
231
+ if len(self.cached_value) == 1:
232
+ outputs = torch.cat(list(self.cached_output) + [outputs], dim=1)
233
+ _, _, m_out = random_bipartite_soft_matching(metric=outputs, use_grid=self.use_grid, ratio=self.tome_ratio)
234
+ compact_outputs = m_out(outputs)
235
+ self.cached_output.append(compact_outputs)
236
+ else:
237
+ self.cached_output.append(outputs)
238
+
239
+ def __call__(
240
+ self,
241
+ attn: Attention,
242
+ hidden_states: torch.FloatTensor,
243
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
244
+ attention_mask: Optional[torch.FloatTensor] = None,
245
+ temb: Optional[torch.FloatTensor] = None,
246
+ scale: float = 1.0,
247
+ ) -> torch.FloatTensor:
248
+ residual = hidden_states
249
+
250
+ args = () if USE_PEFT_BACKEND else (scale,)
251
+
252
+ if attn.spatial_norm is not None:
253
+ hidden_states = attn.spatial_norm(hidden_states, temb)
254
+
255
+ input_ndim = hidden_states.ndim
256
+
257
+ if input_ndim == 4:
258
+ batch_size, channel, height, width = hidden_states.shape
259
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
260
+
261
+ batch_size, key_tokens, _ = (
262
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
263
+ )
264
+
265
+ attention_mask = attn.prepare_attention_mask(attention_mask, key_tokens, batch_size)
266
+ if attention_mask is not None:
267
+ # expand our mask's singleton query_tokens dimension:
268
+ # [batch*heads, 1, key_tokens] ->
269
+ # [batch*heads, query_tokens, key_tokens]
270
+ # so that it can be added as a bias onto the attention scores that xformers computes:
271
+ # [batch*heads, query_tokens, key_tokens]
272
+ # we do this explicitly because xformers doesn't broadcast the singleton dimension for us.
273
+ _, query_tokens, _ = hidden_states.shape
274
+ attention_mask = attention_mask.expand(-1, query_tokens, -1)
275
+
276
+ if attn.group_norm is not None:
277
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
278
+
279
+ query = attn.to_q(hidden_states, *args)
280
+
281
+ is_selfattn = False
282
+ if encoder_hidden_states is None:
283
+ is_selfattn = True
284
+ encoder_hidden_states = hidden_states
285
+ elif attn.norm_cross:
286
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
287
+
288
+ key = attn.to_k(encoder_hidden_states, *args)
289
+ value = attn.to_v(encoder_hidden_states, *args)
290
+
291
+ if is_selfattn:
292
+ cached_key = key.clone()
293
+ cached_value = value.clone()
294
+
295
+ if len(self.cached_key) > 0:
296
+ key = torch.cat([key] + list(self.cached_key), dim=1)
297
+ value = torch.cat([value] + list(self.cached_value), dim=1)
298
+
299
+ ## Code for storing and visualizing features
300
+ # if self.frame_id % self.interval == 0:
301
+ # # if "down_blocks.0" in self.name or "up_blocks.3" in self.name:
302
+ # # feats = {
303
+ # # "hidden_states": hidden_states.clone().cpu(),
304
+ # # "query": query.clone().cpu(),
305
+ # # "key": cached_key.cpu(),
306
+ # # "value": cached_value.cpu(),
307
+ # # }
308
+ # # torch.save(feats, f'./outputs/self_attn_feats_SD/{self.name}.frame{self.frame_id}.pt')
309
+ # if self.use_tome_cache:
310
+ # cached_key, cached_value = self._tome_step(cached_key, cached_value)
311
+
312
+ query = attn.head_to_batch_dim(query).contiguous()
313
+ key = attn.head_to_batch_dim(key).contiguous()
314
+ value = attn.head_to_batch_dim(value).contiguous()
315
+
316
+ hidden_states = xformers.ops.memory_efficient_attention(
317
+ query, key, value, attn_bias=attention_mask, op=self.attention_op, scale=attn.scale
318
+ )
319
+ hidden_states = hidden_states.to(query.dtype)
320
+ hidden_states = attn.batch_to_head_dim(hidden_states)
321
+
322
+ # linear proj
323
+ hidden_states = attn.to_out[0](hidden_states, *args)
324
+ # dropout
325
+ hidden_states = attn.to_out[1](hidden_states)
326
+
327
+ if input_ndim == 4:
328
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
329
+
330
+ if attn.residual_connection:
331
+ hidden_states = hidden_states + residual
332
+
333
+ hidden_states = hidden_states / attn.rescale_output_factor
334
+ if is_selfattn:
335
+ cached_output = hidden_states.clone()
336
+ if self.use_feature_injection and ("up_blocks.0" in self.name or "up_blocks.1" in self.name or 'mid_block' in self.name):
337
+ if len(self.cached_output) > 0:
338
+ nn_hidden_states = get_nn_feats(hidden_states, self.cached_output, threshold=self.threshold)
339
+ hidden_states = hidden_states * (1-self.fi_strength) + self.fi_strength * nn_hidden_states
340
+
341
+ if self.frame_id % self.interval == 0:
342
+ if is_selfattn:
343
+ if self.use_tome_cache:
344
+ self._tome_step_kvout(cached_key, cached_value, cached_output)
345
+ else:
346
+ self.cached_key.append(cached_key)
347
+ self.cached_value.append(cached_value)
348
+ self.cached_output.append(cached_output)
349
+ self.frame_id += 1
350
+
351
+ return hidden_states
352
+
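The two processors above are drop-in replacements for the stock diffusers attention processors. A minimal usage sketch (illustrative, not included in this commit) of how they might be attached to a UNet follows; it assumes this file is importable as streamv2v.models.attention_processor, that xformers is installed for the xFormers variant, and that the checkpoint id is only an example.

import torch
from diffusers import UNet2DConditionModel

from streamv2v.models.attention_processor import CachedSTXFormersAttnProcessor

# Load a UNet (illustrative checkpoint) and move it to the GPU.
unet = UNet2DConditionModel.from_pretrained(
    "runwayml/stable-diffusion-v1-5", subfolder="unet", torch_dtype=torch.float16
).to("cuda")

# One processor instance per attention module, keyed by module name, so every
# layer keeps its own key/value/output cache across frames.
attn_processors = {
    name: CachedSTXFormersAttnProcessor(
        name=name,
        use_feature_injection=True,
        interval=4,
        max_frames=4,
        use_tome_cache=True,
    )
    for name in unet.attn_processors.keys()
}
unet.set_attn_processor(attn_processors)
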
streamv2v/models/utils.py ADDED
@@ -0,0 +1,127 @@
1
+ from collections import deque
2
+ from typing import Tuple, Callable
3
+
4
+ from einops import rearrange
5
+ import torch
6
+ import torch.nn.functional as F
7
+
8
+ def get_nn_feats(x, y, threshold=0.9):
9
+
10
+ if type(x) is deque:
11
+ x = torch.cat(list(x), dim=1)
12
+ if type(y) is deque:
13
+ y = torch.cat(list(y), dim=1)
14
+
15
+ x_norm = F.normalize(x, p=2, dim=-1)
16
+ y_norm = F.normalize(y, p=2, dim=-1)
17
+
18
+ cosine_similarity = torch.matmul(x_norm, y_norm.transpose(1, 2))
19
+
20
+ max_cosine_values, nearest_neighbors_indices = torch.max(cosine_similarity, dim=-1)
21
+ mask = max_cosine_values < threshold
22
+ # print('mask ratio', torch.sum(mask)/x.shape[0]/x.shape[1])
23
+ indices_expanded = nearest_neighbors_indices.unsqueeze(-1).expand(-1, -1, x_norm.size(-1))
24
+ nearest_neighbor_tensor = torch.gather(y, 1, indices_expanded)
25
+ selected_tensor = torch.where(mask.unsqueeze(-1), x, nearest_neighbor_tensor)
26
+
27
+ return selected_tensor
28
+
29
+ def get_nn_latent(x, y, threshold=0.9):
30
+
31
+ assert len(x.shape) == 4
32
+ _, c, h, w = x.shape
33
+ x_ = rearrange(x, 'n c h w -> n (h w) c')
34
+ y_ = []
35
+ for i in range(len(y)):
36
+ y_.append(rearrange(y[i], 'n c h w -> n (h w) c'))
37
+ y_ = torch.cat(y_, dim=1)
38
+ x_norm = F.normalize(x_, p=2, dim=-1)
39
+ y_norm = F.normalize(y_, p=2, dim=-1)
40
+
41
+ cosine_similarity = torch.matmul(x_norm, y_norm.transpose(1, 2))
42
+
43
+ max_cosine_values, nearest_neighbors_indices = torch.max(cosine_similarity, dim=-1)
44
+ mask = max_cosine_values < threshold
45
+ indices_expanded = nearest_neighbors_indices.unsqueeze(-1).expand(-1, -1, x_norm.size(-1))
46
+ nearest_neighbor_tensor = torch.gather(y_, 1, indices_expanded)
47
+
48
+ # Use values from x where the cosine similarity is below the threshold
49
+ x_expanded = x_.expand_as(nearest_neighbor_tensor)
50
+ selected_tensor = torch.where(mask.unsqueeze(-1), x_expanded, nearest_neighbor_tensor)
51
+
52
+ selected_tensor = rearrange(selected_tensor, 'n (h w) c -> n c h w', h=h, w=w, c=c)
53
+
54
+ return selected_tensor
55
+
56
+
57
+ def random_bipartite_soft_matching(
58
+ metric: torch.Tensor, use_grid: bool = False, ratio: float = 0.5
59
+ ) -> Tuple[Callable, Callable, Callable]:
60
+ """
61
+ Applies ToMe with the two sets as (r chosen randomly, the rest).
62
+ Input size is [batch, tokens, channels].
63
+
64
+ This reduces the number of tokens by a factor of ratio: the merged set keeps (1 - ratio) * N tokens.
65
+ """
66
+
67
+ with torch.no_grad():
68
+ B, N, _ = metric.shape
69
+ if use_grid:
70
+ assert ratio == 0.5
71
+ sample = torch.randint(2, size=(B, N//2, 1), device=metric.device)
72
+ sample_alternate = 1 - sample
73
+ grid = torch.arange(0, N, 2).view(1, N//2, 1).to(device=metric.device)
74
+ grid = grid.repeat(4, 1, 1)
75
+ rand_idx = torch.cat([sample + grid, sample_alternate + grid], dim = 1)
76
+ else:
77
+ rand_idx = torch.rand(B, N, 1, device=metric.device).argsort(dim=1)
78
+ r = int(ratio * N)
79
+ a_idx = rand_idx[:, :r, :]
80
+ b_idx = rand_idx[:, r:, :]
81
+ def split(x):
82
+ C = x.shape[-1]
83
+ a = x.gather(dim=1, index=a_idx.expand(B, r, C))
84
+ b = x.gather(dim=1, index=b_idx.expand(B, N - r, C))
85
+ return a, b
86
+
87
+ metric = metric / metric.norm(dim=-1, keepdim=True)
88
+ a, b = split(metric)
89
+ scores = a @ b.transpose(-1, -2)
90
+
91
+ _, dst_idx = scores.max(dim=-1)
92
+ dst_idx = dst_idx[..., None]
93
+
94
+ def merge_kv_out(keys: torch.Tensor, values: torch.Tensor, outputs: torch.Tensor, mode="mean") -> torch.Tensor:
95
+ src_keys, dst_keys = split(keys)
96
+ C_keys = src_keys.shape[-1]
97
+ dst_keys = dst_keys.scatter_reduce(-2, dst_idx.expand(B, r, C_keys), src_keys, reduce=mode)
98
+
99
+ src_values, dst_values = split(values)
100
+ C_values = src_values.shape[-1]
101
+ dst_values = dst_values.scatter_reduce(-2, dst_idx.expand(B, r, C_values), src_values, reduce=mode)
102
+
103
+ src_outputs, dst_outputs = split(outputs)
104
+ C_outputs = src_outputs.shape[-1]
105
+ dst_outputs = dst_outputs.scatter_reduce(-2, dst_idx.expand(B, r, C_outputs), src_outputs, reduce=mode)
106
+
107
+ return dst_keys, dst_values, dst_outputs
108
+
109
+ def merge_kv(keys: torch.Tensor, values: torch.Tensor, mode="mean") -> torch.Tensor:
110
+ src_keys, dst_keys = split(keys)
111
+ C_keys = src_keys.shape[-1]
112
+ dst_keys = dst_keys.scatter_reduce(-2, dst_idx.expand(B, r, C_keys), src_keys, reduce=mode)
113
+
114
+ src_values, dst_values = split(values)
115
+ C_values = src_values.shape[-1]
116
+ dst_values = dst_values.scatter_reduce(-2, dst_idx.expand(B, r, C_values), src_values, reduce=mode)
117
+
118
+ return dst_keys, dst_values
119
+
120
+ def merge_out(outputs: torch.Tensor, mode="mean") -> torch.Tensor:
121
+ src_outputs, dst_outputs = split(outputs)
122
+ C_outputs = src_outputs.shape[-1]
123
+ dst_outputs = dst_outputs.scatter_reduce(-2, dst_idx.expand(B, r, C_outputs), src_outputs, reduce=mode)
124
+
125
+ return dst_outputs
126
+
127
+ return merge_kv_out, merge_kv, merge_out
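A short sketch exercising random_bipartite_soft_matching on dummy tensors; the shapes are arbitrary and the package is assumed to be importable as streamv2v with its dependencies installed.

import torch

from streamv2v.models.utils import random_bipartite_soft_matching

B, N, C = 1, 4096, 64  # batch, tokens, channels (illustrative)
keys = torch.randn(B, N, C)
values = torch.randn(B, N, C)
outputs = torch.randn(B, N, C)

# Build the merge functions from the keys, then compact all three tensors together.
merge_kv_out, merge_kv, merge_out = random_bipartite_soft_matching(
    metric=keys, use_grid=False, ratio=0.5
)
compact_keys, compact_values, compact_outputs = merge_kv_out(keys, values, outputs)
print(compact_keys.shape)  # torch.Size([1, 2048, 64]) for ratio=0.5
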
streamv2v/pip_utils.py ADDED
@@ -0,0 +1,52 @@
1
+ import importlib
2
+ import importlib.util
3
+ import os
4
+ import subprocess
5
+ import sys
6
+ from typing import Dict, Optional
7
+
8
+ from packaging.version import Version
9
+
10
+
11
+ python = sys.executable
12
+ index_url = os.environ.get("INDEX_URL", "")
13
+
14
+
15
+ def version(package: str) -> Optional[Version]:
16
+ try:
17
+ return Version(importlib.import_module(package).__version__)
18
+ except ModuleNotFoundError:
19
+ return None
20
+
21
+
22
+ def is_installed(package: str) -> bool:
23
+ try:
24
+ spec = importlib.util.find_spec(package)
25
+ except ModuleNotFoundError:
26
+ return False
27
+
28
+ return spec is not None
29
+
30
+
31
+ def run_python(command: str, env: Dict[str, str] = None) -> str:
32
+ run_kwargs = {
33
+ "args": f"\"{python}\" {command}",
34
+ "shell": True,
35
+ "env": os.environ if env is None else env,
36
+ "encoding": "utf8",
37
+ "errors": "ignore",
38
+ }
39
+
40
+ print(run_kwargs["args"])
41
+
42
+ result = subprocess.run(**run_kwargs)
43
+
44
+ if result.returncode != 0:
45
+ print(f"Error running command: {command}", file=sys.stderr)
46
+ raise RuntimeError(f"Error running command: {command}")
47
+
48
+ return result.stdout or ""
49
+
50
+
51
+ def run_pip(command: str, env: Dict[str, str] = None) -> str:
52
+ return run_python(f"-m pip {command}", env)
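A brief sketch of the helpers above, assuming the package is importable as streamv2v; the polygraphy pin mirrors the TensorRT installer later in this commit and is only an example.

from streamv2v.pip_utils import is_installed, run_pip, version

# Install a dependency through the current interpreter only if it is missing.
if not is_installed("polygraphy"):
    run_pip("install polygraphy==0.47.1 --extra-index-url https://pypi.ngc.nvidia.com")

print(version("polygraphy"))  # packaging Version object, or None if the module is absent
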
streamv2v/pipeline.py ADDED
@@ -0,0 +1,495 @@
1
+ import glob
2
+ import os
3
+ import time
4
+ from typing import List, Optional, Union, Any, Dict, Tuple, Literal
5
+ from collections import deque
6
+
7
+ import numpy as np
8
+ import PIL.Image
9
+ import torch
10
+ import torch.nn.functional as F
11
+ from torchvision.models.optical_flow import raft_small
12
+
13
+ from diffusers import LCMScheduler, StableDiffusionPipeline
14
+ from diffusers.image_processor import VaeImageProcessor
15
+ from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img import (
16
+ retrieve_latents,
17
+ )
18
+ from .image_utils import postprocess_image, forward_backward_consistency_check
19
+ from .models.utils import get_nn_latent
20
+ from .image_filter import SimilarImageFilter
21
+
22
+
23
+ class StreamV2V:
24
+ def __init__(
25
+ self,
26
+ pipe: StableDiffusionPipeline,
27
+ t_index_list: List[int],
28
+ torch_dtype: torch.dtype = torch.float16,
29
+ width: int = 512,
30
+ height: int = 512,
31
+ do_add_noise: bool = True,
32
+ use_denoising_batch: bool = True,
33
+ frame_buffer_size: int = 1,
34
+ cfg_type: Literal["none", "full", "self", "initialize"] = "self",
35
+ ) -> None:
36
+ self.device = pipe.device
37
+ self.dtype = torch_dtype
38
+ self.generator = None
39
+
40
+ self.height = height
41
+ self.width = width
42
+
43
+ self.latent_height = int(height // pipe.vae_scale_factor)
44
+ self.latent_width = int(width // pipe.vae_scale_factor)
45
+
46
+ self.frame_bff_size = frame_buffer_size
47
+ self.denoising_steps_num = len(t_index_list)
48
+
49
+ self.cfg_type = cfg_type
50
+
51
+ if use_denoising_batch:
52
+ self.batch_size = self.denoising_steps_num * frame_buffer_size
53
+ if self.cfg_type == "initialize":
54
+ self.trt_unet_batch_size = (
55
+ self.denoising_steps_num + 1
56
+ ) * self.frame_bff_size
57
+ elif self.cfg_type == "full":
58
+ self.trt_unet_batch_size = (
59
+ 2 * self.denoising_steps_num * self.frame_bff_size
60
+ )
61
+ else:
62
+ self.trt_unet_batch_size = self.denoising_steps_num * frame_buffer_size
63
+ else:
64
+ self.trt_unet_batch_size = self.frame_bff_size
65
+ self.batch_size = frame_buffer_size
66
+
67
+ self.t_list = t_index_list
68
+
69
+ self.do_add_noise = do_add_noise
70
+ self.use_denoising_batch = use_denoising_batch
71
+
72
+ self.similar_image_filter = False
73
+ self.similar_filter = SimilarImageFilter()
74
+ self.prev_image_tensor = None
75
+ self.prev_x_t_latent = None
76
+ self.prev_image_result = None
77
+
78
+ self.pipe = pipe
79
+ self.image_processor = VaeImageProcessor(pipe.vae_scale_factor)
80
+
81
+ self.scheduler = LCMScheduler.from_config(self.pipe.scheduler.config)
82
+ self.text_encoder = pipe.text_encoder
83
+ self.unet = pipe.unet
84
+ self.vae = pipe.vae
85
+
86
+ self.flow_model = raft_small(pretrained=True, progress=False).to(device=pipe.device).eval()
87
+
88
+ self.cached_x_t_latent = deque(maxlen=4)
89
+
90
+ self.inference_time_ema = 0
91
+
92
+ def load_lcm_lora(
93
+ self,
94
+ pretrained_model_name_or_path_or_dict: Union[
95
+ str, Dict[str, torch.Tensor]
96
+ ] = "latent-consistency/lcm-lora-sdv1-5",
97
+ adapter_name: Optional[Any] = 'lcm',
98
+ **kwargs,
99
+ ) -> None:
100
+ self.pipe.load_lora_weights(
101
+ pretrained_model_name_or_path_or_dict, adapter_name, **kwargs
102
+ )
103
+
104
+ def load_lora(
105
+ self,
106
+ pretrained_lora_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
107
+ adapter_name: Optional[Any] = None,
108
+ **kwargs,
109
+ ) -> None:
110
+ self.pipe.load_lora_weights(
111
+ pretrained_lora_model_name_or_path_or_dict, adapter_name, **kwargs
112
+ )
113
+
114
+ def fuse_lora(
115
+ self,
116
+ fuse_unet: bool = True,
117
+ fuse_text_encoder: bool = True,
118
+ lora_scale: float = 1.0,
119
+ safe_fusing: bool = False,
120
+ ) -> None:
121
+ self.pipe.fuse_lora(
122
+ fuse_unet=fuse_unet,
123
+ fuse_text_encoder=fuse_text_encoder,
124
+ lora_scale=lora_scale,
125
+ safe_fusing=safe_fusing,
126
+ )
127
+
128
+ def enable_similar_image_filter(self, threshold: float = 0.98, max_skip_frame: float = 10) -> None:
129
+ self.similar_image_filter = True
130
+ self.similar_filter.set_threshold(threshold)
131
+ self.similar_filter.set_max_skip_frame(max_skip_frame)
132
+
133
+ def disable_similar_image_filter(self) -> None:
134
+ self.similar_image_filter = False
135
+
136
+ @torch.no_grad()
137
+ def prepare(
138
+ self,
139
+ prompt: str,
140
+ negative_prompt: str = "",
141
+ num_inference_steps: int = 50,
142
+ guidance_scale: float = 1.2,
143
+ delta: float = 1.0,
144
+ generator: Optional[torch.Generator] = torch.Generator(),
145
+ seed: int = 2,
146
+ ) -> None:
147
+ self.generator = generator
148
+ self.generator.manual_seed(seed)
149
+ # initialize x_t_latent (it can be any random tensor)
150
+ if self.denoising_steps_num > 1:
151
+ self.x_t_latent_buffer = torch.zeros(
152
+ (
153
+ (self.denoising_steps_num - 1) * self.frame_bff_size,
154
+ 4,
155
+ self.latent_height,
156
+ self.latent_width,
157
+ ),
158
+ dtype=self.dtype,
159
+ device=self.device,
160
+ )
161
+ else:
162
+ self.x_t_latent_buffer = None
163
+
164
+ if self.cfg_type == "none":
165
+ self.guidance_scale = 1.0
166
+ else:
167
+ self.guidance_scale = guidance_scale
168
+ self.delta = delta
169
+
170
+ do_classifier_free_guidance = False
171
+ if self.guidance_scale > 1.0:
172
+ do_classifier_free_guidance = True
173
+
174
+ encoder_output = self.pipe.encode_prompt(
175
+ prompt=prompt,
176
+ device=self.device,
177
+ num_images_per_prompt=1,
178
+ do_classifier_free_guidance=True,
179
+ negative_prompt=negative_prompt,
180
+ )
181
+
182
+ self.prompt_embeds = encoder_output[0].repeat(self.batch_size, 1, 1)
183
+ self.null_prompt_embeds = encoder_output[1]
184
+
185
+ if self.use_denoising_batch and self.cfg_type == "full":
186
+ uncond_prompt_embeds = encoder_output[1].repeat(self.batch_size, 1, 1)
187
+ elif self.cfg_type == "initialize":
188
+ uncond_prompt_embeds = encoder_output[1].repeat(self.frame_bff_size, 1, 1)
189
+
190
+ if self.guidance_scale > 1.0 and (
191
+ self.cfg_type == "initialize" or self.cfg_type == "full"
192
+ ):
193
+ self.prompt_embeds = torch.cat(
194
+ [uncond_prompt_embeds, self.prompt_embeds], dim=0
195
+ )
196
+
197
+ self.scheduler.set_timesteps(num_inference_steps, self.device)
198
+ self.timesteps = self.scheduler.timesteps.to(self.device)
199
+
200
+ # Build the sub-timestep list from the indices in t_list and the corresponding values in self.timesteps
201
+ self.sub_timesteps = []
202
+ for t in self.t_list:
203
+ self.sub_timesteps.append(self.timesteps[t])
204
+
205
+ sub_timesteps_tensor = torch.tensor(
206
+ self.sub_timesteps, dtype=torch.long, device=self.device
207
+ )
208
+ self.sub_timesteps_tensor = torch.repeat_interleave(
209
+ sub_timesteps_tensor,
210
+ repeats=self.frame_bff_size if self.use_denoising_batch else 1,
211
+ dim=0,
212
+ )
213
+
214
+ self.init_noise = torch.randn(
215
+ (self.batch_size, 4, self.latent_height, self.latent_width),
216
+ generator=generator,
217
+ ).to(device=self.device, dtype=self.dtype)
218
+
219
+ self.randn_noise = self.init_noise[:1].clone()
220
+ self.warp_noise = self.init_noise[:1].clone()
221
+
222
+ self.stock_noise = torch.zeros_like(self.init_noise)
223
+
224
+ c_skip_list = []
225
+ c_out_list = []
226
+ for timestep in self.sub_timesteps:
227
+ c_skip, c_out = self.scheduler.get_scalings_for_boundary_condition_discrete(
228
+ timestep
229
+ )
230
+ c_skip_list.append(c_skip)
231
+ c_out_list.append(c_out)
232
+
233
+ self.c_skip = (
234
+ torch.stack(c_skip_list)
235
+ .view(len(self.t_list), 1, 1, 1)
236
+ .to(dtype=self.dtype, device=self.device)
237
+ )
238
+ self.c_out = (
239
+ torch.stack(c_out_list)
240
+ .view(len(self.t_list), 1, 1, 1)
241
+ .to(dtype=self.dtype, device=self.device)
242
+ )
243
+
244
+ alpha_prod_t_sqrt_list = []
245
+ beta_prod_t_sqrt_list = []
246
+ for timestep in self.sub_timesteps:
247
+ alpha_prod_t_sqrt = self.scheduler.alphas_cumprod[timestep].sqrt()
248
+ beta_prod_t_sqrt = (1 - self.scheduler.alphas_cumprod[timestep]).sqrt()
249
+ alpha_prod_t_sqrt_list.append(alpha_prod_t_sqrt)
250
+ beta_prod_t_sqrt_list.append(beta_prod_t_sqrt)
251
+ alpha_prod_t_sqrt = (
252
+ torch.stack(alpha_prod_t_sqrt_list)
253
+ .view(len(self.t_list), 1, 1, 1)
254
+ .to(dtype=self.dtype, device=self.device)
255
+ )
256
+ beta_prod_t_sqrt = (
257
+ torch.stack(beta_prod_t_sqrt_list)
258
+ .view(len(self.t_list), 1, 1, 1)
259
+ .to(dtype=self.dtype, device=self.device)
260
+ )
261
+ self.alpha_prod_t_sqrt = torch.repeat_interleave(
262
+ alpha_prod_t_sqrt,
263
+ repeats=self.frame_bff_size if self.use_denoising_batch else 1,
264
+ dim=0,
265
+ )
266
+ self.beta_prod_t_sqrt = torch.repeat_interleave(
267
+ beta_prod_t_sqrt,
268
+ repeats=self.frame_bff_size if self.use_denoising_batch else 1,
269
+ dim=0,
270
+ )
271
+
272
+ @torch.no_grad()
273
+ def update_prompt(self, prompt: str) -> None:
274
+ encoder_output = self.pipe.encode_prompt(
275
+ prompt=prompt,
276
+ device=self.device,
277
+ num_images_per_prompt=1,
278
+ do_classifier_free_guidance=False,
279
+ )
280
+ self.prompt_embeds = encoder_output[0].repeat(self.batch_size, 1, 1)
281
+
282
+ def add_noise(
283
+ self,
284
+ original_samples: torch.Tensor,
285
+ noise: torch.Tensor,
286
+ t_index: int,
287
+ ) -> torch.Tensor:
288
+ noisy_samples = (
289
+ self.alpha_prod_t_sqrt[t_index] * original_samples
290
+ + self.beta_prod_t_sqrt[t_index] * noise
291
+ )
292
+ return noisy_samples
293
+
294
+ def scheduler_step_batch(
295
+ self,
296
+ model_pred_batch: torch.Tensor,
297
+ x_t_latent_batch: torch.Tensor,
298
+ idx: Optional[int] = None,
299
+ ) -> torch.Tensor:
300
+ # TODO: use t_list to select beta_prod_t_sqrt
301
+ if idx is None:
302
+ F_theta = (
303
+ x_t_latent_batch - self.beta_prod_t_sqrt * model_pred_batch
304
+ ) / self.alpha_prod_t_sqrt
305
+ denoised_batch = self.c_out * F_theta + self.c_skip * x_t_latent_batch
306
+ else:
307
+ F_theta = (
308
+ x_t_latent_batch - self.beta_prod_t_sqrt[idx] * model_pred_batch
309
+ ) / self.alpha_prod_t_sqrt[idx]
310
+ denoised_batch = (
311
+ self.c_out[idx] * F_theta + self.c_skip[idx] * x_t_latent_batch
312
+ )
313
+
314
+ return denoised_batch
315
+
316
+ def unet_step(
317
+ self,
318
+ x_t_latent: torch.Tensor,
319
+ t_list: Union[torch.Tensor, list[int]],
320
+ idx: Optional[int] = None,
321
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
322
+ if self.guidance_scale > 1.0 and (self.cfg_type == "initialize"):
323
+ x_t_latent_plus_uc = torch.concat([x_t_latent[0:1], x_t_latent], dim=0)
324
+ t_list = torch.concat([t_list[0:1], t_list], dim=0)
325
+ elif self.guidance_scale > 1.0 and (self.cfg_type == "full"):
326
+ x_t_latent_plus_uc = torch.concat([x_t_latent, x_t_latent], dim=0)
327
+ t_list = torch.concat([t_list, t_list], dim=0)
328
+ else:
329
+ x_t_latent_plus_uc = x_t_latent
330
+
331
+ model_pred = self.unet(
332
+ x_t_latent_plus_uc,
333
+ t_list,
334
+ encoder_hidden_states=self.prompt_embeds,
335
+ return_dict=False,
336
+ )[0]
337
+
338
+ if self.guidance_scale > 1.0 and (self.cfg_type == "initialize"):
339
+ noise_pred_text = model_pred[1:]
340
+ self.stock_noise = torch.concat(
341
+ [model_pred[0:1], self.stock_noise[1:]], dim=0
342
+ ) # commenting this out gives self-output CFG
343
+ elif self.guidance_scale > 1.0 and (self.cfg_type == "full"):
344
+ noise_pred_uncond, noise_pred_text = model_pred.chunk(2)
345
+ else:
346
+ noise_pred_text = model_pred
347
+ if self.guidance_scale > 1.0 and (
348
+ self.cfg_type == "self" or self.cfg_type == "initialize"
349
+ ):
350
+ noise_pred_uncond = self.stock_noise * self.delta
351
+ if self.guidance_scale > 1.0 and self.cfg_type != "none":
352
+ model_pred = noise_pred_uncond + self.guidance_scale * (
353
+ noise_pred_text - noise_pred_uncond
354
+ )
355
+ else:
356
+ model_pred = noise_pred_text
357
+
358
+ # compute the previous noisy sample x_t -> x_t-1
359
+ if self.use_denoising_batch:
360
+ denoised_batch = self.scheduler_step_batch(model_pred, x_t_latent, idx)
361
+ if self.cfg_type == "self" or self.cfg_type == "initialize":
362
+ scaled_noise = self.beta_prod_t_sqrt * self.stock_noise
363
+ delta_x = self.scheduler_step_batch(model_pred, scaled_noise, idx)
364
+ alpha_next = torch.concat(
365
+ [
366
+ self.alpha_prod_t_sqrt[1:],
367
+ torch.ones_like(self.alpha_prod_t_sqrt[0:1]),
368
+ ],
369
+ dim=0,
370
+ )
371
+ delta_x = alpha_next * delta_x
372
+ beta_next = torch.concat(
373
+ [
374
+ self.beta_prod_t_sqrt[1:],
375
+ torch.ones_like(self.beta_prod_t_sqrt[0:1]),
376
+ ],
377
+ dim=0,
378
+ )
379
+ delta_x = delta_x / beta_next
380
+ init_noise = torch.concat(
381
+ [self.init_noise[1:], self.init_noise[0:1]], dim=0
382
+ )
383
+ self.stock_noise = init_noise + delta_x
384
+
385
+ else:
386
+ # denoised_batch = self.scheduler.step(model_pred, t_list[0], x_t_latent).denoised
387
+ denoised_batch = self.scheduler_step_batch(model_pred, x_t_latent, idx)
388
+
389
+ return denoised_batch, model_pred
390
+
391
+
392
+ def norm_noise(self, noise):
393
+ # Compute mean and std of the noise tensor
394
+ mean = noise.mean()
395
+ std = noise.std()
396
+
397
+ # Normalize the noise to have mean=0 and std=1
398
+ normalized_noise = (noise - mean) / std
399
+ return normalized_noise
400
+
401
+ def encode_image(self, image_tensors: torch.Tensor) -> torch.Tensor:
402
+ image_tensors = image_tensors.to(
403
+ device=self.device,
404
+ dtype=self.vae.dtype,
405
+ )
406
+ img_latent = retrieve_latents(self.vae.encode(image_tensors), self.generator)
407
+ img_latent = img_latent * self.vae.config.scaling_factor
408
+ x_t_latent = self.add_noise(img_latent, self.init_noise[0], 0)
409
+ return x_t_latent
410
+
411
+ def decode_image(self, x_0_pred_out: torch.Tensor) -> torch.Tensor:
412
+ output_latent = self.vae.decode(
413
+ x_0_pred_out / self.vae.config.scaling_factor, return_dict=False
414
+ )[0]
415
+ return output_latent
416
+
417
+ def predict_x0_batch(self, x_t_latent: torch.Tensor) -> torch.Tensor:
418
+ prev_latent_batch = self.x_t_latent_buffer
419
+ if self.use_denoising_batch:
420
+ t_list = self.sub_timesteps_tensor
421
+ if self.denoising_steps_num > 1:
422
+ x_t_latent = torch.cat((x_t_latent, prev_latent_batch), dim=0)
423
+ self.stock_noise = torch.cat(
424
+ (self.init_noise[0:1], self.stock_noise[:-1]), dim=0
425
+ )
426
+ x_0_pred_batch, model_pred = self.unet_step(x_t_latent, t_list)
427
+
428
+ if self.denoising_steps_num > 1:
429
+ x_0_pred_out = x_0_pred_batch[-1].unsqueeze(0)
430
+ if self.do_add_noise:
431
+ self.x_t_latent_buffer = (
432
+ self.alpha_prod_t_sqrt[1:] * x_0_pred_batch[:-1]
433
+ + self.beta_prod_t_sqrt[1:] * self.init_noise[1:]
434
+ )
435
+ else:
436
+ self.x_t_latent_buffer = (
437
+ self.alpha_prod_t_sqrt[1:] * x_0_pred_batch[:-1]
438
+ )
439
+ else:
440
+ x_0_pred_out = x_0_pred_batch
441
+ self.x_t_latent_buffer = None
442
+ else:
443
+ self.init_noise = x_t_latent
444
+ for idx, t in enumerate(self.sub_timesteps_tensor):
445
+ t = t.view(
446
+ 1,
447
+ ).repeat(
448
+ self.frame_bff_size,
449
+ )
450
+ x_0_pred, model_pred = self.unet_step(x_t_latent, t, idx)
451
+ if idx < len(self.sub_timesteps_tensor) - 1:
452
+ if self.do_add_noise:
453
+ x_t_latent = self.alpha_prod_t_sqrt[
454
+ idx + 1
455
+ ] * x_0_pred + self.beta_prod_t_sqrt[
456
+ idx + 1
457
+ ] * torch.randn_like(
458
+ x_0_pred, device=self.device, dtype=self.dtype
459
+ )
460
+ else:
461
+ x_t_latent = self.alpha_prod_t_sqrt[idx + 1] * x_0_pred
462
+ x_0_pred_out = x_0_pred
463
+ return x_0_pred_out
464
+
465
+ @torch.no_grad()
466
+ def __call__(
467
+ self, x: Union[torch.Tensor, PIL.Image.Image, np.ndarray] = None
468
+ ) -> torch.Tensor:
469
+ start = torch.cuda.Event(enable_timing=True)
470
+ end = torch.cuda.Event(enable_timing=True)
471
+ start.record()
472
+ if x is not None:
473
+ x = self.image_processor.preprocess(x, self.height, self.width).to(
474
+ device=self.device, dtype=self.dtype
475
+ )
476
+ if self.similar_image_filter:
477
+ x = self.similar_filter(x)
478
+ if x is None:
479
+ time.sleep(self.inference_time_ema)
480
+ return self.prev_image_result
481
+ x_t_latent = self.encode_image(x)
482
+ else:
483
+ # TODO: check the dimension of x_t_latent
484
+ x_t_latent = torch.randn((1, 4, self.latent_height, self.latent_width)).to(
485
+ device=self.device, dtype=self.dtype
486
+ )
487
+ x_0_pred_out = self.predict_x0_batch(x_t_latent)
488
+ x_output = self.decode_image(x_0_pred_out).detach().clone()
489
+
490
+ self.prev_image_result = x_output
491
+ end.record()
492
+ torch.cuda.synchronize()
493
+ inference_time = start.elapsed_time(end) / 1000
494
+ self.inference_time_ema = 0.9 * self.inference_time_ema + 0.1 * inference_time
495
+ return x_output
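A minimal end-to-end sketch of driving StreamV2V frame by frame, assuming an SD 1.5 checkpoint compatible with LCM-LoRA. The checkpoint id, timestep indices, prompt, and the black stand-in frames are illustrative; real video frames would be fed in their place.

import torch
from PIL import Image
from diffusers import StableDiffusionPipeline

from streamv2v import StreamV2V

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

stream = StreamV2V(pipe, t_index_list=[32, 45], width=512, height=512)
stream.load_lcm_lora()  # latent-consistency/lcm-lora-sdv1-5 by default
stream.fuse_lora()
stream.prepare(prompt="a watercolor painting of a person", guidance_scale=1.2)

frames = [Image.new("RGB", (512, 512)) for _ in range(4)]  # stand-ins for video frames
for frame in frames:
    out = stream(frame)  # decoded image tensor for this frame; postprocess as needed
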
streamv2v/tools/__init__.py ADDED
File without changes
streamv2v/tools/install-tensorrt.py ADDED
@@ -0,0 +1,54 @@
1
+ from typing import Literal, Optional
2
+
3
+ import fire
4
+ from packaging.version import Version
5
+
6
+ from ..pip_utils import is_installed, run_pip, version
7
+ import platform
8
+
9
+
10
+ def get_cuda_version_from_torch() -> Optional[Literal["11", "12"]]:
11
+ try:
12
+ import torch
13
+ except ImportError:
14
+ return None
15
+
16
+ return torch.version.cuda.split(".")[0]
17
+
18
+
19
+ def install(cu: Optional[Literal["11", "12"]] = get_cuda_version_from_torch()):
20
+ if cu is None or cu not in ["11", "12"]:
21
+ print("Could not detect CUDA version. Please specify manually.")
22
+ return
23
+ print("Installing TensorRT requirements...")
24
+
25
+ if is_installed("tensorrt"):
26
+ if version("tensorrt") < Version("9.0.0"):
27
+ run_pip("uninstall -y tensorrt")
28
+
29
+ cudnn_name = f"nvidia-cudnn-cu{cu}==8.9.4.25"
30
+
31
+ if not is_installed("tensorrt"):
32
+ run_pip(f"install {cudnn_name} --no-cache-dir")
33
+ run_pip(
34
+ "install --pre --extra-index-url https://pypi.nvidia.com tensorrt==9.0.1.post11.dev4 --no-cache-dir"
35
+ )
36
+
37
+ if not is_installed("polygraphy"):
38
+ run_pip(
39
+ "install polygraphy==0.47.1 --extra-index-url https://pypi.ngc.nvidia.com"
40
+ )
41
+ if not is_installed("onnx_graphsurgeon"):
42
+ run_pip(
43
+ "install onnx-graphsurgeon==0.3.26 --extra-index-url https://pypi.ngc.nvidia.com"
44
+ )
45
+ # if platform.system() == 'Windows' and not is_installed("pywin32"):
46
+ # run_pip(
47
+ # "install pywin32"
48
+ # )
49
+
50
+ pass
51
+
52
+
53
+ if __name__ == "__main__":
54
+ fire.Fire(install)
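The installer is exposed through python-fire. A usage sketch that runs it with the package context set, so the relative import of pip_utils resolves; "--cu 12" is an illustrative choice and the package is assumed to be on the Python path.

import subprocess
import sys

# Run the installer as a module so `from ..pip_utils import ...` works.
subprocess.run(
    [sys.executable, "-m", "streamv2v.tools.install-tensorrt", "--cu", "12"],
    check=True,
)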