guardiancc committed (verified) · Commit 46da058 · 1 Parent(s): c63eb09

Update mimicmotion/pipelines/pipeline_mimicmotion.py

mimicmotion/pipelines/pipeline_mimicmotion.py CHANGED
@@ -16,11 +16,12 @@ from diffusers.schedulers import EulerDiscreteScheduler
 from diffusers.utils import BaseOutput, logging
 from diffusers.utils.torch_utils import is_compiled_module, randn_tensor
 from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
+import threading
 
 from ..modules.pose_net import PoseNet
 
 logger = logging.get_logger(__name__) # pylint: disable=invalid-name
-
+import concurrent.futures
 
 def _append_dims(x, target_dims):
     """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
@@ -221,29 +222,37 @@ class MimicMotionPipeline(DiffusionPipeline):
         decode_chunk_size: int = 8):
         # [batch, frames, channels, height, width] -> [batch*frames, channels, height, width]
         latents = latents.flatten(0, 1)
-
         latents = 1 / self.vae.config.scaling_factor * latents
-
+
         forward_vae_fn = self.vae._orig_mod.forward if is_compiled_module(self.vae) else self.vae.forward
         accepts_num_frames = "num_frames" in set(inspect.signature(forward_vae_fn).parameters.keys())
-
-        # decode decode_chunk_size frames at a time to avoid OOM
-        frames = []
-        for i in range(0, latents.shape[0], decode_chunk_size):
-            num_frames_in = latents[i: i + decode_chunk_size].shape[0]
+
+        # Helper function to process one chunk of frames
+        def process_chunk(start, end, frames_list):
             decode_kwargs = {}
             if accepts_num_frames:
-                # we only pass num_frames_in if it's expected
-                decode_kwargs["num_frames"] = num_frames_in
-
-            frame = self.vae.decode(latents[i: i + decode_chunk_size], **decode_kwargs).sample
-            frames.append(frame.cpu())
-        frames = torch.cat(frames, dim=0)
-
+                decode_kwargs["num_frames"] = end - start
+            frame = self.vae.decode(latents[start:end], **decode_kwargs).sample
+            frames_list.append(frame.cpu())
+
+        threads = []
+        frames = []
+
+        # Split the work into chunks and create a thread to process each one
+        for i in range(0, latents.shape[0], decode_chunk_size):
+            t = threading.Thread(target=process_chunk, args=(i, i + decode_chunk_size, frames))
+            threads.append(t)
+            t.start()
+
+        # Wait for all threads to finish
+        for t in threads:
+            t.join()
+
         # [batch*frames, channels, height, width] -> [batch, channels, frames, height, width]
+        frames = torch.cat(frames, dim=0)
         frames = frames.reshape(-1, num_frames, *frames.shape[1:]).permute(0, 2, 1, 3, 4)
+
+        # Cast to float32 for compatibility with bfloat16
        frames = frames.float()
         return frames
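
A note on the threaded decode introduced above: each worker appends its decoded chunk to the shared frames list in whatever order the threads happen to finish, so torch.cat(frames, dim=0) can assemble the video out of frame order; the import concurrent.futures added alongside threading is never used; and num_frames is set to end - start without clamping, so a final short chunk can report the wrong count. Below is a minimal order-preserving sketch built on that unused import, not the committed code. The function decode_in_chunks and its parameters are hypothetical stand-ins for the method's self.vae, latents, accepts_num_frames, and decode_chunk_size.

import concurrent.futures

import torch


def decode_in_chunks(vae, latents, accepts_num_frames, decode_chunk_size=8, max_workers=2):
    """Hypothetical order-preserving variant of the threaded chunked decode."""
    def decode_chunk(start):
        # Clamp the end so the last (possibly short) chunk reports its true size.
        end = min(start + decode_chunk_size, latents.shape[0])
        decode_kwargs = {"num_frames": end - start} if accepts_num_frames else {}
        return vae.decode(latents[start:end], **decode_kwargs).sample.cpu()

    starts = range(0, latents.shape[0], decode_chunk_size)
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        # map() yields results in input order regardless of completion order,
        # so chunks cannot be concatenated out of sequence.
        frames = list(pool.map(decode_chunk, starts))
    return torch.cat(frames, dim=0)

Threads can overlap useful work here because PyTorch releases the GIL inside heavy ops, but decoding every chunk at once gives up the memory ceiling the original sequential loop existed for ("decode decode_chunk_size frames at a time to avoid OOM"); bounding max_workers keeps peak memory close to the sequential version's.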