silentchen committed
Commit 09ce3f4
1 Parent(s): 3ab28ab

Upload 13 files
app.py CHANGED
@@ -1,8 +1,10 @@
1
  import gradio as gr
2
  import torch
3
  from omegaconf import OmegaConf
4
- # from gligen.task_grounded_generation import grounded_generation_box, load_ckpt, load_common_ckpt
5
-
 
 
6
  import json
7
  import numpy as np
8
  from PIL import Image, ImageDraw, ImageFont
@@ -10,7 +12,7 @@ from functools import partial
10
  from collections import Counter
11
  import math
12
  import gc
13
-
14
  from gradio import processing_utils
15
  from typing import Optional
16
 
@@ -184,77 +186,6 @@ class Blocks(gr.Blocks):
184
  '''
185
  inference model
186
  '''
187
-
188
-
189
- @torch.no_grad()
190
- def inference(task, language_instruction, grounding_instruction, inpainting_boxes_nodrop, image,
191
- alpha_sample, guidance_scale, batch_size,
192
- fix_seed, rand_seed, actual_mask, style_image,
193
- *args, **kwargs):
194
- grounding_instruction = json.loads(grounding_instruction)
195
- phrase_list, location_list = [], []
196
- for k, v in grounding_instruction.items():
197
- phrase_list.append(k)
198
- location_list.append(v)
199
-
200
- placeholder_image = Image.open('images/teddy.jpg').convert("RGB")
201
- image_list = [placeholder_image] * len(phrase_list) # placeholder input for visual prompt, which is disabled
202
-
203
- batch_size = int(batch_size)
204
- if not 1 <= batch_size <= 4:
205
- batch_size = 2
206
-
207
- if style_image == None:
208
- has_text_mask = 1
209
- has_image_mask = 0 # then we hack above 'image_list'
210
- else:
211
- valid_phrase_len = len(phrase_list)
212
-
213
- phrase_list += ['placeholder']
214
- has_text_mask = [1] * valid_phrase_len + [0]
215
-
216
- image_list = [placeholder_image] * valid_phrase_len + [style_image]
217
- has_image_mask = [0] * valid_phrase_len + [1]
218
-
219
- location_list += [[0.0, 0.0, 1, 0.01]] # style image grounding location
220
-
221
- if task == 'Grounded Inpainting':
222
- alpha_sample = 1.0
223
-
224
- instruction = dict(
225
- prompt=language_instruction,
226
- phrases=phrase_list,
227
- images=image_list,
228
- locations=location_list,
229
- alpha_type=[alpha_sample, 0, 1.0 - alpha_sample],
230
- has_text_mask=has_text_mask,
231
- has_image_mask=has_image_mask,
232
- save_folder_name=language_instruction,
233
- guidance_scale=guidance_scale,
234
- batch_size=batch_size,
235
- fix_seed=bool(fix_seed),
236
- rand_seed=int(rand_seed),
237
- actual_mask=actual_mask,
238
- inpainting_boxes_nodrop=inpainting_boxes_nodrop,
239
- )
240
-
241
- get_model = partial(instance.get_model,
242
- batch_size=batch_size,
243
- instruction=language_instruction,
244
- phrase_list=phrase_list)
245
-
246
- with torch.autocast(device_type='cuda', dtype=torch.float16):
247
- if task == 'Grounded Generation':
248
- if style_image == None:
249
- return grounded_generation_box(get_model('base'), instruction, *args, **kwargs)
250
- else:
251
- return grounded_generation_box(get_model('style'), instruction, *args, **kwargs)
252
- elif task == 'Grounded Inpainting':
253
- assert image is not None
254
- instruction['input_image'] = image.convert("RGB")
255
- return grounded_generation_box(get_model('inpaint'), instruction, *args, **kwargs)
256
-
257
-
258
  def draw_box(boxes=[], texts=[], img=None):
259
  if len(boxes) == 0 and img is None:
260
  return None
@@ -275,6 +206,106 @@ def draw_box(boxes=[], texts=[], img=None):
275
  fill=(255, 255, 255))
276
  return img
277
 
278
 
279
  def get_concat(ims):
280
  if len(ims) == 1:
@@ -297,13 +328,11 @@ def auto_append_grounding(language_instruction, grounding_texts):
297
  return language_instruction
298
 
299
 
300
- def generate(task, language_instruction, grounding_texts, sketch_pad,
301
- alpha_sample, guidance_scale, batch_size,
302
- fix_seed, rand_seed, use_actual_mask, append_grounding, style_cond_image,
303
  state):
304
  if 'boxes' not in state:
305
  state['boxes'] = []
306
-
307
  boxes = state['boxes']
308
  grounding_texts = [x.strip() for x in grounding_texts.split(';')]
309
  # assert len(boxes) == len(grounding_texts)
@@ -315,44 +344,19 @@ Please draw boxes accordingly on the sketch pad.""".format(len(boxes), len(groun
315
  grounding_texts = grounding_texts + [""] * (len(boxes) - len(grounding_texts))
316
 
317
  boxes = (np.asarray(boxes) / 512).tolist()
 
318
  grounding_instruction = json.dumps({obj: box for obj, box in zip(grounding_texts, boxes)})
319
-
320
- image = None
321
- actual_mask = None
322
- if task == 'Grounded Inpainting':
323
- image = state.get('original_image', sketch_pad['image']).copy()
324
- image = center_crop(image)
325
- image = Image.fromarray(image)
326
-
327
- if use_actual_mask:
328
- actual_mask = sketch_pad['mask'].copy()
329
- if actual_mask.ndim == 3:
330
- actual_mask = actual_mask[..., 0]
331
- actual_mask = center_crop(actual_mask, tgt_size=(64, 64))
332
- actual_mask = torch.from_numpy(actual_mask == 0).float()
333
-
334
- if state.get('inpaint_hw', None):
335
- boxes = np.asarray(boxes) * 0.9 + 0.05
336
- boxes = boxes.tolist()
337
- grounding_instruction = json.dumps({obj: box for obj, box in zip(grounding_texts, boxes) if obj != 'auto'})
338
-
339
- if append_grounding:
340
- language_instruction = auto_append_grounding(language_instruction, grounding_texts)
341
-
342
- gen_images, gen_overlays = inference(
343
- task, language_instruction, grounding_instruction, boxes, image,
344
- alpha_sample, guidance_scale, batch_size,
345
- fix_seed, rand_seed, actual_mask, style_cond_image, clip_model=clip_model,
346
- )
347
-
348
- for idx, gen_image in enumerate(gen_images):
349
-
350
- if task == 'Grounded Inpainting' and state.get('inpaint_hw', None):
351
- hw = min(*state['original_image'].shape[:2])
352
- gen_image = sized_center_fill(state['original_image'].copy(), np.array(gen_image.resize((hw, hw))), hw, hw)
353
- gen_image = Image.fromarray(gen_image)
354
-
355
- gen_images[idx] = gen_image
356
 
357
  blank_samples = batch_size % 2 if batch_size > 1 else 0
358
  gen_images = [gr.Image.update(value=x, visible=True) for i, x in enumerate(gen_images)] \
@@ -401,35 +405,18 @@ def center_crop(img, HW=None, tgt_size=(512, 512)):
401
  return np.array(img)
402
 
403
 
404
- def draw(task, input, grounding_texts, new_image_trigger, state):
 
405
  if type(input) == dict:
406
  image = input['image']
407
  mask = input['mask']
408
  else:
409
  mask = input
410
-
411
  if mask.ndim == 3:
412
  mask = mask[..., 0]
413
 
414
  image_scale = 1.0
415
 
416
- # resize trigger
417
- if task == "Grounded Inpainting":
418
- mask_cond = mask.sum() == 0
419
- # size_cond = mask.shape != (512, 512)
420
- if mask_cond and 'original_image' not in state:
421
- image = Image.fromarray(image)
422
- width, height = image.size
423
- scale = 600 / min(width, height)
424
- image = image.resize((int(width * scale), int(height * scale)))
425
- state['original_image'] = np.array(image).copy()
426
- image_scale = float(height / width)
427
- return [None, new_image_trigger + 1, image_scale, state]
428
- else:
429
- original_image = state['original_image']
430
- H, W = original_image.shape[:2]
431
- image_scale = float(H / W)
432
-
433
  mask = binarize(mask)
434
  if mask.shape != (512, 512):
435
  # assert False, "should not receive any non- 512x512 masks."
@@ -444,13 +431,10 @@ def draw(task, input, grounding_texts, new_image_trigger, state):
444
  if type(mask) != np.ndarray:
445
  mask = np.array(mask)
446
 
447
- if mask.sum() == 0 and task != "Grounded Inpainting":
448
  state = {}
449
 
450
- if task != 'Grounded Inpainting':
451
- image = None
452
- else:
453
- image = Image.fromarray(image)
454
 
455
  if 'boxes' not in state:
456
  state['boxes'] = []
@@ -488,7 +472,6 @@ def draw(task, input, grounding_texts, new_image_trigger, state):
488
  box_image_resize = np.array(box_image.resize((inpaint_hw, inpaint_hw)))
489
  original_image = state['original_image'].copy()
490
  box_image = sized_center_fill(original_image, box_image_resize, inpaint_hw, inpaint_hw)
491
- print(box_image, new_image_trigger, image_scale, state)
492
  return [box_image, new_image_trigger, image_scale, state]
493
 
494
 
@@ -518,6 +501,37 @@ css = """
518
  cursor: pointer;
519
  text-decoration: none;
520
521
  """
522
 
523
  rescale_js = """
@@ -536,71 +550,84 @@ function(x) {
536
  with Blocks(
537
  css=css,
538
  analytics_enabled=False,
539
- title="GLIGen demo",
540
  ) as main:
541
  description = """<p style="text-align: center; font-weight: bold;">
542
  <span style="font-size: 28px">Layout Guidance</span>
543
  <br>
544
  <span style="font-size: 18px" id="paper-info">
545
- [<a href="https://gligen.github.io" target="_blank">Project Page</a>]
546
- [<a href="https://arxiv.org/abs/2301.07093" target="_blank">Paper</a>]
547
- [<a href="https://github.com/gligen/GLIGEN" target="_blank">GitHub</a>]
548
- [<a href="https://huggingface.co/spaces/gligen/demo_legacy" target="_blank">Mirror</a>]
549
  </span>
550
  </p>
551
  """
552
  gr.HTML(description)
553
-
554
- with gr.Row():
555
- with gr.Column(scale=4):
556
- sketch_pad_trigger = gr.Number(value=0, visible=False)
557
- sketch_pad_resize_trigger = gr.Number(value=0, visible=False)
558
- init_white_trigger = gr.Number(value=0, visible=False)
559
- image_scale = gr.Number(value=0, elem_id="image_scale", visible=False)
560
- new_image_trigger = gr.Number(value=0, visible=False)
561
-
562
- # task = gr.Radio(
563
- # choices=["Grounded Generation", 'Grounded Inpainting'],
564
- # type="value",
565
- # value="Grounded Generation",
566
- # label="Task",
567
- # )
568
- language_instruction = gr.Textbox(
569
- label="Text Caption",
570
- )
571
- grounding_instruction = gr.Textbox(
572
- label="Grounding instruction (Separated by semicolon)",
573
- )
574
- with gr.Row():
575
- sketch_pad = ImageMask(label="Sketch Pad", elem_id="img2img_image")
576
- out_imagebox = gr.Image(type="pil", label="Parsed Sketch Pad")
577
- with gr.Row():
578
- clear_btn = gr.Button(value='Clear')
579
- gen_btn = gr.Button(value='Generate')
580
- with gr.Accordion("Advanced Options", open=False):
581
- with gr.Column():
582
- Loss_scale = gr.Slider(minimum=0, maximum=500, step=5, value=30,
583
- label="Loss Scale Factor")
584
- guidance_scale = gr.Slider(minimum=0, maximum=50, step=0.5, value=7.5, label="Guidance Scale")
585
- batch_size = gr.Slider(minimum=1, maximum=4, step=1, value=2, label="Number of Samples")
586
- max_iter = gr.Slider(minimum=0, maximum=10, step=1, value=5, label="Max Iteration per Step")
587
- loss_threshold = gr.Slider(minimum=0, maximum=1, step=0.1, value=0.2, label="Loss Threshold")
588
- max_step = gr.Slider(minimum=0, maximum=50, step=1, value=10, label="Max Step of Backward Guidance")
589
-
590
- # append_grounding = gr.Checkbox(value=True, label="Append grounding instructions to the caption")
591
- # use_actual_mask = gr.Checkbox(value=False, label="Use actual mask for inpainting", visible=False)
592
- with gr.Row():
593
- fix_seed = gr.Checkbox(value=True, label="Fixed seed")
594
- rand_seed = gr.Slider(minimum=0, maximum=1000, step=1, value=0, label="Seed")
595
-
596
- with gr.Column(scale=4):
597
- gr.HTML('<span style="font-size: 20px; font-weight: bold">Generated Images</span>')
598
- with gr.Row():
599
- out_gen_1 = gr.Image(type="pil", visible=True, show_label=False, label="Generated Image")
600
- out_gen_2 = gr.Image(type="pil", visible=True, show_label=False)
601
- with gr.Row():
602
- out_gen_3 = gr.Image(type="pil", visible=False, show_label=False)
603
- out_gen_4 = gr.Image(type="pil", visible=False, show_label=False)
604
 
605
  state = gr.State({})
606
 
@@ -658,28 +685,22 @@ with Blocks(
658
  queue=False)
659
  sketch_pad.edit(
660
  draw,
661
- inputs=[sketch_pad, sketch_pad, grounding_instruction, sketch_pad_resize_trigger, state],
662
  outputs=[out_imagebox, sketch_pad_resize_trigger, image_scale, state],
663
  queue=False,
664
  )
665
  grounding_instruction.change(
666
  draw,
667
- inputs=[sketch_pad, sketch_pad, grounding_instruction, sketch_pad_resize_trigger, state],
668
  outputs=[out_imagebox, sketch_pad_resize_trigger, image_scale, state],
669
  queue=False,
670
  )
671
  clear_btn.click(
672
  clear,
673
  inputs=[sketch_pad_trigger, sketch_pad_trigger, batch_size, state],
674
- outputs=[sketch_pad, sketch_pad_trigger, out_imagebox, image_scale, out_gen_1, out_gen_2, out_gen_3,
675
- out_gen_4, state],
676
  queue=False)
677
- # task.change(
678
- # partial(clear, switch_task=True),
679
- # inputs=[task, sketch_pad_trigger, batch_size, state],
680
- # outputs=[sketch_pad, sketch_pad_trigger, out_imagebox, image_scale, out_gen_1, out_gen_2, out_gen_3,
681
- # out_gen_4, state],
682
- # queue=False)
683
  sketch_pad_trigger.change(
684
  controller.init_white,
685
  inputs=[init_white_trigger],
@@ -690,29 +711,28 @@ with Blocks(
690
  inputs=[state],
691
  outputs=[sketch_pad, state],
692
  queue=False)
693
- batch_size.change(
694
- controller.change_n_samples,
695
- inputs=[batch_size],
696
- outputs=[out_gen_1, out_gen_2, out_gen_3, out_gen_4],
697
- queue=False)
698
 
699
- batch_size.change(
700
- controller.change_n_samples,
701
- inputs=[batch_size],
702
- outputs=[out_gen_1, out_gen_2, out_gen_3, out_gen_4],
703
- queue=False)
704
 
705
  gen_btn.click(
706
  generate,
707
  inputs=[
708
- language_instruction, language_instruction, grounding_instruction, sketch_pad,
709
- loss_threshold, guidance_scale, batch_size,
710
- fix_seed, rand_seed,
711
  max_step,
712
  Loss_scale, max_iter,
713
  state,
714
  ],
715
- outputs=[out_gen_1, out_gen_2, out_gen_3, out_gen_4, state],
716
  queue=True
717
  )
718
  sketch_pad_resize_trigger.change(
@@ -732,13 +752,13 @@ with Blocks(
732
  gr.Examples(
733
  examples=[
734
  [
735
- "images/input.png",
736
  "A hello kitty toy is playing with a purple ball.",
737
  "hello kitty;ball",
738
  "images/hello_kitty_results.png"
739
  ],
740
  ],
741
- inputs=[sketch_pad, language_instruction, grounding_instruction, out_gen_1],
742
  outputs=None,
743
  fn=None,
744
  cache_examples=False,
@@ -746,3 +766,4 @@ with Blocks(
746
 
747
  main.queue(concurrency_count=1, api_open=False)
748
  main.launch(share=False, show_api=False, show_error=True)
 
 
1
  import gradio as gr
2
  import torch
3
  from omegaconf import OmegaConf
4
+ # from layout_guidance.inference import inference
5
+ from transformers import CLIPTextModel, CLIPTokenizer
6
+ from diffusers import AutoencoderKL, LMSDiscreteScheduler
7
+ from my_model import unet_2d_condition
8
  import json
9
  import numpy as np
10
  from PIL import Image, ImageDraw, ImageFont
 
12
  from collections import Counter
13
  import math
14
  import gc
15
+ from utils import compute_ca_loss
16
  from gradio import processing_utils
17
  from typing import Optional
18
 
 
186
  '''
187
  inference model
188
  '''
 
189
  def draw_box(boxes=[], texts=[], img=None):
190
  if len(boxes) == 0 and img is None:
191
  return None
 
206
  fill=(255, 255, 255))
207
  return img
208
 
209
+ with open('./conf/unet/config.json') as f:
210
+ unet_config = json.load(f)
211
+
212
+ unet = unet_2d_condition.UNet2DConditionModel(**unet_config).from_pretrained('runwayml/stable-diffusion-v1-5', subfolder="unet")
213
+ tokenizer = CLIPTokenizer.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="tokenizer")
214
+ text_encoder = CLIPTextModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="text_encoder")
215
+ vae = AutoencoderKL.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="vae")
216
+ attn_map = None
217
+ cfg = OmegaConf.load('./conf/net_conf.yaml')
218
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
219
+ unet.to(device)
220
+ text_encoder.to(device)
221
+ vae.to(device)
222
+ def inference(device, unet, vae, tokenizer, text_encoder, prompt, cfg, attn_map, bboxes, object_positions, batch_size, loss_scale, loss_threshold, max_iter, max_index_step, rand_seed, guidance_scale):
223
+ uncond_input = tokenizer(
224
+ [""] * 1, padding="max_length", max_length=tokenizer.model_max_length, return_tensors="pt"
225
+ )
226
+ uncond_embeddings = text_encoder(uncond_input.input_ids.to(device))[0]
227
+
228
+ input_ids = tokenizer(
229
+ prompt,
230
+ padding="max_length",
231
+ truncation=True,
232
+ max_length=tokenizer.model_max_length,
233
+ return_tensors="pt",
234
+ ).input_ids[0].unsqueeze(0).to(device)
235
+ # text_embeddings = text_encoder(input_ids)[0]
236
+ text_embeddings = torch.cat([uncond_embeddings, text_encoder(input_ids)[0]])
237
+ # text_embeddings[1, 1, :] = text_embeddings[1, 2, :]
238
+ generator = torch.manual_seed(rand_seed) # Seed generator to create the initial latent noise
239
+
240
+ latents = torch.randn(
241
+ (batch_size, 4, 64, 64),
242
+ generator=generator,
243
+ ).to(device)
244
+
245
+ noise_scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)
246
+
247
+ # generator = torch.Generator("cuda").manual_seed(1024)
248
+ noise_scheduler.set_timesteps(51)
249
+
250
+ latents = latents * noise_scheduler.init_noise_sigma
251
+
252
+ loss = torch.tensor(10000)
253
+
254
+ for index, t in enumerate(noise_scheduler.timesteps):
255
+ iteration = 0
256
+
257
+ while loss.item() / loss_scale > loss_threshold and iteration < max_iter and index < max_index_step:
258
+ latents = latents.requires_grad_(True)
259
+
260
+ # latent_model_input = torch.cat([latents] * 2)
261
+ latent_model_input = latents
262
+
263
+ latent_model_input = noise_scheduler.scale_model_input(latent_model_input, t)
264
+ noise_pred, attn_map_integrated_up, attn_map_integrated_mid, attn_map_integrated_down, _, _, _ = \
265
+ unet(latent_model_input, t, index, encoder_hidden_states=text_encoder(input_ids)[0], attn_map=attn_map,
266
+ cfg=cfg)
267
+
268
+ # update latents with backward guidance from the cross-attention layout loss
269
+
270
+ loss = compute_ca_loss(attn_map_integrated_mid, attn_map_integrated_up, bboxes=bboxes,
271
+ object_positions=object_positions) * loss_scale
272
+
273
+ print(loss.item() / loss_scale)
274
+
275
+ grad_cond = torch.autograd.grad(loss.requires_grad_(True), [latents])[0]
276
+
277
+ latents = latents - grad_cond * noise_scheduler.sigmas[index] ** 2
278
+ iteration += 1
279
+ torch.cuda.empty_cache()
280
+ torch.cuda.empty_cache()
281
+
282
+
283
+ with torch.no_grad():
284
+
285
+ latent_model_input = torch.cat([latents] * 2)
286
+
287
+ latent_model_input = noise_scheduler.scale_model_input(latent_model_input, t)
288
+ noise_pred, attn_map_integrated_up, attn_map_integrated_mid, attn_map_integrated_down, _, _, _ = \
289
+ unet(latent_model_input, t, index, encoder_hidden_states=text_embeddings, attn_map=attn_map, cfg=cfg)
290
+
291
+ noise_pred = noise_pred.sample
292
+
293
+ # perform guidance
294
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
295
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
296
+
297
+ latents = noise_scheduler.step(noise_pred, t, latents).prev_sample
298
+ torch.cuda.empty_cache()
299
+
300
+ with torch.no_grad():
301
+ print("decode image")
302
+ latents = 1 / 0.18215 * latents
303
+ image = vae.decode(latents).sample
304
+ image = (image / 2 + 0.5).clamp(0, 1)
305
+ image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
306
+ images = (image * 255).round().astype("uint8")
307
+ pil_images = [Image.fromarray(image) for image in images]
308
+ return pil_images
309
 
310
  def get_concat(ims):
311
  if len(ims) == 1:
 
328
  return language_instruction
329
 
330
 
331
+ def generate(language_instruction, grounding_texts, sketch_pad,
332
+ loss_threshold, guidance_scale, batch_size, rand_seed, max_step, loss_scale, max_iter,
 
333
  state):
334
  if 'boxes' not in state:
335
  state['boxes'] = []
 
336
  boxes = state['boxes']
337
  grounding_texts = [x.strip() for x in grounding_texts.split(';')]
338
  # assert len(boxes) == len(grounding_texts)
 
344
  grounding_texts = grounding_texts + [""] * (len(boxes) - len(grounding_texts))
345
 
346
  boxes = (np.asarray(boxes) / 512).tolist()
347
+ boxes = [[box] for box in boxes]
348
  grounding_instruction = json.dumps({obj: box for obj, box in zip(grounding_texts, boxes)})
349
+ language_instruction_list = language_instruction.strip('.').split(' ')
350
+ object_positions = []
351
+ for obj in grounding_texts:
352
+ obj_position = []
353
+ for word in obj.split(' '):
354
+ obj_first_index = language_instruction_list.index(word) + 1
355
+ obj_position.append(obj_first_index)
356
+ object_positions.append(obj_position)
357
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
358
+
359
+ gen_images = inference(device, unet, vae, tokenizer, text_encoder, language_instruction, cfg, attn_map, boxes, object_positions, batch_size, loss_scale, loss_threshold, max_iter, max_step, rand_seed, guidance_scale)
 
360
 
361
  blank_samples = batch_size % 2 if batch_size > 1 else 0
362
  gen_images = [gr.Image.update(value=x, visible=True) for i, x in enumerate(gen_images)] \
 
405
  return np.array(img)
406
 
407
 
408
+ def draw(input, grounding_texts, new_image_trigger, state):
409
+
410
  if type(input) == dict:
411
  image = input['image']
412
  mask = input['mask']
413
  else:
414
  mask = input
 
415
  if mask.ndim == 3:
416
  mask = mask[..., 0]
417
 
418
  image_scale = 1.0
419
 
420
  mask = binarize(mask)
421
  if mask.shape != (512, 512):
422
  # assert False, "should not receive any non- 512x512 masks."
 
431
  if type(mask) != np.ndarray:
432
  mask = np.array(mask)
433
 
434
+ if mask.sum() == 0:
435
  state = {}
436
 
437
+ image = None
438
 
439
  if 'boxes' not in state:
440
  state['boxes'] = []
 
472
  box_image_resize = np.array(box_image.resize((inpaint_hw, inpaint_hw)))
473
  original_image = state['original_image'].copy()
474
  box_image = sized_center_fill(original_image, box_image_resize, inpaint_hw, inpaint_hw)
 
475
  return [box_image, new_image_trigger, image_scale, state]
476
 
477
 
 
501
  cursor: pointer;
502
  text-decoration: none;
503
  }
504
+
505
+ .tooltip {
506
+ color: #555;
507
+ position: relative;
508
+ display: inline-block;
509
+ cursor: pointer;
510
+ }
511
+
512
+ .tooltip .tooltiptext {
513
+ visibility: hidden;
514
+ width: 400px;
515
+ background-color: #555;
516
+ color: #fff;
517
+ text-align: center;
518
+ padding: 5px;
519
+ border-radius: 5px;
520
+ position: absolute;
521
+ z-index: 1; /* Set z-index to 1 */
522
+ left: 10px;
523
+ top: 100%;
524
+ opacity: 0;
525
+ transition: opacity 0.3s;
526
+ }
527
+
528
+ .tooltip:hover .tooltiptext {
529
+ visibility: visible;
530
+ opacity: 1;
531
+ z-index: 9999; /* Set a high z-index value when hovering */
532
+ }
533
+
534
+
535
  """
536
 
537
  rescale_js = """
 
550
  with Blocks(
551
  css=css,
552
  analytics_enabled=False,
553
+ title="Layout-Guidance demo",
554
  ) as main:
555
  description = """<p style="text-align: center; font-weight: bold;">
556
  <span style="font-size: 28px">Layout Guidance</span>
557
  <br>
558
  <span style="font-size: 18px" id="paper-info">
559
+ [<a href=" " target="_blank">Project Page</a>]
560
+ [<a href=" " target="_blank">Paper</a>]
561
+ [<a href=" " target="_blank">GitHub</a>]
 
562
  </span>
563
  </p>
564
  """
565
  gr.HTML(description)
566
+ with gr.Column():
567
+ language_instruction = gr.Textbox(
568
+ label="Text Prompt",
569
+ )
570
+ grounding_instruction = gr.Textbox(
571
+ label="Grounding instruction (Separated by semicolon)",
572
+ )
573
+ sketch_pad_trigger = gr.Number(value=0, visible=False)
574
+ sketch_pad_resize_trigger = gr.Number(value=0, visible=False)
575
+ init_white_trigger = gr.Number(value=0, visible=False)
576
+ image_scale = gr.Number(value=0, elem_id="image_scale", visible=False)
577
+ new_image_trigger = gr.Number(value=0, visible=False)
578
+
579
+
580
+
581
+ with gr.Row():
582
+ sketch_pad = ImageMask(label="Sketch Pad", elem_id="img2img_image")
583
+ out_imagebox = gr.Image(type="pil", label="Parsed Sketch Pad")
584
+ out_gen_1 = gr.Image(type="pil", visible=True, label="Generated Image")
585
+ # out_gen_2 = gr.Image(type="pil", visible=True, label="Generated Image")
586
+ # out_gen_3 = gr.Image(type="pil", visible=True, show_label=False)
587
+ # out_gen_4 = gr.Image(type="pil", visible=True, show_label=False)
588
+
589
+ with gr.Row():
590
+ clear_btn = gr.Button(value='Clear')
591
+ gen_btn = gr.Button(value='Generate')
592
+ # clear_btn = gr.Button(value='Clear')
593
+ # clear_btn = gr.Button(value='Clear')
594
+
595
+ with gr.Accordion("Advanced Options", open=False):
596
+ with gr.Column():
597
+ description = """<div class="tooltip">Loss Scale Factor &#9432
598
+ <span class="tooltiptext">The scale factor of the backward-guidance loss. Larger values give stronger layout control but can reduce image fidelity.</span>
599
+ </div>
600
+ <div class="tooltip">Guidance Scale &#9432
601
+ <span class="tooltiptext">The scale factor of classifier-free guidance. </span>
602
+ </div>
603
+ <div class="tooltip" >Max Iteration per Step &#9432
604
+ <span class="tooltiptext">The maximum number of backward-guidance iterations at each diffusion step.</span>
605
+ </div>
606
+ <div class="tooltip" >Loss Threshold &#9432
607
+ <span class="tooltiptext">The loss threshold. If the loss computed from the cross-attention maps falls below this threshold, backward guidance stops.</span>
608
+ </div>
609
+ <div class="tooltip" >Max Step of Backward Guidance &#9432
610
+ <span class="tooltiptext">The maximum number of diffusion steps during which backward guidance is applied.</span>
611
+ </div>
612
+ """
613
+ gr.HTML(description)
614
+ Loss_scale = gr.Slider(minimum=0, maximum=500, step=5, value=30,label="Loss Scale Factor")
615
+ guidance_scale = gr.Slider(minimum=0, maximum=50, step=0.5, value=7.5, label="Guidance Scale")
616
+ batch_size = gr.Slider(minimum=1, maximum=4, step=1, value=1, label="Number of Samples", visible=False)
617
+ max_iter = gr.Slider(minimum=0, maximum=10, step=1, value=5, label="Max Iteration per Step")
618
+ loss_threshold = gr.Slider(minimum=0, maximum=1, step=0.1, value=0.2, label="Loss Threshold")
619
+ max_step = gr.Slider(minimum=0, maximum=50, step=1, value=10, label="Max Step of Backward Guidance")
620
+ # fix_seed = gr.Checkbox(value=True, label="Fixed seed")
621
+ rand_seed = gr.Slider(minimum=0, maximum=1000, step=1, value=445, label="Random Seed")
622
+
623
+ # with gr.Column(scale=4):
624
+ # gr.HTML('<span style="font-size: 20px; font-weight: bold">Generated Images</span>')
625
+ # with gr.Row():
626
+ # out_gen_1 = gr.Image(type="pil", visible=True, show_label=False, label="Generated Image")
627
+ # out_gen_2 = gr.Image(type="pil", visible=True, show_label=False)
628
+ # with gr.Row():
629
+ # out_gen_3 = gr.Image(type="pil", visible=False, show_label=False)
630
+ # out_gen_4 = gr.Image(type="pil", visible=False, show_label=False)
631
 
632
  state = gr.State({})
633
 
 
685
  queue=False)
686
  sketch_pad.edit(
687
  draw,
688
+ inputs=[sketch_pad, grounding_instruction, sketch_pad_resize_trigger, state],
689
  outputs=[out_imagebox, sketch_pad_resize_trigger, image_scale, state],
690
  queue=False,
691
  )
692
  grounding_instruction.change(
693
  draw,
694
+ inputs=[sketch_pad, grounding_instruction, sketch_pad_resize_trigger, state],
695
  outputs=[out_imagebox, sketch_pad_resize_trigger, image_scale, state],
696
  queue=False,
697
  )
698
  clear_btn.click(
699
  clear,
700
  inputs=[sketch_pad_trigger, sketch_pad_trigger, batch_size, state],
701
+ outputs=[sketch_pad, sketch_pad_trigger, out_imagebox, image_scale, out_gen_1, state],
 
702
  queue=False)
703
+
704
  sketch_pad_trigger.change(
705
  controller.init_white,
706
  inputs=[init_white_trigger],
 
711
  inputs=[state],
712
  outputs=[sketch_pad, state],
713
  queue=False)
714
+ # batch_size.change(
715
+ # controller.change_n_samples,
716
+ # inputs=[batch_size],
717
+ # outputs=[out_gen_1, out_gen_2],
718
+ # queue=False)
719
 
720
+ # batch_size.change(
721
+ # controller.change_n_samples,
722
+ # inputs=[batch_size],
723
+ # outputs=[out_gen_1, out_gen_2],
724
+ # queue=False)
725
 
726
  gen_btn.click(
727
  generate,
728
  inputs=[
729
+ language_instruction, grounding_instruction, sketch_pad,
730
+ loss_threshold, guidance_scale, batch_size, rand_seed,
 
731
  max_step,
732
  Loss_scale, max_iter,
733
  state,
734
  ],
735
+ outputs=[out_gen_1, state],
736
  queue=True
737
  )
738
  sketch_pad_resize_trigger.change(
 
752
  gr.Examples(
753
  examples=[
754
  [
755
+ # "images/input.png",
756
  "A hello kitty toy is playing with a purple ball.",
757
  "hello kitty;ball",
758
  "images/hello_kitty_results.png"
759
  ],
760
  ],
761
+ inputs=[language_instruction, grounding_instruction, out_gen_1],
762
  outputs=None,
763
  fn=None,
764
  cache_examples=False,
 
766
 
767
  main.queue(concurrency_count=1, api_open=False)
768
  main.launch(share=False, show_api=False, show_error=True)
769
+
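The Advanced Options tooltips above describe the backward-guidance loop inside inference(); a minimal sketch of a single guidance update follows, with an illustrative helper name and signature that are not part of this repository:

import torch

def backward_guidance_step(latents, layout_loss, sigma, loss_scale):
    # One iteration of the while-loop in inference(): scale the cross-attention
    # layout loss, differentiate it w.r.t. the latents, and step the latents
    # against that gradient, weighted by sigma**2 for the current timestep.
    # Assumes `latents` has requires_grad=True and `layout_loss` was computed from it.
    grad = torch.autograd.grad(layout_loss * loss_scale, [latents])[0]
    return latents - grad * sigma ** 2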
conf/net_conf.yaml ADDED
@@ -0,0 +1,35 @@
1
+ training:
2
+ use_ema: True
3
+ batch_size: 8
4
+ adam_beta1: 0.9
5
+ adam_beta2: 0.999
6
+ adam_weight_decay: 1e-2
7
+ adam_epsilon: 1e-08
8
+ lr_scheduler: constant
9
+ lr_warmup_steps: 0
10
+ max_train_steps: 50000
11
+ text_finetune_step: 50
12
+ unet_finetune_step: 50
13
+ alpha: 0.1
14
+ min_lr: 1e-6
15
+ warmup_epochs: 0
16
+ num_train_epochs: 300
17
+ accumulate_step: 1
18
+ lr: 1e-6
19
+ resume: ' '
20
+ down_attn_shift: -1
21
+ down_attn_map: -1
22
+ mid_attn_shift: -1
23
+ mid_attn_map: -1
24
+ up_attn_shift: -1
25
+ up_attn_map: -1
26
+
27
+ inference:
28
+ loss_scale: 30
29
+ batch_size: 1
30
+ loss_threshold: 0.2
31
+ max_iter: 5
32
+ index_step: 10
33
+ start_pair: 800
34
+ iteration_interval: 400
35
+ infer_iter: 0
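A short sketch of how the inference defaults above are read, assuming the file is saved at ./conf/net_conf.yaml (the path app.py loads with OmegaConf):

from omegaconf import OmegaConf

cfg = OmegaConf.load('./conf/net_conf.yaml')  # DictConfig with `training` and `inference` groups

# Backward-guidance defaults defined above, accessed by attribute.
print(cfg.inference.loss_scale)      # 30
print(cfg.inference.loss_threshold)  # 0.2
print(cfg.inference.max_iter)        # 5
print(cfg.inference.index_step)      # 10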
conf/unet/config.json ADDED
@@ -0,0 +1,36 @@
1
+ {
2
+ "_class_name": "UNet2DConditionModel",
3
+ "_diffusers_version": "0.6.0",
4
+ "act_fn": "silu",
5
+ "attention_head_dim": 8,
6
+ "block_out_channels": [
7
+ 320,
8
+ 640,
9
+ 1280,
10
+ 1280
11
+ ],
12
+ "center_input_sample": false,
13
+ "cross_attention_dim": 768,
14
+ "down_block_types": [
15
+ "CrossAttnDownBlock2D",
16
+ "CrossAttnDownBlock2D",
17
+ "CrossAttnDownBlock2D",
18
+ "DownBlock2D"
19
+ ],
20
+ "downsample_padding": 1,
21
+ "flip_sin_to_cos": true,
22
+ "freq_shift": 0,
23
+ "in_channels": 4,
24
+ "layers_per_block": 2,
25
+ "mid_block_scale_factor": 1,
26
+ "norm_eps": 1e-05,
27
+ "norm_num_groups": 32,
28
+ "out_channels": 4,
29
+ "sample_size": 64,
30
+ "up_block_types": [
31
+ "UpBlock2D",
32
+ "CrossAttnUpBlock2D",
33
+ "CrossAttnUpBlock2D",
34
+ "CrossAttnUpBlock2D"
35
+ ]
36
+ }
images/.DS_Store ADDED
Binary file (6.15 kB).
 
layout_guidance/__init__.py ADDED
File without changes
layout_guidance/inference.py ADDED
@@ -0,0 +1,488 @@
1
+ # !pip install diffusers["torch"] transformers
2
+ import hydra
3
+ import torch
4
+ import yaml
5
+ from diffusers import StableDiffusionPipeline
6
+ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
7
+ from diffusers import AutoencoderKL, DDPMScheduler, PNDMScheduler, StableDiffusionPipeline, UNet2DConditionModel
8
+ import torch.nn.functional as F
9
+ from PIL import Image, ImageDraw, ImageFont
10
+ import matplotlib.pyplot as plt
11
+ import torch.nn as nn
12
+ import time
13
+ from accelerate import Accelerator
14
+ import torchvision.transforms as transforms
15
+ from torch.utils.tensorboard import SummaryWriter
16
+ from omegaconf import DictConfig, OmegaConf
17
+ from datetime import datetime
18
+ import logging
19
+ import itertools
20
+ from torch.utils.data import DataLoader
21
+ from tqdm import tqdm
22
+ from diffusers import LMSDiscreteScheduler
23
+ from diffusers.optimization import get_scheduler
24
+ from torch import autocast
25
+ from torch.cuda.amp import GradScaler
26
+ import pdb
27
+ import math
28
+ from my_model import unet_2d_condition
29
+ from typing import Iterable, Optional
30
+ import os
31
+ import json
32
+ import numpy as np
33
+ import scipy
34
+
35
+ def freeze_params(params):
36
+ for param in params:
37
+ param.requires_grad = False
38
+ def unfreeze_params(params):
39
+ for param in params:
40
+ param.requires_grad = True
41
+
42
+
43
+ class EMAModel:
44
+ """
45
+ Exponential Moving Average of model weights
46
+ """
47
+
48
+ def __init__(self, parameters: Iterable[torch.nn.Parameter], decay=0.9999):
49
+ parameters = list(parameters)
50
+ print("list parameters")
51
+ self.shadow_params = [p.clone().detach() for p in parameters]
52
+ print("finish clone parameters")
53
+
54
+ self.decay = decay
55
+ self.optimization_step = 0
56
+
57
+ def get_decay(self, optimization_step):
58
+ """
59
+ Compute the decay factor for the exponential moving average.
60
+ """
61
+ value = (1 + optimization_step) / (10 + optimization_step)
62
+ return 1 - min(self.decay, value)
63
+
64
+ @torch.no_grad()
65
+ def step(self, parameters):
66
+ parameters = list(parameters)
67
+
68
+ self.optimization_step += 1
69
+ self.decay = self.get_decay(self.optimization_step)
70
+
71
+ for s_param, param in zip(self.shadow_params, parameters):
72
+ if param.requires_grad:
73
+ tmp = self.decay * (s_param - param)
74
+ s_param.sub_(tmp)
75
+ else:
76
+ s_param.copy_(param)
77
+
78
+ torch.cuda.empty_cache()
79
+
80
+ def copy_to(self, parameters: Iterable[torch.nn.Parameter]) -> None:
81
+ """
82
+ Copy current averaged parameters into given collection of parameters.
83
+ Args:
84
+ parameters: Iterable of `torch.nn.Parameter`; the parameters to be
85
+ updated with the stored moving averages. If `None`, the
86
+ parameters with which this `ExponentialMovingAverage` was
87
+ initialized will be used.
88
+ """
89
+ parameters = list(parameters)
90
+ for s_param, param in zip(self.shadow_params, parameters):
91
+ param.data.copy_(s_param.data)
92
+
93
+ def to(self, device=None, dtype=None) -> None:
94
+ r"""c"""
95
+ # .to() on the tensors handles None correctly
96
+ self.shadow_params = [
97
+ p.to(device=device, dtype=dtype) if p.is_floating_point() else p.to(device=device)
98
+ for p in self.shadow_params
99
+ ]
100
+
101
+ def compute_visor_loss(attn_maps_mid, attn_maps_up, obj_a_positions, obj_b_positions, relationship):
102
+ loss = 0
103
+ for attn_map_integrated in attn_maps_mid:
104
+ attn_map = attn_map_integrated.chunk(2)[1]
105
+
106
+ #
107
+ b, i, j = attn_map.shape
108
+ H = W = int(math.sqrt(i))
109
+ weight_matrix_x = torch.zeros(size=(H, W)).cuda()
110
+ weight_matrix_y = torch.zeros(size=(H, W)).cuda()
111
+ for x_indx in range(W):
112
+ weight_matrix_x[:, x_indx] = x_indx
113
+ for y_indx in range(H):
114
+ weight_matrix_y[y_indx, :] = y_indx
115
+
116
+ # for obj_idx in range(object_number):
117
+ #
118
+ # bbox = bboxes[obj_idx]
119
+ obj_a_avg_x_total = 0
120
+ obj_a_avg_y_total = 0
121
+ for obj_a_position in obj_a_positions:
122
+ ca_map_obj = attn_map[:, :, obj_a_position].reshape(b, H, W)
123
+ # pdb.set_trace()
124
+
125
+ obj_a_avg_x = (ca_map_obj * weight_matrix_x.unsqueeze(0)).reshape(b, -1).sum(-1)/ca_map_obj.reshape(b,-1).sum(-1)
126
+ obj_a_avg_y = (ca_map_obj * weight_matrix_y.unsqueeze(0)).reshape(b, -1).sum(-1)/ca_map_obj.reshape(b,-1).sum(-1)
127
+ obj_a_avg_x_total += obj_a_avg_x
128
+ obj_a_avg_y_total += obj_a_avg_y
129
+ obj_a_avg_x_total = (obj_a_avg_x_total/len(obj_a_positions)).mean() / W
130
+ obj_a_avg_y_total = (obj_a_avg_y_total/len(obj_a_positions)).mean() / H
131
+ print('mid: obj_a_avg_x_total', obj_a_avg_x_total)
132
+
133
+ obj_b_avg_x_total = 0
134
+ obj_b_avg_y_total = 0
135
+ for obj_b_position in obj_b_positions:
136
+ ca_map_obj = attn_map[:, :, obj_b_position].reshape(b, H, W)
137
+ obj_b_avg_x = (ca_map_obj * weight_matrix_x.unsqueeze(0)).reshape(b, -1).sum(-1)/ca_map_obj.reshape(b,-1).sum(-1)
138
+ obj_b_avg_y = (ca_map_obj * weight_matrix_y.unsqueeze(0)).reshape(b, -1).sum(-1)/ca_map_obj.reshape(b,-1).sum(-1)
139
+ obj_b_avg_x_total += obj_b_avg_x
140
+ obj_b_avg_y_total += obj_b_avg_y
141
+
142
+ obj_b_avg_x_total = (obj_b_avg_x_total/len(obj_b_positions)).mean() / W
143
+ obj_b_avg_y_total = (obj_b_avg_y_total/len(obj_b_positions)).mean() / H
144
+ print('mid: obj_b_avg_x_total', obj_b_avg_x_total)
145
+
146
+ if relationship == 0:
147
+ loss += (obj_b_avg_x_total - obj_a_avg_x_total)
148
+ elif relationship == 1:
149
+ loss += (obj_a_avg_x_total - obj_b_avg_x_total)
150
+ elif relationship == 2:
151
+ loss += (obj_b_avg_y_total - obj_a_avg_y_total)
152
+ elif relationship == 3:
153
+ loss += (obj_a_avg_y_total - obj_b_avg_y_total)
154
+
155
+
156
+ for attn_map_integrated in attn_maps_up[0]:
157
+ attn_map = attn_map_integrated.chunk(2)[1]
158
+
159
+ b, i, j = attn_map.shape
160
+ H = W = int(math.sqrt(i))
161
+ weight_matrix_x = torch.zeros(size=(H, W)).cuda()
162
+ weight_matrix_y = torch.zeros(size=(H, W)).cuda()
163
+ for x_indx in range(W):
164
+ weight_matrix_x[:, x_indx] = x_indx
165
+ for y_indx in range(H):
166
+ weight_matrix_y[y_indx, :] = y_indx
167
+
168
+ # for obj_idx in range(object_number):
169
+ #
170
+ # bbox = bboxes[obj_idx]
171
+ obj_a_avg_x_total = 0
172
+ obj_a_avg_y_total = 0
173
+ for obj_a_position in obj_a_positions:
174
+ ca_map_obj = attn_map[:, :, obj_a_position].reshape(b, H, W)
175
+ obj_a_avg_x = (ca_map_obj * weight_matrix_x.unsqueeze(0)).reshape(b, -1).sum(-1) / ca_map_obj.reshape(b, -1).sum(-1)
176
+ obj_a_avg_y = (ca_map_obj * weight_matrix_y.unsqueeze(0)).reshape(b, -1).sum(-1) / ca_map_obj.reshape(b, -1).sum(-1)
177
+ obj_a_avg_x_total += obj_a_avg_x
178
+ obj_a_avg_y_total += obj_a_avg_y
179
+ obj_a_avg_x_total = (obj_a_avg_x_total / len(obj_a_positions)).mean() / W
180
+ obj_a_avg_y_total = (obj_a_avg_y_total / len(obj_a_positions)).mean() / H
181
+ print('up: obj_a_avg_x_total', obj_a_avg_x_total)
182
+
183
+ obj_b_avg_x_total = 0
184
+ obj_b_avg_y_total = 0
185
+
186
+ for obj_b_position in obj_b_positions:
187
+ ca_map_obj = attn_map[:, :, obj_b_position].reshape(b, H, W)
188
+ obj_b_avg_x = (ca_map_obj * weight_matrix_x.unsqueeze(0)).reshape(b, -1).sum(-1) / ca_map_obj.reshape(b, -1).sum(-1)
189
+ obj_b_avg_y = (ca_map_obj * weight_matrix_y.unsqueeze(0)).reshape(b, -1).sum(-1) / ca_map_obj.reshape(b, -1).sum(-1)
190
+ obj_b_avg_x_total += obj_b_avg_x
191
+ obj_b_avg_y_total += obj_b_avg_y
192
+
193
+ obj_b_avg_x_total = (obj_b_avg_x_total / len(obj_b_positions)).mean() / W
194
+ obj_b_avg_y_total = (obj_b_avg_y_total / len(obj_b_positions)).mean() / H
195
+ print('up: obj_b_avg_x_total', obj_b_avg_x_total)
196
+
197
+ if relationship == 0:
198
+ loss += (obj_a_avg_x_total - obj_b_avg_x_total)
199
+ elif relationship == 1:
200
+ loss += (obj_b_avg_x_total - obj_a_avg_x_total)
201
+ elif relationship == 2:
202
+ loss += (obj_a_avg_y_total - obj_b_avg_y_total)
203
+ elif relationship == 3:
204
+ loss += (obj_b_avg_y_total - obj_a_avg_y_total)
205
+
206
+
207
+ loss = loss / (len(attn_maps_up[0]) + len(attn_maps_mid))
208
+ return loss
209
+
210
+
211
+ @hydra.main(version_base=None, config_path="conf", config_name="config_visor_box")
212
+ def train(cfg: DictConfig):
213
+ # fix the randomness of torch
214
+
215
+ print(cfg)
216
+ with open('./conf/unet/origin_config.json') as f:
217
+ unet_config = json.load(f)
218
+ unet = unet_2d_condition.UNet2DConditionModel(**unet_config)
219
+ # ckp = torch.load('/Users/shil5883/Downloads/diffusion_pytorch_model.bin', map_location='cpu')
220
+ # prev_attn_map = torch.load('./attn_map.ckp', map_location='cpu')
221
+
222
+ ckp = torch.load('/work/minghao/chess_gen/diffusion_pytorch_model.bin', map_location='cpu')
223
+ prev_attn_map = torch.load('/work/minghao/chess_gen/visual_attn/2023-02-02/15-05-51/epoch_100_sche_constant_lr_1e-06_ac_1/attn_map.ckp', map_location='cpu')
224
+
225
+ # prev_attn_map = torch.load('/work/minghao/chess_gen/visual_attn/2023-01-16/18-58-12/epoch_100_sche_constant_lr_1e-06_ac_1/attn_map.ckp', map_location='cpu')
226
+ unet.load_state_dict(ckp)
227
+ unet_original = UNet2DConditionModel(**unet_config)
228
+ unet_original.load_state_dict(ckp)
229
+ date_now, time_now = datetime.now().strftime("%Y-%m-%d,%H-%M-%S").split(',')
230
+
231
+ # cfg.general.save_path = os.path.join(cfg.general.save_path, date_now, time_now)
232
+ # if not os.path.exists(cfg.general.save_path ):
233
+ # os.makedirs(cfg.general.save_path)
234
+ # cfg.general.save_path
235
+ mixed_precision = 'fp16' if torch.cuda.is_available() else 'no'
236
+ accelerator = Accelerator(
237
+ gradient_accumulation_steps=cfg.training.accumulate_step,
238
+ mixed_precision=mixed_precision,
239
+ log_with="tensorboard",
240
+ logging_dir='./',
241
+ )
242
+ # initialize dataset and dataloader
243
+ if accelerator.is_main_process:
244
+ print("Loading the dataset!!!!!")
245
+ tokenizer = CLIPTokenizer.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="tokenizer")
246
+ # train_dataset = ICLEVERDataset(cfg.data.data_path, tokenizer, cfg, prefix='train')
247
+ # val_dataset = ICLEVERDataset(cfg.data.data_path, tokenizer, cfg, prefix='val')
248
+ # train_loader = DataLoader(train_dataset, batch_size=cfg.training.batch_size, shuffle=True, num_workers=2, pin_memory=False)
249
+ # val_loader = DataLoader(val_dataset, batch_size=cfg.training.batch_size * 2, shuffle=True, num_workers=2, pin_memory=False)
250
+
251
+ if accelerator.is_main_process:
252
+ print("Complete loading the dataset!!!!!")
253
+
254
+ if accelerator.is_main_process:
255
+ print("Complete load the noise scheduler!!!!!")
256
+ with open("config.yaml", "w") as f:
257
+ OmegaConf.save(cfg, f)
258
+ if not os.path.exists(cfg.general.save_path) and accelerator.is_main_process:
259
+ os.makedirs(cfg.general.save_path)
260
+ if accelerator.is_main_process:
261
+ print("saved load the noise scheduler!!!!!")
262
+
263
+ # Move unet to device
264
+ device = "cuda" if torch.cuda.is_available() else "cpu"
265
+ # load pretrained models and scheduler
266
+ text_encoder = CLIPTextModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="text_encoder")
267
+ vae = AutoencoderKL.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="vae")
268
+
269
+ # boards_embedder.to(device)
270
+ if accelerator.is_main_process:
271
+ print("move the model to device!!!!!")
272
+ # Keep vae and unet in eval model as we don't train these
273
+
274
+ # Initialize the optimizer
275
+ cfg.training.lr = (
276
+ cfg.training.lr * cfg.training.accumulate_step * cfg.training.batch_size * accelerator.num_processes
277
+ )
278
+ # Move vae and unet to device
279
+ vae.to(device)
280
+ unet.to(device)
281
+ text_encoder.to(device)
282
+ # prev_attn_map.to(device)
283
+ unet_original.to(device)
284
+ vae.eval()
285
+ unet.eval()
286
+ text_encoder.eval()
287
+ unet_original.eval()
288
+ # tokenizer.to(device)
289
+ # if accelerator.is_main_process:
290
+ print("prepare the accelerator module at process: {}!!!!!".format(accelerator.process_index))
291
+ # unet = accelerator.prepare(unet)
292
+
293
+ print("done the accelerator module at process: {}!!!!!".format(accelerator.process_index))
294
+
295
+ # Create EMA for the unet.
296
+ # if cfg.training.use_ema:
297
+ # ema_unet = EMAModel(unet.parameters())
298
+ # ema_encoder = EMAModel(boards_embedder.parameters())
299
+ ema_unet = None
300
+ # print(start_ema)
301
+ if cfg.training.use_ema:
302
+ if accelerator.is_main_process:
303
+ print("Using the EMA model!!!!!")
304
+ print("start EMA at process: {}!!!!!".format(accelerator.process_index))
305
+
306
+ ema_unet = EMAModel(unet.parameters())
307
+ # ema_encoder = EMAModel(boards_embedder.parameters())
308
+
309
+ # prompt = 'A traffic light below a sink'
310
+ templates = ['{} to the left of {}', '{} to the right of {}', '{} above {}', '{} below {}']
311
+ bboxes_template = [[0.0, 0.0, 0.5, 1.0], [0.0, 0.0, 1.0, 0.5], [0.5, 0.0, 1.0, 1.0], [0.0, 0.5, 1.0, 1.0]]
312
+ bboxes_template_list = [[0, 2], [2, 0], [1, 3], [3, 1]]
313
+ iteration_start = cfg.inference.start_pair
314
+ iteration_now = iteration_start
315
+ iteration_interval = cfg.inference.iteration_interval
316
+ with open('./coco_paris.txt', 'r') as f:
317
+ image_pairs = f.readlines()
318
+ for image_pair in tqdm(image_pairs[iteration_start: iteration_start + iteration_interval]):
319
+ obj_a, obj_b = image_pair.strip().split(',')[0], image_pair.strip().split(',')[1]
320
+ obj_a = 'A {}'.format(obj_a) if obj_a[0] not in ['a', 'e', 'i', 'o', 'u'] else 'An {}'.format(obj_a)
321
+ obj_b = 'a {}'.format(obj_b) if obj_b[0] not in ['a', 'e', 'i', 'o', 'u'] else 'an {}'.format(obj_b)
322
+ for idx, template in enumerate(templates):
323
+ prompt = template.format(obj_a, obj_b)
324
+ obj_a_len = len(obj_a.split(' ')) - 1
325
+ obj_a_position = [2] if obj_a_len == 1 else [2, 3]
326
+ obj_b_position = [obj_a_len + 1 + len(template.split(' ')) + i for i in range(len(obj_b.split(' '))-1)]
327
+ obj_positions = [obj_a_position, obj_b_position]
328
+ obj_a_boxes = [bboxes_template[bboxes_template_list[idx][0]].copy() for _ in range(len(obj_a.split(' ')) - 1)]
329
+ obj_b_boxes = [bboxes_template[bboxes_template_list[idx][1]].copy() for _ in range(len(obj_b.split(' ')) - 1)]
330
+ obj_boxes = [obj_a_boxes, obj_b_boxes]
331
+ print(prompt, obj_positions, obj_boxes)
332
+ # for infer_iter in range(1):
333
+ inference(device, unet, unet_original, vae, tokenizer, text_encoder, prompt, cfg, prev_attn_map, bboxes=obj_boxes, object_positions=obj_positions, infer_iter=cfg.inference.infer_iter, pair_id=iteration_now)
334
+
335
+
336
+ obj_b, obj_a = image_pair.strip().split(',')[0], image_pair.strip().split(',')[1]
337
+ obj_a = 'A {}'.format(obj_a) if obj_a[0] not in ['a', 'e', 'i', 'o', 'u'] else 'An {}'.format(obj_a)
338
+ obj_b = 'a {}'.format(obj_b) if obj_b[0] not in ['a', 'e', 'i', 'o', 'u'] else 'an {}'.format(obj_b)
339
+ for idx, template in enumerate(templates):
340
+ prompt = template.format(obj_a, obj_b)
341
+ obj_a_len = len(obj_a.split(' ')) - 1
342
+ obj_a_position = [2] if obj_a_len == 1 else [2, 3]
343
+ obj_b_position = [obj_a_len + 1 + len(template.split(' ')) + i for i in range(len(obj_b.split(' '))-1)]
344
+ obj_positions = [obj_a_position, obj_b_position]
345
+ obj_a_boxes = [bboxes_template[bboxes_template_list[idx][0]].copy() for _ in range(len(obj_a.split(' ')) - 1)]
346
+ obj_b_boxes = [bboxes_template[bboxes_template_list[idx][1]].copy() for _ in range(len(obj_b.split(' ')) - 1)]
347
+ obj_boxes = [obj_a_boxes, obj_b_boxes]
348
+ print(prompt, obj_positions, obj_boxes)
349
+ inference(device, unet, unet_original, vae, tokenizer, text_encoder, prompt, cfg, prev_attn_map, bboxes=obj_boxes, object_positions=obj_positions, infer_iter=cfg.inference.infer_iter, pair_id=iteration_now)
350
+ iteration_now += 1
351
+ def compute_ca_loss(attn_maps_mid, attn_maps_up, bboxes, object_positions):
352
+ loss = 0
353
+ object_number = len(bboxes)
354
+ if object_number == 0:
355
+ return torch.tensor(0).float().cuda()
356
+ for attn_map_integrated in attn_maps_mid:
357
+ attn_map = attn_map_integrated.chunk(2)[1]
358
+
359
+ #
360
+ b, i, j = attn_map.shape
361
+ H = W = int(math.sqrt(i))
362
+ # pdb.set_trace()
363
+ for obj_idx in range(object_number):
364
+ obj_loss = 0
365
+ mask = torch.zeros(size=(H, W)).cuda()
366
+ for obj_box in bboxes[obj_idx]:
367
+
368
+ x_min, y_min, x_max, y_max = int(obj_box[0] * W), \
369
+ int(obj_box[1] * H), int(obj_box[2] * W), int(obj_box[3] * H)
370
+ mask[y_min: y_max, x_min: x_max] = 1
371
+
372
+ for obj_position in object_positions[obj_idx]:
373
+ ca_map_obj = attn_map[:, :, obj_position].reshape(b, H, W)
374
+ # ca_map_obj = attn_map[:, :, object_positions[obj_position]].reshape(b, H, W)
375
+
376
+ activation_value = (ca_map_obj * mask).reshape(b, -1).sum(dim=-1)/ca_map_obj.reshape(b, -1).sum(dim=-1)
377
+
378
+ obj_loss += torch.mean((1 - activation_value) ** 2)
379
+ loss += (obj_loss/len(object_positions[obj_idx]))
380
+ # print("??", obj_idx, obj_loss/len(object_positions[obj_idx]))
381
+
382
+ # compute loss on padding tokens
383
+ # activation_value = torch.zeros(size=(b, )).cuda()
384
+ # for obj_idx in range(object_number):
385
+ # bbox = bboxes[obj_idx]
386
+ # ca_map_obj = attn_map[:, :, padding_start:].reshape(b, H, W, -1)
387
+ # activation_value += ca_map_obj[:, int(bbox[0] * H): int(bbox[1] * H),
388
+ # int(bbox[2] * W): int(bbox[3] * W), :].reshape(b, -1).sum(dim=-1) / ca_map_obj.reshape(b, -1).sum(dim=-1)
389
+ #
390
+ # loss += torch.mean((1 - activation_value) ** 2)
391
+
392
+
393
+ for attn_map_integrated in attn_maps_up[0]:
394
+ attn_map = attn_map_integrated.chunk(2)[1]
395
+ #
396
+ b, i, j = attn_map.shape
397
+ H = W = int(math.sqrt(i))
398
+
399
+ for obj_idx in range(object_number):
400
+ obj_loss = 0
401
+ mask = torch.zeros(size=(H, W)).cuda()
402
+ for obj_box in bboxes[obj_idx]:
403
+ x_min, y_min, x_max, y_max = int(obj_box[0] * W), \
404
+ int(obj_box[1] * H), int(obj_box[2] * W), int(obj_box[3] * H)
405
+ mask[y_min: y_max, x_min: x_max] = 1
406
+
407
+ for obj_position in object_positions[obj_idx]:
408
+ ca_map_obj = attn_map[:, :, obj_position].reshape(b, H, W)
409
+ # ca_map_obj = attn_map[:, :, object_positions[obj_position]].reshape(b, H, W)
410
+
411
+ activation_value = (ca_map_obj * mask).reshape(b, -1).sum(dim=-1) / ca_map_obj.reshape(b, -1).sum(
412
+ dim=-1)
413
+
414
+ obj_loss += torch.mean((1 - activation_value) ** 2)
415
+ loss += (obj_loss / len(object_positions[obj_idx]))
416
+
417
+ # compute loss on padding tokens
418
+ # activation_value = torch.zeros(size=(b, )).cuda()
419
+ # for obj_idx in range(object_number):
420
+ # bbox = bboxes[obj_idx]
421
+ # ca_map_obj = attn_map[:, :,padding_start:].reshape(b, H, W, -1)
422
+ # activation_value += ca_map_obj[:, int(bbox[0] * H): int(bbox[1] * H),
423
+ # int(bbox[2] * W): int(bbox[3] * W), :].reshape(b, -1).sum(dim=-1) / ca_map_obj.reshape(b, -1).sum(dim=-1)
424
+ #
425
+ # loss += torch.mean((1 - activation_value) ** 2)
426
+ loss = loss / (object_number * (len(attn_maps_up[0]) + len(attn_maps_mid)))
427
+ return loss
428
+ def plt_all_attn_map_in_one(attn_map_integrated_list_down, attn_map_integrated_list_mid, attn_map_integrated_list_up, image, prompt, cfg, t, prefix='all'):
429
+
430
+ prompt_split = prompt.split(' ')
431
+ prompt_len = len(prompt_split) + 4
432
+
433
+ total_layers = len(attn_map_integrated_list_down) + len(attn_map_integrated_list_mid) + len(attn_map_integrated_list_up)
434
+ fig, axs = plt.subplots(nrows=total_layers+1, ncols=prompt_len, figsize=(4 * prompt_len, 4 * total_layers))
435
+ fig.suptitle(prompt, fontsize=32)
436
+ fig.tight_layout()
437
+ cnt = 1
438
+ ax = axs[0][0]
439
+ ax.imshow(image)
440
+ for prompt_idx in range(prompt_len):
441
+ ax = axs[0][prompt_idx]
442
+ ax.set_axis_off()
443
+ for layer, attn_map_integrated in enumerate(attn_map_integrated_list_down):
444
+ attn_map_uncond, attn_map = attn_map_integrated.chunk(2)
445
+ grid_size = int(math.sqrt(attn_map.shape[1]))
446
+ for prompt_idx in range(prompt_len):
447
+ ax = axs[cnt][prompt_idx]
448
+ if prompt_idx == 0:
449
+ ax.set_ylabel('down {}'.format(layer), rotation=0, size='large')
450
+ mask = attn_map.mean(dim=0)[:, prompt_idx].reshape(grid_size, grid_size).detach().cpu().numpy()
451
+ im = ax.imshow(mask, cmap='YlGn')
452
+ ax.set_axis_off()
453
+ cnt += 1
454
+
455
+ for layer, attn_map_integrated in enumerate(attn_map_integrated_list_mid):
456
+ attn_map_uncond, attn_map = attn_map_integrated.chunk(2)
457
+ grid_size = int(math.sqrt(attn_map.shape[1]))
458
+ for prompt_idx in range(prompt_len):
459
+ ax = axs[cnt][prompt_idx]
460
+ if prompt_idx ==0:
461
+ ax.set_ylabel('mid {}'.format(layer), rotation=0, size='large')
462
+ mask = attn_map.mean(dim=0)[:, prompt_idx].reshape(grid_size, grid_size).detach().cpu().numpy()
463
+ im = ax.imshow(mask, cmap='YlGn')
464
+ ax.set_axis_off()
465
+ cnt += 1
466
+
467
+ for layer, attn_map_integrated in enumerate(attn_map_integrated_list_up):
468
+ attn_map_uncond, attn_map = attn_map_integrated.chunk(2)
469
+ grid_size = int(math.sqrt(attn_map.shape[1]))
470
+ for prompt_idx in range(prompt_len):
471
+ ax = axs[cnt][prompt_idx]
472
+ if prompt_idx ==0:
473
+ ax.set_ylabel('up {}'.format(layer), rotation=0, size='large')
474
+ mask = attn_map.mean(dim=0)[:, prompt_idx].reshape(grid_size, grid_size).detach().cpu().numpy()
475
+ im = ax.imshow(mask, cmap='YlGn')
476
+ ax.set_axis_off()
477
+ cnt += 1
478
+
479
+ if not os.path.exists(cfg.general.save_path + "/{}".format(prefix)):
480
+ os.makedirs(cfg.general.save_path + "/{}".format(prefix))
481
+ plt.savefig(cfg.general.save_path + "/{}/step_{}.png".format(prefix, str(int(t)).zfill(4)))
482
+ # generate_video()
483
+ plt.close()
484
+
485
+ if __name__=="__main__":
486
+ train()
487
+
488
+
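compute_ca_loss above expects attention maps whose first dimension stacks the unconditional and conditional halves (it keeps the second half via chunk(2)[1]), per-object lists of normalized [x_min, y_min, x_max, y_max] boxes, and per-object token indices; its masks are allocated with .cuda(), so a GPU is required. A quick smoke test with random tensors, assuming the repository's dependencies are installed and the script is run from the repo root:

import torch
from layout_guidance.inference import compute_ca_loss

# Fake 16x16 cross-attention maps over 77 tokens, stacked as an uncond/cond pair.
attn = torch.rand(2, 16 * 16, 77, device='cuda').softmax(dim=-1)
bboxes = [[[0.1, 0.1, 0.5, 0.5]]]   # one object, one normalized box
object_positions = [[2]]            # token index of that object in the prompt

loss = compute_ca_loss(attn_maps_mid=[attn], attn_maps_up=[[attn]],
                       bboxes=bboxes, object_positions=object_positions)
print(loss.item())  # lower values mean more of the object's attention falls inside its box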
my_model/__init__.py ADDED
File without changes
my_model/attention.py ADDED
@@ -0,0 +1,929 @@
1
+ # Copyright 2022 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import math
15
+ from dataclasses import dataclass
16
+ from typing import Optional
17
+ import numpy as np
18
+ import torch
19
+ import torch.nn.functional as F
20
+ from torch import nn
21
+
22
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
23
+ from diffusers.modeling_utils import ModelMixin
24
+ from diffusers.models.embeddings import ImagePositionalEmbeddings
25
+ from diffusers.utils import BaseOutput
26
+ from diffusers.utils.import_utils import is_xformers_available
27
+ import scipy
28
+
29
+ @dataclass
30
+ class Transformer2DModelOutput(BaseOutput):
31
+ """
32
+ Args:
33
+ sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete):
34
+ Hidden states conditioned on `encoder_hidden_states` input. If discrete, returns probability distributions
35
+ for the unnoised latent pixels.
36
+ """
37
+
38
+ sample: torch.FloatTensor
39
+
40
+
41
+ if is_xformers_available():
42
+ import xformers
43
+ import xformers.ops
44
+ else:
45
+ xformers = None
46
+
47
+
48
+ class Transformer2DModel(ModelMixin, ConfigMixin):
49
+ """
50
+ Transformer model for image-like data. Takes either discrete (classes of vector embeddings) or continuous (actual
51
+ embeddings) inputs.
52
+
53
+ When input is continuous: First, project the input (aka embedding) and reshape to b, t, d. Then apply standard
54
+ transformer action. Finally, reshape to image.
55
+
56
+ When input is discrete: First, input (classes of latent pixels) is converted to embeddings and has positional
57
+ embeddings applied, see `ImagePositionalEmbeddings`. Then apply standard transformer action. Finally, predict
58
+ classes of unnoised image.
59
+
60
+ Note that it is assumed one of the input classes is the masked latent pixel. The predicted classes of the unnoised
61
+ image do not contain a prediction for the masked pixel as the unnoised image cannot be masked.
62
+
63
+ Parameters:
64
+ num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
65
+ attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
66
+ in_channels (`int`, *optional*):
67
+ Pass if the input is continuous. The number of channels in the input and output.
68
+ num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
69
+ dropout (`float`, *optional*, defaults to 0.1): The dropout probability to use.
70
+ cross_attention_dim (`int`, *optional*): The number of context dimensions to use.
71
+ sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images.
72
+ Note that this is fixed at training time as it is used for learning a number of position embeddings. See
73
+ `ImagePositionalEmbeddings`.
74
+ num_vector_embeds (`int`, *optional*):
75
+ Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels.
76
+ Includes the class for the masked latent pixel.
77
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
78
+ num_embeds_ada_norm ( `int`, *optional*): Pass if at least one of the norm_layers is `AdaLayerNorm`.
79
+ The number of diffusion steps used during training. Note that this is fixed at training time as it is used
80
+ to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for
81
+ up to, but not more than, `num_embeds_ada_norm` steps.
82
+ attention_bias (`bool`, *optional*):
83
+ Configure if the TransformerBlocks' attention should contain a bias parameter.
84
+ """
85
+
86
+ @register_to_config
87
+ def __init__(
88
+ self,
89
+ num_attention_heads: int = 16,
90
+ attention_head_dim: int = 88,
91
+ in_channels: Optional[int] = None,
92
+ num_layers: int = 1,
93
+ dropout: float = 0.0,
94
+ norm_num_groups: int = 32,
95
+ cross_attention_dim: Optional[int] = None,
96
+ attention_bias: bool = False,
97
+ sample_size: Optional[int] = None,
98
+ num_vector_embeds: Optional[int] = None,
99
+ activation_fn: str = "geglu",
100
+ num_embeds_ada_norm: Optional[int] = None,
101
+ ):
102
+ super().__init__()
103
+ self.num_attention_heads = num_attention_heads
104
+ self.attention_head_dim = attention_head_dim
105
+ inner_dim = num_attention_heads * attention_head_dim
106
+
107
+ # 1. Transformer2DModel can process both standard continuous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)`
108
+ # Define whether input is continuous or discrete depending on configuration
109
+ self.is_input_continuous = in_channels is not None
110
+ self.is_input_vectorized = num_vector_embeds is not None
111
+
112
+ if self.is_input_continuous and self.is_input_vectorized:
113
+ raise ValueError(
114
+ f"Cannot define both `in_channels`: {in_channels} and `num_vector_embeds`: {num_vector_embeds}. Make"
115
+ " sure that either `in_channels` or `num_vector_embeds` is None."
116
+ )
117
+ elif not self.is_input_continuous and not self.is_input_vectorized:
118
+ raise ValueError(
119
+ f"Has to define either `in_channels`: {in_channels} or `num_vector_embeds`: {num_vector_embeds}. Make"
120
+ " sure that either `in_channels` or `num_vector_embeds` is not None."
121
+ )
122
+
123
+ # 2. Define input layers
124
+ if self.is_input_continuous:
125
+ self.in_channels = in_channels
126
+
127
+ self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
128
+ self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
129
+ elif self.is_input_vectorized:
130
+ assert sample_size is not None, "Transformer2DModel over discrete input must provide sample_size"
131
+ assert num_vector_embeds is not None, "Transformer2DModel over discrete input must provide num_embed"
132
+
133
+ self.height = sample_size
134
+ self.width = sample_size
135
+ self.num_vector_embeds = num_vector_embeds
136
+ self.num_latent_pixels = self.height * self.width
137
+
138
+ self.latent_image_embedding = ImagePositionalEmbeddings(
139
+ num_embed=num_vector_embeds, embed_dim=inner_dim, height=self.height, width=self.width
140
+ )
141
+
142
+ # 3. Define transformers blocks
143
+ self.transformer_blocks = nn.ModuleList(
144
+ [
145
+ BasicTransformerBlock(
146
+ inner_dim,
147
+ num_attention_heads,
148
+ attention_head_dim,
149
+ dropout=dropout,
150
+ cross_attention_dim=cross_attention_dim,
151
+ activation_fn=activation_fn,
152
+ num_embeds_ada_norm=num_embeds_ada_norm,
153
+ attention_bias=attention_bias,
154
+ )
155
+ for d in range(num_layers)
156
+ ]
157
+ )
158
+
159
+ # 4. Define output layers
160
+ if self.is_input_continuous:
161
+ self.proj_out = nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
162
+ elif self.is_input_vectorized:
163
+ self.norm_out = nn.LayerNorm(inner_dim)
164
+ self.out = nn.Linear(inner_dim, self.num_vector_embeds - 1)
165
+
166
+ def _set_attention_slice(self, slice_size):
167
+ for block in self.transformer_blocks:
168
+ block._set_attention_slice(slice_size)
169
+
170
+ def forward(self, hidden_states, encoder_hidden_states=None, timestep=None, attn_map=None, attn_shift=False, obj_ids=None, relationship=None, return_dict: bool = True):
171
+ """
172
+ Args:
173
+ hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`.
174
+ When continuous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input
175
+ hidden_states
176
+ encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, context dim)`, *optional*):
177
+ Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
178
+ self-attention.
179
+ timestep ( `torch.long`, *optional*):
180
+ Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step.
181
+ return_dict (`bool`, *optional*, defaults to `True`):
182
+ Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
183
+
184
+ Returns:
185
+ [`~models.attention.Transformer2DModelOutput`] or `tuple`: [`~models.attention.Transformer2DModelOutput`]
186
+ if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is the sample
187
+ tensor.
188
+ """
189
+ # 1. Input
190
+ if self.is_input_continuous:
191
+ batch, channel, height, weight = hidden_states.shape
192
+ residual = hidden_states
193
+ hidden_states = self.norm(hidden_states)
194
+ hidden_states = self.proj_in(hidden_states)
195
+ inner_dim = hidden_states.shape[1]
196
+ hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)
197
+ elif self.is_input_vectorized:
198
+ hidden_states = self.latent_image_embedding(hidden_states)
199
+
200
+ # 2. Blocks
201
+ for block in self.transformer_blocks:
202
+ hidden_states, cross_attn_prob, save_key = block(hidden_states, context=encoder_hidden_states, timestep=timestep, attn_map=attn_map, attn_shift=attn_shift, obj_ids=obj_ids, relationship=relationship)
203
+
204
+ # 3. Output
205
+ if self.is_input_continuous:
206
+ hidden_states = hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2)
207
+ hidden_states = self.proj_out(hidden_states)
208
+ output = hidden_states + residual
209
+ elif self.is_input_vectorized:
210
+ hidden_states = self.norm_out(hidden_states)
211
+ logits = self.out(hidden_states)
212
+ # (batch, self.num_vector_embeds - 1, self.num_latent_pixels)
213
+ logits = logits.permute(0, 2, 1)
214
+
215
+ # log(p(x_0))
216
+ output = F.log_softmax(logits.double(), dim=1).float()
217
+
218
+ if not return_dict:
219
+ return (output,)
220
+
221
+ return Transformer2DModelOutput(sample=output), cross_attn_prob, save_key
222
+
223
+ def _set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
224
+ for block in self.transformer_blocks:
225
+ block._set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
226
+
227
+
228
+ class AttentionBlock(nn.Module):
229
+ """
230
+ An attention block that allows spatial positions to attend to each other. Originally ported from here, but adapted
231
+ to the N-d case.
232
+ https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
233
+ Uses three q, k, v linear layers to compute attention.
234
+
235
+ Parameters:
236
+ channels (`int`): The number of channels in the input and output.
237
+ num_head_channels (`int`, *optional*):
238
+ The number of channels in each head. If None, then `num_heads` = 1.
239
+ norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for group norm.
240
+ rescale_output_factor (`float`, *optional*, defaults to 1.0): The factor to rescale the output by.
241
+ eps (`float`, *optional*, defaults to 1e-5): The epsilon value to use for group norm.
242
+ """
243
+
244
+ def __init__(
245
+ self,
246
+ channels: int,
247
+ num_head_channels: Optional[int] = None,
248
+ norm_num_groups: int = 32,
249
+ rescale_output_factor: float = 1.0,
250
+ eps: float = 1e-5,
251
+ ):
252
+ super().__init__()
253
+ self.channels = channels
254
+
255
+ self.num_heads = channels // num_head_channels if num_head_channels is not None else 1
256
+ self.num_head_size = num_head_channels
257
+ self.group_norm = nn.GroupNorm(num_channels=channels, num_groups=norm_num_groups, eps=eps, affine=True)
258
+
259
+ # define q,k,v as linear layers
260
+ self.query = nn.Linear(channels, channels)
261
+ self.key = nn.Linear(channels, channels)
262
+ self.value = nn.Linear(channels, channels)
263
+
264
+ self.rescale_output_factor = rescale_output_factor
265
+ self.proj_attn = nn.Linear(channels, channels, 1)
266
+
267
+ def transpose_for_scores(self, projection: torch.Tensor) -> torch.Tensor:
268
+ new_projection_shape = projection.size()[:-1] + (self.num_heads, -1)
269
+ # move heads to 2nd position (B, T, H * D) -> (B, T, H, D) -> (B, H, T, D)
270
+ new_projection = projection.view(new_projection_shape).permute(0, 2, 1, 3)
271
+ return new_projection
272
+
273
+ def forward(self, hidden_states):
274
+ residual = hidden_states
275
+ batch, channel, height, width = hidden_states.shape
276
+
277
+ # norm
278
+ hidden_states = self.group_norm(hidden_states)
279
+
280
+ hidden_states = hidden_states.view(batch, channel, height * width).transpose(1, 2)
281
+
282
+ # proj to q, k, v
283
+ query_proj = self.query(hidden_states)
284
+ key_proj = self.key(hidden_states)
285
+ value_proj = self.value(hidden_states)
286
+
287
+ # transpose
288
+ query_states = self.transpose_for_scores(query_proj)
289
+ key_states = self.transpose_for_scores(key_proj)
290
+ value_states = self.transpose_for_scores(value_proj)
291
+
292
+ # get scores
293
+ scale = 1 / math.sqrt(math.sqrt(self.channels / self.num_heads))
294
+ attention_scores = torch.matmul(query_states * scale, key_states.transpose(-1, -2) * scale) # TODO: use baddmm
295
+ attention_probs = torch.softmax(attention_scores.float(), dim=-1).type(attention_scores.dtype)
296
+
297
+ # compute attention output
298
+ hidden_states = torch.matmul(attention_probs, value_states)
299
+
300
+ hidden_states = hidden_states.permute(0, 2, 1, 3).contiguous()
301
+ new_hidden_states_shape = hidden_states.size()[:-2] + (self.channels,)
302
+ hidden_states = hidden_states.view(new_hidden_states_shape)
303
+
304
+ # compute next hidden_states
305
+ hidden_states = self.proj_attn(hidden_states)
306
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch, channel, height, width)
307
+
308
+ # res connect and rescale
309
+ hidden_states = (hidden_states + residual) / self.rescale_output_factor
310
+ return hidden_states
311
+
312
+
313
+ class BasicTransformerBlock(nn.Module):
314
+ r"""
315
+ A basic Transformer block.
316
+
317
+ Parameters:
318
+ dim (`int`): The number of channels in the input and output.
319
+ num_attention_heads (`int`): The number of heads to use for multi-head attention.
320
+ attention_head_dim (`int`): The number of channels in each head.
321
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
322
+ cross_attention_dim (`int`, *optional*): The size of the context vector for cross attention.
323
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
324
+ num_embeds_ada_norm (:
325
+ obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`.
326
+ attention_bias (:
327
+ obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
328
+ """
329
+
330
+ def __init__(
331
+ self,
332
+ dim: int,
333
+ num_attention_heads: int,
334
+ attention_head_dim: int,
335
+ dropout=0.0,
336
+ cross_attention_dim: Optional[int] = None,
337
+ activation_fn: str = "geglu",
338
+ num_embeds_ada_norm: Optional[int] = None,
339
+ attention_bias: bool = False,
340
+ ):
341
+ super().__init__()
342
+ self.attn1 = CrossAttention(
343
+ query_dim=dim,
344
+ heads=num_attention_heads,
345
+ dim_head=attention_head_dim,
346
+ dropout=dropout,
347
+ bias=attention_bias,
348
+ ) # is a self-attention
349
+ self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn)
350
+ self.attn2 = CrossAttention(
351
+ query_dim=dim,
352
+ cross_attention_dim=cross_attention_dim,
353
+ heads=num_attention_heads,
354
+ dim_head=attention_head_dim,
355
+ dropout=dropout,
356
+ bias=attention_bias,
357
+ ) # is self-attn if context is none
358
+
359
+ # layer norms
360
+ self.use_ada_layer_norm = num_embeds_ada_norm is not None
361
+ if self.use_ada_layer_norm:
362
+ self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
363
+ self.norm2 = AdaLayerNorm(dim, num_embeds_ada_norm)
364
+ else:
365
+ self.norm1 = nn.LayerNorm(dim)
366
+ self.norm2 = nn.LayerNorm(dim)
367
+ self.norm3 = nn.LayerNorm(dim)
368
+
369
+ def _set_attention_slice(self, slice_size):
370
+ self.attn1._slice_size = slice_size
371
+ self.attn2._slice_size = slice_size
372
+
373
+ def _set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
374
+ if not is_xformers_available():
375
+ print("Here is how to install it")
376
+ raise ModuleNotFoundError(
377
+ "Refer to https://github.com/facebookresearch/xformers for more information on how to install"
378
+ " xformers",
379
+ name="xformers",
380
+ )
381
+ elif not torch.cuda.is_available():
382
+ raise ValueError(
383
+ "torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is only"
384
+ " available for GPU "
385
+ )
386
+ else:
387
+ try:
388
+ # Make sure we can run the memory efficient attention
389
+ _ = xformers.ops.memory_efficient_attention(
390
+ torch.randn((1, 2, 40), device="cuda"),
391
+ torch.randn((1, 2, 40), device="cuda"),
392
+ torch.randn((1, 2, 40), device="cuda"),
393
+ )
394
+ except Exception as e:
395
+ raise e
396
+ self.attn1._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers
397
+ self.attn2._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers
398
+
399
+ def forward(self, hidden_states, context=None, timestep=None, attn_map=None, attn_shift=False, obj_ids=None, relationship=None):
400
+ # 1. Self-Attention
401
+ norm_hidden_states = (
402
+ self.norm1(hidden_states, timestep) if self.use_ada_layer_norm else self.norm1(hidden_states)
403
+ )
404
+ tmp_hidden_states, cross_attn_prob, save_key = self.attn1(norm_hidden_states)
405
+ hidden_states = tmp_hidden_states + hidden_states
406
+
407
+ # 2. Cross-Attention
408
+ norm_hidden_states = (
409
+ self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
410
+ )
411
+ tmp_hidden_states, cross_attn_prob, save_key = self.attn2(norm_hidden_states, context=context, attn_map=attn_map, attn_shift=attn_shift, obj_ids=obj_ids, relationship=relationship)
412
+ hidden_states = tmp_hidden_states + hidden_states
413
+
414
+ # 3. Feed-forward
415
+ hidden_states = self.ff(self.norm3(hidden_states)) + hidden_states
416
+
417
+ return hidden_states, cross_attn_prob, save_key
418
+
419
+
420
+ class CrossAttention(nn.Module):
421
+ r"""
422
+ A cross attention layer.
423
+
424
+ Parameters:
425
+ query_dim (`int`): The number of channels in the query.
426
+ cross_attention_dim (`int`, *optional*):
427
+ The number of channels in the context. If not given, defaults to `query_dim`.
428
+ heads (`int`, *optional*, defaults to 8): The number of heads to use for multi-head attention.
429
+ dim_head (`int`, *optional*, defaults to 64): The number of channels in each head.
430
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
431
+ bias (`bool`, *optional*, defaults to False):
432
+ Set to `True` for the query, key, and value linear layers to contain a bias parameter.
433
+ """
434
+
435
+ def __init__(
436
+ self,
437
+ query_dim: int,
438
+ cross_attention_dim: Optional[int] = None,
439
+ heads: int = 8,
440
+ dim_head: int = 64,
441
+ dropout: float = 0.0,
442
+ bias=False,
443
+ ):
444
+ super().__init__()
445
+ inner_dim = dim_head * heads
446
+ cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
447
+
448
+ self.scale = dim_head**-0.5
449
+ self.heads = heads
450
+ # for slice_size > 0 the attention score computation
451
+ # is split across the batch axis to save memory
452
+ # You can set slice_size with `set_attention_slice`
453
+ self._slice_size = None
454
+ self._use_memory_efficient_attention_xformers = False
455
+
456
+ self.to_q = nn.Linear(query_dim, inner_dim, bias=bias)
457
+ self.to_k = nn.Linear(cross_attention_dim, inner_dim, bias=bias)
458
+ self.to_v = nn.Linear(cross_attention_dim, inner_dim, bias=bias)
459
+
460
+ self.to_out = nn.ModuleList([])
461
+ self.to_out.append(nn.Linear(inner_dim, query_dim))
462
+ self.to_out.append(nn.Dropout(dropout))
463
+
464
+ def reshape_heads_to_batch_dim(self, tensor):
465
+ batch_size, seq_len, dim = tensor.shape
466
+ head_size = self.heads
467
+ tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size)
468
+ tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size * head_size, seq_len, dim // head_size)
469
+ return tensor
470
+
471
+ def reshape_batch_dim_to_heads(self, tensor):
472
+ batch_size, seq_len, dim = tensor.shape
473
+ head_size = self.heads
474
+ tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
475
+ tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size // head_size, seq_len, dim * head_size)
476
+ return tensor
477
+
478
+ def forward(self, hidden_states, context=None, attn_map=None, use_prev_key=False, prev_key=None, mask=None, attn_shift=False, obj_ids=None, relationship=None):
479
+ batch_size, sequence_length, _ = hidden_states.shape
480
+
481
+ query = self.to_q(hidden_states)
482
+ context = context if context is not None else hidden_states
483
+ key = self.to_k(context)
484
+ value = self.to_v(context)
485
+
486
+ dim = query.shape[-1]
487
+
488
+ query = self.reshape_heads_to_batch_dim(query)
489
+ key = self.reshape_heads_to_batch_dim(key)
490
+ if use_prev_key:
491
+ key = prev_key
492
+ value = self.reshape_heads_to_batch_dim(value)
493
+
494
+ # TODO(PVP) - mask is currently never used. Remember to re-implement when used
495
+
496
+ # attention, what we cannot get enough of
497
+ if self._use_memory_efficient_attention_xformers:
+ hidden_states = self._memory_efficient_attention_xformers(query, key, value)
+ # xformers does not expose the attention probabilities; return None for them
+ attention_probs = None
+ else:
+ if self._slice_size is None or query.shape[0] // self._slice_size == 1:
+ hidden_states, attention_probs = self._attention(query, key, value, attn_map=attn_map, attn_shift=attn_shift, obj_ids=obj_ids, relationship=relationship)
+ else:
+ # the sliced path likewise does not return per-token probabilities
+ hidden_states = self._sliced_attention(query, key, value, sequence_length, dim)
+ attention_probs = None
504
+
505
+ # linear proj
506
+ hidden_states = self.to_out[0](hidden_states)
507
+ # dropout
508
+ hidden_states = self.to_out[1](hidden_states)
509
+ return hidden_states, attention_probs, key
510
+
511
+ def _attention(self, query, key, value, attn_map=None, attn_shift=False, attn_mask=None, obj_ids=None, relationship=None):
512
+ # TODO: use baddbmm for better performance
513
+ if query.device.type == "mps":
514
+ # Better performance on mps (~20-25%)
515
+ attention_scores = torch.einsum("b i d, b j d -> b i j", query, key) * self.scale
516
+ else:
517
+ attention_scores = torch.matmul(query, key.transpose(-1, -2)) * self.scale
518
+ attention_probs = attention_scores.softmax(dim=-1)
519
+ # compute attention output
520
+
521
+ if query.device.type == "mps":
522
+ hidden_states = torch.einsum("b i j, b j d -> b i d", attention_probs, value)
523
+ else:
524
+ per_image_size = attention_probs.shape[0] // 2
525
+
526
+ if attn_map is not None:
527
+
528
+ print(attn_map.shape, attention_probs.shape)
529
+ # # hidden_states = torch.matmul(attention_probs, value)
530
+ # # print(attention_probs.shape, attn_map.shape)
531
+ # #
532
+ # b, i, j = attention_probs.shape
533
+ # H = W = int(math.sqrt(i))
534
+ # # # random_start = torch.randn(size=(b, j, i))
535
+ # # # random_start = (random_start/random_start.sum(-1).unsqueeze(-1)).permute(0, 2, 1).cuda()
536
+ # # # attention_probs[per_image_size:, :, 7:] = random_start[per_image_size:, :, 0].unsqueeze(-1)
537
+ # n = np.zeros((H, W))
538
+ # n[H//2, 1*W//4] = 1
539
+ # # n[3*H//4, 1*W//4] = 1
540
+ # # n[3*H//4, 3*W//4] = 1
541
+ #
542
+ # attention_weight_cat = torch.from_numpy(scipy.ndimage.gaussian_filter(n, sigma=H/12)).cuda().reshape(-1)
543
+ # # print(attention_probs[per_image_size:, :, :].shape, attention_weight_cat.shape )
544
+ # # attention_probs[per_image_size:, :, 4:] = attention_probs[per_image_size:, :, 4:] * attention_weight_cat.unsqueeze(-1)
545
+ # # attention_probs[per_image_size:, :, :] = attention_probs[per_image_size:, :, :]/attention_probs[per_image_size:, :, :].sum(dim=1).unsqueeze(1)
546
+ # #
547
+ # n = np.zeros((H, W))
548
+ # n[H//2, 3*W//4] = 1
549
+ # attention_weight_dog = torch.from_numpy(scipy.ndimage.gaussian_filter(n, sigma=H/12)).cuda().reshape(-1)
550
+ # #
551
+ # attention_weight_all = attention_weight_dog + attention_weight_cat
552
+ # # attention_weight_all = torch.ones(size=attention_weight_motorbike.shape)
553
+ # attention_weight_all_normalized = attention_weight_all/attention_weight_all.sum()
554
+ # # attention_probs[per_image_size:, :, 1:] = attention_weight_all_normalized.unsqueeze(-1)
555
+ # #
556
+ # attention_weight_bg = attention_weight_dog + attention_weight_cat
557
+ # # attention_weight_bg = torch.ones(size=attention_weight_motorbike.shape)
558
+ # attention_weight_all_normalized_bg = attention_weight_bg/attention_weight_bg.sum()
559
+ # attention_weight_all_normalized_bg_reverse = attention_weight_all_normalized_bg.max() - attention_weight_all_normalized_bg
560
+ # # attention_weight_all_normalized_bg_reverse = torch.ones(size=attention_weight_motorbike.shape)
561
+ # attention_weight_all_normalized_bg_reverse = attention_weight_all_normalized_bg_reverse/attention_weight_all_normalized_bg_reverse.sum()
562
+ # # attention_probs[per_image_size:, :, 0] = attention_weight_all_normalized_bg_reverse
563
+ # #
564
+ # # per_image_size = attention_probs.shape[0] // 2
565
+ #
566
+ # # attention_probs[per_image_size:] = attn_map if attn_map.shape[0] == per_image_size else attn_map[per_image_size:]
567
+ # # attention_probs_new = attention_probs.clone()
568
+ # # attention_probs[per_image_size:, :, 1] = attention_probs_new[per_image_size:, :, 3]
569
+ # # attention_probs[per_image_size:, :, 3] = attention_probs_new[per_image_size:, :, 1]
570
+
571
+ if attn_shift:
572
+ # print("???")
573
+
574
+
575
+ b, i, j = attention_probs.shape
576
+ H = W = int(math.sqrt(i))
577
+ strength = relationship['strength']
578
+ spatial_relationship = relationship['spatial_relationship']
579
+
580
+ # print(obj_ids, relationship)
581
+
582
+
583
+ ##### padding token one
584
+ if relationship['padding_token']:
585
+ # print("forward with padding_token")
586
+ n = np.zeros((H, W))
587
+ padding_token_start = relationship['padding_start']
588
+ # print(relationship)
589
+ if spatial_relationship == 0:
590
+ n[H // 2, 1 * W // 4] = 1/2
591
+ n[H // 2, 3 * W // 4] = 1/2
592
+ elif spatial_relationship == 1:
593
+ n[H // 2, 1 * W // 4] = 1/2
594
+ n[H // 2, 3 * W // 4] = 1/2
595
+ elif spatial_relationship == 2:
596
+ n[1 * H // 4, W // 2] = 1/2
597
+ n[3 * H // 4, W // 2] = 1/2
598
+ elif spatial_relationship == 3:
599
+ n[1 * H // 4, W // 2] = 1/2
600
+ n[3 * H // 4, W // 2] = 1/2
601
+ attention_weight_obj_a = torch.from_numpy(
602
+ scipy.ndimage.gaussian_filter(n, sigma=H / 8)).cuda().reshape(-1)
603
+ # print((attention_weight_obj_a / attention_weight_obj_a.sum()).shape)
604
+ attention_weight_obj_a_normalized = torch.tile(
605
+ (attention_weight_obj_a / attention_weight_obj_a.sum()).unsqueeze(0).unsqueeze(-1), (b // 2, 1, j-padding_token_start))
606
+ # print(attention_weight_obj_a_normalized.shape)
607
+
608
+ word_sum = torch.tile(attention_probs[per_image_size:, :, padding_token_start:].sum(dim=-2).unsqueeze(-2), (1, i, 1))
609
+ attention_probs[per_image_size:, :, padding_token_start:] = (1-strength)*attention_probs[per_image_size:, :, padding_token_start:] + strength * attention_weight_obj_a_normalized * word_sum
610
+
611
+ ### start token
612
+ n = np.zeros((H, W))
613
+ # print("use start token", relationship)
614
+ if spatial_relationship == 0:
615
+ n[H // 2, 1 * W // 4] = 1/2
616
+ n[H // 2, 3 * W // 4] = 1/2
617
+ elif spatial_relationship == 1:
618
+ n[H // 2, 1 * W // 4] = 1/2
619
+ n[H // 2, 3 * W // 4] = 1/2
620
+ elif spatial_relationship == 2:
621
+ n[1 * H // 4, W // 2] = 1/2
622
+ n[3 * H // 4, W // 2] = 1/2
623
+ elif spatial_relationship == 3:
624
+ n[1 * H // 4, W // 2] = 1/2
625
+ n[3 * H // 4, W // 2] = 1/2
626
+
627
+
628
+ attention_weight_obj_a = torch.from_numpy(
629
+ scipy.ndimage.gaussian_filter(n, sigma=H / 8)).cuda().reshape(-1)
630
+ attention_weight_obj_a = 1 - attention_weight_obj_a
631
+ # print((attention_weight_obj_a / attention_weight_obj_a.sum()).shape)
632
+ attention_weight_obj_a_normalized = torch.tile(
633
+ (attention_weight_obj_a / attention_weight_obj_a.sum()).unsqueeze(0),
634
+ (b // 2, 1))
635
+ # print(attention_weight_obj_a_normalized.shape)
636
+
637
+ word_sum = attention_probs[per_image_size:, :, 0].sum(dim=-1)
638
+ # print("before the adding", attention_probs[per_image_size:, :, 0].sum(dim=-1)[0])
639
+ # print("adding noise" , (attention_weight_obj_a_normalized * word_sum.unsqueeze(-1)).sum(dim=-1)[0])
640
+ # print("before the adding" ,attention_probs[per_image_size:, :, 0].sum(dim=-1)[0], )
641
+
642
+ attention_probs[per_image_size:, :, 0] = (1 - strength) * attention_probs[per_image_size:, :, 0] + strength * attention_weight_obj_a_normalized * word_sum.unsqueeze(-1)
643
+ # print("after the adding", attention_probs[per_image_size:, :, 0].sum(dim=-1)[0])
644
+ ### end
645
+
646
+
647
+ ### one token
648
+ #
649
+ # n = np.zeros((H, W))
650
+ # n[3 * H // 4, 1 * W // 4] = 1
651
+ # obj_a_ids = 5
652
+ # # obj_b_ids = obj_ids[1]
653
+ # attention_weight_obj_a = torch.from_numpy(
654
+ # scipy.ndimage.gaussian_filter(n, sigma=H / 8)).cuda().reshape(-1)
655
+ # # print((attention_weight_obj_a / attention_weight_obj_a.sum()).shape)
656
+ # attention_weight_obj_a_normalized = torch.tile(
657
+ # (attention_weight_obj_a / attention_weight_obj_a.sum()).unsqueeze(0),
658
+ # (b // 2, 1))
659
+ # # print(attention_weight_obj_a_normalized.shape)
660
+ # word_sum = attention_probs[per_image_size:, :, obj_a_ids].sum(dim=-1)
661
+ # print(word_sum.shape, attention_weight_obj_a_normalized.shape)
662
+ #
663
+ # attention_probs[per_image_size:, :, obj_a_ids] = (1 - strength) * attention_probs[per_image_size:, :,
664
+ # obj_a_ids] + strength * attention_weight_obj_a_normalized * word_sum.unsqueeze(-1)
665
+
666
+
667
+ ###### Normal one
668
+ obj_a_ids = obj_ids[0]
669
+ obj_b_ids = obj_ids[1]
670
+ # obj_a_ids = [2]
671
+ # obj_b_ids = [8]
672
+ strength = relationship['strength']
673
+ spatial_relationship = relationship['spatial_relationship']
674
+ # print("use_normal_one")
675
+ for obj_a_id in obj_a_ids:
676
+ n = np.zeros((H, W))
677
+ if spatial_relationship == 0:
678
+ n[H // 2, 1 * W // 4] = 1
679
+ elif spatial_relationship == 1:
680
+ n[H // 2, 3 * W // 4] = 1
681
+ elif spatial_relationship == 2:
682
+ n[1 * H // 4, W // 2] = 1
683
+ elif spatial_relationship == 3:
684
+ n[3 * H // 4, W // 2] = 1
685
+
686
+ attention_weight_obj_a = torch.from_numpy(
687
+ scipy.ndimage.gaussian_filter(n, sigma=H / 8)).cuda().reshape(-1)
688
+ attention_weight_obj_a_normalized = torch.tile(
689
+ (attention_weight_obj_a / attention_weight_obj_a.sum()).unsqueeze(0), (b // 2, 1))
690
+
691
+ word_sum = attention_probs[per_image_size:, :, obj_a_id].sum(dim=-1)
692
+ attention_probs[per_image_size:, :, obj_a_id] = (1-strength)*attention_probs[per_image_size:, :, obj_a_id] + strength * attention_weight_obj_a_normalized * word_sum.unsqueeze(-1)
693
+
694
+ for obj_id in obj_b_ids:
695
+ n = np.zeros((H, W))
696
+ if spatial_relationship == 0:
697
+ n[H // 2, 3 * W // 4] = 1
698
+ elif spatial_relationship == 1:
699
+ n[H // 2, 1 * W // 4] = 1
700
+ elif spatial_relationship == 2:
701
+ n[3 * H // 4, W // 2] = 1
702
+ elif spatial_relationship == 3:
703
+ n[1 * H // 4, W // 2] = 1
704
+
705
+ attention_weight_obj = torch.from_numpy(
706
+ scipy.ndimage.gaussian_filter(n, sigma=H / 8)).cuda().reshape(-1)
707
+ attention_weight_obj_normalized = torch.tile(
708
+ (attention_weight_obj / attention_weight_obj.sum()).unsqueeze(0), (b // 2, 1))
709
+
710
+ word_sum = attention_probs[per_image_size:, :, obj_id].sum(dim=-1)
711
+ attention_probs[per_image_size:, :, obj_id] = (1-strength) * attention_probs[per_image_size:, :,obj_id] + strength * attention_weight_obj_normalized * word_sum.unsqueeze(-1)
712
+
713
+
714
+
715
+
716
+ # n = np.zeros((H, W))
717
+
718
+
719
+ # if relationship =
720
+ # n[H//2, 1*W//4] = 1
721
+
722
+ # attention_weight_dog = torch.from_numpy(scipy.ndimage.gaussian_filter(n, sigma=H/8)).cuda().reshape(-1)
723
+ # attention_weight_dog_normalized = torch.tile((attention_weight_dog/attention_weight_dog.sum()).unsqueeze(0),(b//2, 1))
724
+ # word_sum = attention_probs[per_image_size:, :, 8].sum(dim=-1)
725
+ # attention_probs[per_image_size:, :, 8] = 0 * attention_probs[per_image_size:, :, 1] + 1 * attention_weight_dog_normalized * word_sum.unsqueeze(-1)
726
+
727
+
728
+ # attention_weight_motorbike = torch.from_numpy(scipy.ndimage.gaussian_filter(n, sigma=H/12)).cuda().reshape(-1)
729
+ # attention_weight_motorbike_normalized = torch.tile(attention_weight_motorbike/attention_weight_motorbike.sum().unsqueeze(0), (b//2, 1))
730
+ #
731
+ #
732
+ # # print('attention_probs', attention_probs[per_image_size:, :, 3].sum(dim=-1))
733
+ # print(attention_weight_motorbike_normalized.shape, attention_probs[per_image_size:, :, 3].sum(dim=-1))
734
+
735
+ # attention_probs[per_image_size:, :, 3] = 0.9 * attention_weight_motorbike_normalized * attention_probs[per_image_size:, :, 3].sum(dim=-1).unsqueeze(-1) + 0.1 * attention_probs[per_image_size:, :, 3]
736
+
737
+ # attention_weight_all = attention_weight_motorbike + attention_weight_cat
738
+ # attention_weight_all_normalized = attention_weight_all/attention_weight_all.sum()
739
+ # attention_probs[per_image_size:, :, 4:] = attention_weight_all_normalized.unsqueeze(-1)
740
+
741
+
742
+
743
+ # b, i, j = attention_probs.shape
744
+ #
745
+ # H = W = int(math.sqrt(i))
746
+ # attention_probs_reshape = attention_probs.permute(0, 2, 1).reshape(b, j, H, W)
747
+ # if attn_mask is None:
748
+ # attn_mask = torch.zeros(size=attention_probs_reshape.shape).cuda()
749
+ # attn_mask[:, :, H//2:, W//2:] = 1
750
+ # # print(attention_probs_reshape.is_cuda, attention_probs_reshape.get_device())
751
+ # # attn_mask.cuda()
752
+ # attention_probs_reshape = attention_probs_reshape * attn_mask
753
+ # else:
754
+ # attn_mask.cuda()
755
+ # attention_probs_reshape = attention_probs_reshape * attn_mask
756
+ # attention_probs_reshape = attention_probs_reshape.reshape(b, j, i)
757
+ # attention_probs_reshape = attention_probs_reshape/(attention_probs_reshape.sum(dim=-1).unsqueeze(-1))
758
+ # attention_probs[per_image_size:] = attention_probs_reshape.permute(0, 2, 1)[per_image_size:]
759
+ # if attn_shift:
760
+ # b, i, j = attention_probs.shape
761
+ # H = W = int(math.sqrt(i))
762
+ # attention_map_hw = attention_probs.permute(0, 2, 1).reshape(b, j, H, W)
763
+ # # print("attention_map_hw", attention_map_hw.shape)
764
+ # attention_map_hw_pad = F.pad(attention_map_hw, (W//2, W//2), "constant", 0)
765
+ # # print("attention_map_hw_pad", attention_map_hw_pad.shape)
766
+ # attention_map_hw_pad = torch.roll(attention_map_hw_pad, W//4, -1)
767
+ # attention_map_hw_pad_crop = attention_map_hw_pad[:, :, :, W//2:W//2 + W].reshape(b, j, i)
768
+ # attention_map_flatten_pad_crop_sum = attention_map_hw_pad_crop.sum(dim=-1)
769
+ # attention_map_hw_pad_crop = (attention_map_hw_pad_crop/attention_map_flatten_pad_crop_sum.unsqueeze(-1)).permute(0, 2, 1)
770
+ # # attention_map_hw_pad_crop = attention_map_hw_pad_crop.reshape(b, j, i).permute(0, 2, 1)
771
+ # # attention_map_hw_pad_crop_sum = attention_map_hw_pad_crop.sum(dim=-2)
772
+ # # print(attention_map_hw_pad_crop.min())
773
+ # # print("attention_map_hw_pad_crop", attention_map_hw_pad_crop.shape)
774
+ # # attention_probs[per_image_size:, :, (2, 6)] = attention_map_hw_pad_crop.softmax(dim=-1)[per_image_size:, :, (2, 6)]
775
+ # # attention_probs[per_image_size:, :, (2, 6)] = attention_map_hw_pad_crop[per_image_size:, :, (2, 6)]
776
+ # attention_probs[per_image_size:] = attention_map_hw_pad_crop[per_image_size:]
777
+
778
+
779
+ # if attn_blob:
780
+ # n = np.zeros((21, 21))
781
+ # n[10, 10] = 1
782
+ # k = scipy.ndimage.gaussian_filter(n, sigma=3)
783
+ # else:
784
+ # # print(attention_probs.shape)
785
+ # hidden_states = torch.matmul(attention_probs, value)
786
+ hidden_states = torch.matmul(attention_probs, value)
787
+
788
+ # reshape hidden_states
789
+ hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
790
+ return hidden_states, attention_probs
791
+
792
+ def _sliced_attention(self, query, key, value, sequence_length, dim):
793
+ batch_size_attention = query.shape[0]
794
+ hidden_states = torch.zeros(
795
+ (batch_size_attention, sequence_length, dim // self.heads), device=query.device, dtype=query.dtype
796
+ )
797
+ slice_size = self._slice_size if self._slice_size is not None else hidden_states.shape[0]
798
+ for i in range(hidden_states.shape[0] // slice_size):
799
+ start_idx = i * slice_size
800
+ end_idx = (i + 1) * slice_size
801
+ if query.device.type == "mps":
802
+ # Better performance on mps (~20-25%)
803
+ attn_slice = (
804
+ torch.einsum("b i d, b j d -> b i j", query[start_idx:end_idx], key[start_idx:end_idx])
805
+ * self.scale
806
+ )
807
+ else:
808
+ attn_slice = (
809
+ torch.matmul(query[start_idx:end_idx], key[start_idx:end_idx].transpose(1, 2)) * self.scale
810
+ ) # TODO: use baddbmm for better performance
811
+ attn_slice = attn_slice.softmax(dim=-1)
812
+ if query.device.type == "mps":
813
+ attn_slice = torch.einsum("b i j, b j d -> b i d", attn_slice, value[start_idx:end_idx])
814
+ else:
815
+ attn_slice = torch.matmul(attn_slice, value[start_idx:end_idx])
816
+
817
+ hidden_states[start_idx:end_idx] = attn_slice
818
+
819
+ # reshape hidden_states
820
+ hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
821
+ return hidden_states
822
+
823
+ def _memory_efficient_attention_xformers(self, query, key, value):
824
+ hidden_states = xformers.ops.memory_efficient_attention(query, key, value, attn_bias=None)
825
+ hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
826
+ return hidden_states
827
+
828
+
829
+ class FeedForward(nn.Module):
830
+ r"""
831
+ A feed-forward layer.
832
+
833
+ Parameters:
834
+ dim (`int`): The number of channels in the input.
835
+ dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
836
+ mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
837
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
838
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
839
+ """
840
+
841
+ def __init__(
842
+ self,
843
+ dim: int,
844
+ dim_out: Optional[int] = None,
845
+ mult: int = 4,
846
+ dropout: float = 0.0,
847
+ activation_fn: str = "geglu",
848
+ ):
849
+ super().__init__()
850
+ inner_dim = int(dim * mult)
851
+ dim_out = dim_out if dim_out is not None else dim
852
+
853
+ if activation_fn == "geglu":
854
+ geglu = GEGLU(dim, inner_dim)
855
+ elif activation_fn == "geglu-approximate":
856
+ geglu = ApproximateGELU(dim, inner_dim)
857
+
858
+ self.net = nn.ModuleList([])
859
+ # project in
860
+ self.net.append(geglu)
861
+ # project dropout
862
+ self.net.append(nn.Dropout(dropout))
863
+ # project out
864
+ self.net.append(nn.Linear(inner_dim, dim_out))
865
+
866
+ def forward(self, hidden_states):
867
+ for module in self.net:
868
+ hidden_states = module(hidden_states)
869
+ return hidden_states
870
+
871
+
872
+ # feedforward
873
+ class GEGLU(nn.Module):
874
+ r"""
875
+ A variant of the gated linear unit activation function from https://arxiv.org/abs/2002.05202.
876
+
877
+ Parameters:
878
+ dim_in (`int`): The number of channels in the input.
879
+ dim_out (`int`): The number of channels in the output.
880
+ """
881
+
882
+ def __init__(self, dim_in: int, dim_out: int):
883
+ super().__init__()
884
+ self.proj = nn.Linear(dim_in, dim_out * 2)
885
+
886
+ def gelu(self, gate):
887
+ if gate.device.type != "mps":
888
+ return F.gelu(gate)
889
+ # mps: gelu is not implemented for float16
890
+ return F.gelu(gate.to(dtype=torch.float32)).to(dtype=gate.dtype)
891
+
892
+ def forward(self, hidden_states):
893
+ hidden_states, gate = self.proj(hidden_states).chunk(2, dim=-1)
894
+ return hidden_states * self.gelu(gate)
895
+
896
+
897
+ class ApproximateGELU(nn.Module):
898
+ """
899
+ The approximate form of Gaussian Error Linear Unit (GELU)
900
+
901
+ For more details, see section 2: https://arxiv.org/abs/1606.08415
902
+ """
903
+
904
+ def __init__(self, dim_in: int, dim_out: int):
905
+ super().__init__()
906
+ self.proj = nn.Linear(dim_in, dim_out)
907
+
908
+ def forward(self, x):
909
+ x = self.proj(x)
910
+ return x * torch.sigmoid(1.702 * x)
911
+
912
+
913
+ class AdaLayerNorm(nn.Module):
914
+ """
915
+ Norm layer modified to incorporate timestep embeddings.
916
+ """
917
+
918
+ def __init__(self, embedding_dim, num_embeddings):
919
+ super().__init__()
920
+ self.emb = nn.Embedding(num_embeddings, embedding_dim)
921
+ self.silu = nn.SiLU()
922
+ self.linear = nn.Linear(embedding_dim, embedding_dim * 2)
923
+ self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False)
924
+
925
+ def forward(self, x, timestep):
926
+ emb = self.linear(self.silu(self.emb(timestep)))
927
+ scale, shift = torch.chunk(emb, 2)
928
+ x = self.norm(x) * (1 + scale) + shift
929
+ return x
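The spatial control in `CrossAttention._attention` above boils down to blending each object token's attention column toward a normalized Gaussian bump centered at the target location, while keeping that token's total attention mass unchanged. A minimal sketch of that step for a single token column of shape (H*W,); the helper names and the example values are assumptions for illustration:

import numpy as np
import scipy.ndimage
import torch

def gaussian_prior(H: int, W: int, row: int, col: int) -> torch.Tensor:
    """Normalized Gaussian bump over an H x W grid, flattened to (H*W,)."""
    n = np.zeros((H, W))
    n[row, col] = 1.0
    g = scipy.ndimage.gaussian_filter(n, sigma=H / 8)   # same sigma as used in _attention
    g = torch.from_numpy(g).reshape(-1).float()
    return g / g.sum()

def redistribute(attn_col: torch.Tensor, prior: torch.Tensor, strength: float) -> torch.Tensor:
    """Blend one token's attention column (H*W,) toward the spatial prior.

    Mirrors (1 - strength) * attn + strength * prior * attn.sum(); since the prior
    sums to 1, the total attention mass assigned to this token is preserved.
    """
    return (1.0 - strength) * attn_col + strength * prior * attn_col.sum()

# Example: push a token toward the left-center quarter of a 16x16 latent grid.
H = W = 16
attn_col = torch.softmax(torch.randn(H * W), dim=0)
edited = redistribute(attn_col, gaussian_prior(H, W, H // 2, W // 4), strength=0.8)
assert torch.isclose(edited.sum(), attn_col.sum())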
my_model/unet_2d_blocks.py ADDED
@@ -0,0 +1,1612 @@
+ # Copyright 2022 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import numpy as np
15
+ import torch
16
+ from torch import nn
17
+
18
+ from .attention import AttentionBlock, Transformer2DModel
19
+ from diffusers.models.resnet import Downsample2D, FirDownsample2D, FirUpsample2D, ResnetBlock2D, Upsample2D
20
+
21
+
22
+ def get_down_block(
23
+ down_block_type,
24
+ num_layers,
25
+ in_channels,
26
+ out_channels,
27
+ temb_channels,
28
+ add_downsample,
29
+ resnet_eps,
30
+ resnet_act_fn,
31
+ attn_num_head_channels,
32
+ resnet_groups=None,
33
+ cross_attention_dim=None,
34
+ downsample_padding=None,
35
+ ):
36
+ down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type
37
+ if down_block_type == "DownBlock2D":
38
+ return DownBlock2D(
39
+ num_layers=num_layers,
40
+ in_channels=in_channels,
41
+ out_channels=out_channels,
42
+ temb_channels=temb_channels,
43
+ add_downsample=add_downsample,
44
+ resnet_eps=resnet_eps,
45
+ resnet_act_fn=resnet_act_fn,
46
+ resnet_groups=resnet_groups,
47
+ downsample_padding=downsample_padding,
48
+ )
49
+ elif down_block_type == "AttnDownBlock2D":
50
+ return AttnDownBlock2D(
51
+ num_layers=num_layers,
52
+ in_channels=in_channels,
53
+ out_channels=out_channels,
54
+ temb_channels=temb_channels,
55
+ add_downsample=add_downsample,
56
+ resnet_eps=resnet_eps,
57
+ resnet_act_fn=resnet_act_fn,
58
+ resnet_groups=resnet_groups,
59
+ downsample_padding=downsample_padding,
60
+ attn_num_head_channels=attn_num_head_channels,
61
+ )
62
+ elif down_block_type == "CrossAttnDownBlock2D":
63
+ if cross_attention_dim is None:
64
+ raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock2D")
65
+ return CrossAttnDownBlock2D(
66
+ num_layers=num_layers,
67
+ in_channels=in_channels,
68
+ out_channels=out_channels,
69
+ temb_channels=temb_channels,
70
+ add_downsample=add_downsample,
71
+ resnet_eps=resnet_eps,
72
+ resnet_act_fn=resnet_act_fn,
73
+ resnet_groups=resnet_groups,
74
+ downsample_padding=downsample_padding,
75
+ cross_attention_dim=cross_attention_dim,
76
+ attn_num_head_channels=attn_num_head_channels,
77
+ )
78
+ elif down_block_type == "SkipDownBlock2D":
79
+ return SkipDownBlock2D(
80
+ num_layers=num_layers,
81
+ in_channels=in_channels,
82
+ out_channels=out_channels,
83
+ temb_channels=temb_channels,
84
+ add_downsample=add_downsample,
85
+ resnet_eps=resnet_eps,
86
+ resnet_act_fn=resnet_act_fn,
87
+ downsample_padding=downsample_padding,
88
+ )
89
+ elif down_block_type == "AttnSkipDownBlock2D":
90
+ return AttnSkipDownBlock2D(
91
+ num_layers=num_layers,
92
+ in_channels=in_channels,
93
+ out_channels=out_channels,
94
+ temb_channels=temb_channels,
95
+ add_downsample=add_downsample,
96
+ resnet_eps=resnet_eps,
97
+ resnet_act_fn=resnet_act_fn,
98
+ downsample_padding=downsample_padding,
99
+ attn_num_head_channels=attn_num_head_channels,
100
+ )
101
+ elif down_block_type == "DownEncoderBlock2D":
102
+ return DownEncoderBlock2D(
103
+ num_layers=num_layers,
104
+ in_channels=in_channels,
105
+ out_channels=out_channels,
106
+ add_downsample=add_downsample,
107
+ resnet_eps=resnet_eps,
108
+ resnet_act_fn=resnet_act_fn,
109
+ resnet_groups=resnet_groups,
110
+ downsample_padding=downsample_padding,
111
+ )
112
+ elif down_block_type == "AttnDownEncoderBlock2D":
113
+ return AttnDownEncoderBlock2D(
114
+ num_layers=num_layers,
115
+ in_channels=in_channels,
116
+ out_channels=out_channels,
117
+ add_downsample=add_downsample,
118
+ resnet_eps=resnet_eps,
119
+ resnet_act_fn=resnet_act_fn,
120
+ resnet_groups=resnet_groups,
121
+ downsample_padding=downsample_padding,
122
+ attn_num_head_channels=attn_num_head_channels,
123
+ )
124
+ raise ValueError(f"{down_block_type} does not exist.")
125
+
126
+
127
+ def get_up_block(
128
+ up_block_type,
129
+ num_layers,
130
+ in_channels,
131
+ out_channels,
132
+ prev_output_channel,
133
+ temb_channels,
134
+ add_upsample,
135
+ resnet_eps,
136
+ resnet_act_fn,
137
+ attn_num_head_channels,
138
+ resnet_groups=None,
139
+ cross_attention_dim=None,
140
+ ):
141
+ up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
142
+ if up_block_type == "UpBlock2D":
143
+ return UpBlock2D(
144
+ num_layers=num_layers,
145
+ in_channels=in_channels,
146
+ out_channels=out_channels,
147
+ prev_output_channel=prev_output_channel,
148
+ temb_channels=temb_channels,
149
+ add_upsample=add_upsample,
150
+ resnet_eps=resnet_eps,
151
+ resnet_act_fn=resnet_act_fn,
152
+ resnet_groups=resnet_groups,
153
+ )
154
+ elif up_block_type == "CrossAttnUpBlock2D":
155
+ if cross_attention_dim is None:
156
+ raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock2D")
157
+ return CrossAttnUpBlock2D(
158
+ num_layers=num_layers,
159
+ in_channels=in_channels,
160
+ out_channels=out_channels,
161
+ prev_output_channel=prev_output_channel,
162
+ temb_channels=temb_channels,
163
+ add_upsample=add_upsample,
164
+ resnet_eps=resnet_eps,
165
+ resnet_act_fn=resnet_act_fn,
166
+ resnet_groups=resnet_groups,
167
+ cross_attention_dim=cross_attention_dim,
168
+ attn_num_head_channels=attn_num_head_channels,
169
+ )
170
+ elif up_block_type == "AttnUpBlock2D":
171
+ return AttnUpBlock2D(
172
+ num_layers=num_layers,
173
+ in_channels=in_channels,
174
+ out_channels=out_channels,
175
+ prev_output_channel=prev_output_channel,
176
+ temb_channels=temb_channels,
177
+ add_upsample=add_upsample,
178
+ resnet_eps=resnet_eps,
179
+ resnet_act_fn=resnet_act_fn,
180
+ resnet_groups=resnet_groups,
181
+ attn_num_head_channels=attn_num_head_channels,
182
+ )
183
+ elif up_block_type == "SkipUpBlock2D":
184
+ return SkipUpBlock2D(
185
+ num_layers=num_layers,
186
+ in_channels=in_channels,
187
+ out_channels=out_channels,
188
+ prev_output_channel=prev_output_channel,
189
+ temb_channels=temb_channels,
190
+ add_upsample=add_upsample,
191
+ resnet_eps=resnet_eps,
192
+ resnet_act_fn=resnet_act_fn,
193
+ )
194
+ elif up_block_type == "AttnSkipUpBlock2D":
195
+ return AttnSkipUpBlock2D(
196
+ num_layers=num_layers,
197
+ in_channels=in_channels,
198
+ out_channels=out_channels,
199
+ prev_output_channel=prev_output_channel,
200
+ temb_channels=temb_channels,
201
+ add_upsample=add_upsample,
202
+ resnet_eps=resnet_eps,
203
+ resnet_act_fn=resnet_act_fn,
204
+ attn_num_head_channels=attn_num_head_channels,
205
+ )
206
+ elif up_block_type == "UpDecoderBlock2D":
207
+ return UpDecoderBlock2D(
208
+ num_layers=num_layers,
209
+ in_channels=in_channels,
210
+ out_channels=out_channels,
211
+ add_upsample=add_upsample,
212
+ resnet_eps=resnet_eps,
213
+ resnet_act_fn=resnet_act_fn,
214
+ resnet_groups=resnet_groups,
215
+ )
216
+ elif up_block_type == "AttnUpDecoderBlock2D":
217
+ return AttnUpDecoderBlock2D(
218
+ num_layers=num_layers,
219
+ in_channels=in_channels,
220
+ out_channels=out_channels,
221
+ add_upsample=add_upsample,
222
+ resnet_eps=resnet_eps,
223
+ resnet_act_fn=resnet_act_fn,
224
+ resnet_groups=resnet_groups,
225
+ attn_num_head_channels=attn_num_head_channels,
226
+ )
227
+ raise ValueError(f"{up_block_type} does not exist.")
228
+
229
+
230
+ class UNetMidBlock2D(nn.Module):
231
+ def __init__(
232
+ self,
233
+ in_channels: int,
234
+ temb_channels: int,
235
+ dropout: float = 0.0,
236
+ num_layers: int = 1,
237
+ resnet_eps: float = 1e-6,
238
+ resnet_time_scale_shift: str = "default",
239
+ resnet_act_fn: str = "swish",
240
+ resnet_groups: int = 32,
241
+ resnet_pre_norm: bool = True,
242
+ attn_num_head_channels=1,
243
+ attention_type="default",
244
+ output_scale_factor=1.0,
245
+ **kwargs,
246
+ ):
247
+ super().__init__()
248
+
249
+ self.attention_type = attention_type
250
+ resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
251
+
252
+ # there is always at least one resnet
253
+ resnets = [
254
+ ResnetBlock2D(
255
+ in_channels=in_channels,
256
+ out_channels=in_channels,
257
+ temb_channels=temb_channels,
258
+ eps=resnet_eps,
259
+ groups=resnet_groups,
260
+ dropout=dropout,
261
+ time_embedding_norm=resnet_time_scale_shift,
262
+ non_linearity=resnet_act_fn,
263
+ output_scale_factor=output_scale_factor,
264
+ pre_norm=resnet_pre_norm,
265
+ )
266
+ ]
267
+ attentions = []
268
+
269
+ for _ in range(num_layers):
270
+ attentions.append(
271
+ AttentionBlock(
272
+ in_channels,
273
+ num_head_channels=attn_num_head_channels,
274
+ rescale_output_factor=output_scale_factor,
275
+ eps=resnet_eps,
276
+ norm_num_groups=resnet_groups,
277
+ )
278
+ )
279
+ resnets.append(
280
+ ResnetBlock2D(
281
+ in_channels=in_channels,
282
+ out_channels=in_channels,
283
+ temb_channels=temb_channels,
284
+ eps=resnet_eps,
285
+ groups=resnet_groups,
286
+ dropout=dropout,
287
+ time_embedding_norm=resnet_time_scale_shift,
288
+ non_linearity=resnet_act_fn,
289
+ output_scale_factor=output_scale_factor,
290
+ pre_norm=resnet_pre_norm,
291
+ )
292
+ )
293
+
294
+ self.attentions = nn.ModuleList(attentions)
295
+ self.resnets = nn.ModuleList(resnets)
296
+
297
+ def forward(self, hidden_states, temb=None, encoder_states=None):
298
+ hidden_states = self.resnets[0](hidden_states, temb)
299
+ for attn, resnet in zip(self.attentions, self.resnets[1:]):
300
+ if self.attention_type == "default":
301
+ hidden_states = attn(hidden_states)
302
+ else:
303
+ hidden_states = attn(hidden_states, encoder_states)
304
+ hidden_states = resnet(hidden_states, temb)
305
+
306
+ return hidden_states
307
+
308
+
309
+ class UNetMidBlock2DCrossAttn(nn.Module):
310
+ def __init__(
311
+ self,
312
+ in_channels: int,
313
+ temb_channels: int,
314
+ dropout: float = 0.0,
315
+ num_layers: int = 1,
316
+ resnet_eps: float = 1e-6,
317
+ resnet_time_scale_shift: str = "default",
318
+ resnet_act_fn: str = "swish",
319
+ resnet_groups: int = 32,
320
+ resnet_pre_norm: bool = True,
321
+ attn_num_head_channels=1,
322
+ attention_type="default",
323
+ output_scale_factor=1.0,
324
+ cross_attention_dim=1280,
325
+ **kwargs,
326
+ ):
327
+ super().__init__()
328
+
329
+ self.attention_type = attention_type
330
+ self.attn_num_head_channels = attn_num_head_channels
331
+ resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
332
+
333
+ # there is always at least one resnet
334
+ resnets = [
335
+ ResnetBlock2D(
336
+ in_channels=in_channels,
337
+ out_channels=in_channels,
338
+ temb_channels=temb_channels,
339
+ eps=resnet_eps,
340
+ groups=resnet_groups,
341
+ dropout=dropout,
342
+ time_embedding_norm=resnet_time_scale_shift,
343
+ non_linearity=resnet_act_fn,
344
+ output_scale_factor=output_scale_factor,
345
+ pre_norm=resnet_pre_norm,
346
+ )
347
+ ]
348
+ attentions = []
349
+
350
+ for _ in range(num_layers):
351
+ attentions.append(
352
+ Transformer2DModel(
353
+ attn_num_head_channels,
354
+ in_channels // attn_num_head_channels,
355
+ in_channels=in_channels,
356
+ num_layers=1,
357
+ cross_attention_dim=cross_attention_dim,
358
+ norm_num_groups=resnet_groups,
359
+ )
360
+ )
361
+ resnets.append(
362
+ ResnetBlock2D(
363
+ in_channels=in_channels,
364
+ out_channels=in_channels,
365
+ temb_channels=temb_channels,
366
+ eps=resnet_eps,
367
+ groups=resnet_groups,
368
+ dropout=dropout,
369
+ time_embedding_norm=resnet_time_scale_shift,
370
+ non_linearity=resnet_act_fn,
371
+ output_scale_factor=output_scale_factor,
372
+ pre_norm=resnet_pre_norm,
373
+ )
374
+ )
375
+
376
+ self.attentions = nn.ModuleList(attentions)
377
+ self.resnets = nn.ModuleList(resnets)
378
+
379
+ def set_attention_slice(self, slice_size):
380
+ if slice_size is not None and self.attn_num_head_channels % slice_size != 0:
381
+ raise ValueError(
382
+ f"Make sure slice_size {slice_size} is a divisor of "
383
+ f"the number of heads used in cross_attention {self.attn_num_head_channels}"
384
+ )
385
+ if slice_size is not None and slice_size > self.attn_num_head_channels:
386
+ raise ValueError(
387
+ f"Chunk_size {slice_size} has to be smaller or equal to "
388
+ f"the number of heads used in cross_attention {self.attn_num_head_channels}"
389
+ )
390
+
391
+ for attn in self.attentions:
392
+ attn._set_attention_slice(slice_size)
393
+
394
+ def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
395
+ for attn in self.attentions:
396
+ attn._set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
397
+
398
+ def forward(self, hidden_states, index, temb=None, encoder_hidden_states=None, attn_map=None, attn_shift=False, attn_map_step=20, obj_ids=None, relationship=None):
399
+ device = hidden_states.get_device() if hidden_states.is_cuda else 'cpu'
400
+ hidden_states = self.resnets[0](hidden_states, temb)
401
+ mid_attn = []
402
+ mid_value = []
403
+ for layer_idx, (attn, resnet) in enumerate(zip(self.attentions, self.resnets[1:])):
404
+ hidden_states, cross_attn_prob, save_value = attn(hidden_states, encoder_hidden_states, attn_map=attn_map[layer_idx].chunk(2)[1].to(device) if index < attn_map_step else None, attn_shift=attn_shift, obj_ids=obj_ids, relationship=relationship)
405
+ hidden_states = hidden_states.sample
406
+ hidden_states = resnet(hidden_states, temb)
407
+ mid_attn.append(cross_attn_prob)
408
+ mid_value.append(save_value)
409
+ return hidden_states, mid_attn, mid_value
410
+
411
+
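Note on the block above: unlike the stock diffusers UNetMidBlock2DCrossAttn, this variant's forward() also takes the denoising step `index` plus an optional list of per-layer attention maps, and it returns the per-layer cross-attention probabilities and saved values alongside the hidden states. A minimal driver sketch, assuming the modified Transformer2DModel that this file imports accepts the extra keyword arguments used above; the shapes are illustrative only, not taken from the repository:

    import torch
    from my_model.unet_2d_blocks import UNetMidBlock2DCrossAttn

    mid = UNetMidBlock2DCrossAttn(in_channels=1280, temb_channels=1280,
                                  attn_num_head_channels=8, cross_attention_dim=768)
    hidden = torch.randn(2, 1280, 8, 8)   # bottleneck features
    temb = torch.randn(2, 1280)           # time embedding
    text = torch.randn(2, 77, 768)        # text-encoder hidden states
    # index >= attn_map_step disables the attention-map injection path,
    # so attn_map can stay None in this standalone call.
    hidden, mid_attn, mid_value = mid(hidden, index=50, temb=temb,
                                      encoder_hidden_states=text,
                                      attn_map=None, attn_map_step=20)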
412
+ class AttnDownBlock2D(nn.Module):
413
+ def __init__(
414
+ self,
415
+ in_channels: int,
416
+ out_channels: int,
417
+ temb_channels: int,
418
+ dropout: float = 0.0,
419
+ num_layers: int = 1,
420
+ resnet_eps: float = 1e-6,
421
+ resnet_time_scale_shift: str = "default",
422
+ resnet_act_fn: str = "swish",
423
+ resnet_groups: int = 32,
424
+ resnet_pre_norm: bool = True,
425
+ attn_num_head_channels=1,
426
+ attention_type="default",
427
+ output_scale_factor=1.0,
428
+ downsample_padding=1,
429
+ add_downsample=True,
430
+ ):
431
+ super().__init__()
432
+ resnets = []
433
+ attentions = []
434
+
435
+ self.attention_type = attention_type
436
+
437
+ for i in range(num_layers):
438
+ in_channels = in_channels if i == 0 else out_channels
439
+ resnets.append(
440
+ ResnetBlock2D(
441
+ in_channels=in_channels,
442
+ out_channels=out_channels,
443
+ temb_channels=temb_channels,
444
+ eps=resnet_eps,
445
+ groups=resnet_groups,
446
+ dropout=dropout,
447
+ time_embedding_norm=resnet_time_scale_shift,
448
+ non_linearity=resnet_act_fn,
449
+ output_scale_factor=output_scale_factor,
450
+ pre_norm=resnet_pre_norm,
451
+ )
452
+ )
453
+ attentions.append(
454
+ AttentionBlock(
455
+ out_channels,
456
+ num_head_channels=attn_num_head_channels,
457
+ rescale_output_factor=output_scale_factor,
458
+ eps=resnet_eps,
459
+ norm_num_groups=resnet_groups,
460
+ )
461
+ )
462
+
463
+ self.attentions = nn.ModuleList(attentions)
464
+ self.resnets = nn.ModuleList(resnets)
465
+
466
+ if add_downsample:
467
+ self.downsamplers = nn.ModuleList(
468
+ [
469
+ Downsample2D(
470
+ in_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
471
+ )
472
+ ]
473
+ )
474
+ else:
475
+ self.downsamplers = None
476
+
477
+ def forward(self, hidden_states, temb=None):
478
+ output_states = ()
479
+
480
+ for resnet, attn in zip(self.resnets, self.attentions):
481
+ hidden_states = resnet(hidden_states, temb)
482
+ hidden_states = attn(hidden_states)
483
+ output_states += (hidden_states,)
484
+
485
+ if self.downsamplers is not None:
486
+ for downsampler in self.downsamplers:
487
+ hidden_states = downsampler(hidden_states)
488
+
489
+ output_states += (hidden_states,)
490
+
491
+ return hidden_states, output_states
492
+
493
+
494
+ class CrossAttnDownBlock2D(nn.Module):
495
+ def __init__(
496
+ self,
497
+ in_channels: int,
498
+ out_channels: int,
499
+ temb_channels: int,
500
+ dropout: float = 0.0,
501
+ num_layers: int = 1,
502
+ resnet_eps: float = 1e-6,
503
+ resnet_time_scale_shift: str = "default",
504
+ resnet_act_fn: str = "swish",
505
+ resnet_groups: int = 32,
506
+ resnet_pre_norm: bool = True,
507
+ attn_num_head_channels=1,
508
+ cross_attention_dim=1280,
509
+ attention_type="default",
510
+ output_scale_factor=1.0,
511
+ downsample_padding=1,
512
+ add_downsample=True,
513
+ ):
514
+ super().__init__()
515
+ resnets = []
516
+ attentions = []
517
+
518
+ self.attention_type = attention_type
519
+ self.attn_num_head_channels = attn_num_head_channels
520
+
521
+ for i in range(num_layers):
522
+ in_channels = in_channels if i == 0 else out_channels
523
+ resnets.append(
524
+ ResnetBlock2D(
525
+ in_channels=in_channels,
526
+ out_channels=out_channels,
527
+ temb_channels=temb_channels,
528
+ eps=resnet_eps,
529
+ groups=resnet_groups,
530
+ dropout=dropout,
531
+ time_embedding_norm=resnet_time_scale_shift,
532
+ non_linearity=resnet_act_fn,
533
+ output_scale_factor=output_scale_factor,
534
+ pre_norm=resnet_pre_norm,
535
+ )
536
+ )
537
+ attentions.append(
538
+ Transformer2DModel(
539
+ attn_num_head_channels,
540
+ out_channels // attn_num_head_channels,
541
+ in_channels=out_channels,
542
+ num_layers=1,
543
+ cross_attention_dim=cross_attention_dim,
544
+ norm_num_groups=resnet_groups,
545
+ )
546
+ )
547
+ self.attentions = nn.ModuleList(attentions)
548
+ self.resnets = nn.ModuleList(resnets)
549
+
550
+ if add_downsample:
551
+ self.downsamplers = nn.ModuleList(
552
+ [
553
+ Downsample2D(
554
+ in_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
555
+ )
556
+ ]
557
+ )
558
+ else:
559
+ self.downsamplers = None
560
+
561
+ self.gradient_checkpointing = False
562
+
563
+ def set_attention_slice(self, slice_size):
564
+ if slice_size is not None and self.attn_num_head_channels % slice_size != 0:
565
+ raise ValueError(
566
+ f"Make sure slice_size {slice_size} is a divisor of "
567
+ f"the number of heads used in cross_attention {self.attn_num_head_channels}"
568
+ )
569
+ if slice_size is not None and slice_size > self.attn_num_head_channels:
570
+ raise ValueError(
571
+ f"Chunk_size {slice_size} has to be smaller or equal to "
572
+ f"the number of heads used in cross_attention {self.attn_num_head_channels}"
573
+ )
574
+
575
+ for attn in self.attentions:
576
+ attn._set_attention_slice(slice_size)
577
+
578
+ def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
579
+ for attn in self.attentions:
580
+ attn._set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
581
+
582
+ def forward(self, hidden_states, temb=None, encoder_hidden_states=None, attn_map=None, attn_shift=False, obj_ids=None, relationship=None):
583
+ output_states = ()
584
+ cross_attn_prob_list = []
585
+ for layer_idx, (resnet, attn) in enumerate(zip(self.resnets, self.attentions)):
586
+ if self.training and self.gradient_checkpointing:
587
+
588
+ def create_custom_forward(module, return_dict=None):
589
+ def custom_forward(*inputs):
590
+ if return_dict is not None:
591
+ return module(*inputs, return_dict=return_dict)
592
+ else:
593
+ return module(*inputs)
594
+
595
+ return custom_forward
596
+
597
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
598
+ hidden_states = torch.utils.checkpoint.checkpoint(
599
+ create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states
600
+ )[0]
601
+ else:
602
+ hidden_states = resnet(hidden_states, temb)
603
+
604
+ tmp_hidden_states, cross_attn_prob, save_value = attn(hidden_states, encoder_hidden_states=encoder_hidden_states, attn_map=attn_map[layer_idx] if attn_map is not None else None, attn_shift=attn_shift, obj_ids=obj_ids, relationship=relationship)
605
+ hidden_states = tmp_hidden_states.sample
606
+ # hidden_states, cross_attn_prob = attn(hidden_states, encoder_hidden_states=encoder_hidden_states)
607
+ # hidden_states = hidden_states.sample
608
+
609
+ output_states += (hidden_states,)
610
+ cross_attn_prob_list.append(cross_attn_prob)
611
+ if self.downsamplers is not None:
612
+ for downsampler in self.downsamplers:
613
+ hidden_states = downsampler(hidden_states)
614
+
615
+ output_states += (hidden_states,)
616
+
617
+ return hidden_states, output_states, cross_attn_prob_list, save_value
618
+
619
+
620
+ class DownBlock2D(nn.Module):
621
+ def __init__(
622
+ self,
623
+ in_channels: int,
624
+ out_channels: int,
625
+ temb_channels: int,
626
+ dropout: float = 0.0,
627
+ num_layers: int = 1,
628
+ resnet_eps: float = 1e-6,
629
+ resnet_time_scale_shift: str = "default",
630
+ resnet_act_fn: str = "swish",
631
+ resnet_groups: int = 32,
632
+ resnet_pre_norm: bool = True,
633
+ output_scale_factor=1.0,
634
+ add_downsample=True,
635
+ downsample_padding=1,
636
+ ):
637
+ super().__init__()
638
+ resnets = []
639
+
640
+ for i in range(num_layers):
641
+ in_channels = in_channels if i == 0 else out_channels
642
+ resnets.append(
643
+ ResnetBlock2D(
644
+ in_channels=in_channels,
645
+ out_channels=out_channels,
646
+ temb_channels=temb_channels,
647
+ eps=resnet_eps,
648
+ groups=resnet_groups,
649
+ dropout=dropout,
650
+ time_embedding_norm=resnet_time_scale_shift,
651
+ non_linearity=resnet_act_fn,
652
+ output_scale_factor=output_scale_factor,
653
+ pre_norm=resnet_pre_norm,
654
+ )
655
+ )
656
+
657
+ self.resnets = nn.ModuleList(resnets)
658
+
659
+ if add_downsample:
660
+ self.downsamplers = nn.ModuleList(
661
+ [
662
+ Downsample2D(
663
+ in_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
664
+ )
665
+ ]
666
+ )
667
+ else:
668
+ self.downsamplers = None
669
+
670
+ self.gradient_checkpointing = False
671
+
672
+ def forward(self, hidden_states, temb=None):
673
+ output_states = ()
674
+
675
+ for resnet in self.resnets:
676
+ if self.training and self.gradient_checkpointing:
677
+
678
+ def create_custom_forward(module):
679
+ def custom_forward(*inputs):
680
+ return module(*inputs)
681
+
682
+ return custom_forward
683
+
684
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
685
+ else:
686
+ hidden_states = resnet(hidden_states, temb)
687
+
688
+ output_states += (hidden_states,)
689
+
690
+ if self.downsamplers is not None:
691
+ for downsampler in self.downsamplers:
692
+ hidden_states = downsampler(hidden_states)
693
+
694
+ output_states += (hidden_states,)
695
+
696
+ return hidden_states, output_states
697
+
698
+
699
+ class DownEncoderBlock2D(nn.Module):
700
+ def __init__(
701
+ self,
702
+ in_channels: int,
703
+ out_channels: int,
704
+ dropout: float = 0.0,
705
+ num_layers: int = 1,
706
+ resnet_eps: float = 1e-6,
707
+ resnet_time_scale_shift: str = "default",
708
+ resnet_act_fn: str = "swish",
709
+ resnet_groups: int = 32,
710
+ resnet_pre_norm: bool = True,
711
+ output_scale_factor=1.0,
712
+ add_downsample=True,
713
+ downsample_padding=1,
714
+ ):
715
+ super().__init__()
716
+ resnets = []
717
+
718
+ for i in range(num_layers):
719
+ in_channels = in_channels if i == 0 else out_channels
720
+ resnets.append(
721
+ ResnetBlock2D(
722
+ in_channels=in_channels,
723
+ out_channels=out_channels,
724
+ temb_channels=None,
725
+ eps=resnet_eps,
726
+ groups=resnet_groups,
727
+ dropout=dropout,
728
+ time_embedding_norm=resnet_time_scale_shift,
729
+ non_linearity=resnet_act_fn,
730
+ output_scale_factor=output_scale_factor,
731
+ pre_norm=resnet_pre_norm,
732
+ )
733
+ )
734
+
735
+ self.resnets = nn.ModuleList(resnets)
736
+
737
+ if add_downsample:
738
+ self.downsamplers = nn.ModuleList(
739
+ [
740
+ Downsample2D(
741
+ in_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
742
+ )
743
+ ]
744
+ )
745
+ else:
746
+ self.downsamplers = None
747
+
748
+ def forward(self, hidden_states):
749
+ for resnet in self.resnets:
750
+ hidden_states = resnet(hidden_states, temb=None)
751
+
752
+ if self.downsamplers is not None:
753
+ for downsampler in self.downsamplers:
754
+ hidden_states = downsampler(hidden_states)
755
+
756
+ return hidden_states
757
+
758
+
759
+ class AttnDownEncoderBlock2D(nn.Module):
760
+ def __init__(
761
+ self,
762
+ in_channels: int,
763
+ out_channels: int,
764
+ dropout: float = 0.0,
765
+ num_layers: int = 1,
766
+ resnet_eps: float = 1e-6,
767
+ resnet_time_scale_shift: str = "default",
768
+ resnet_act_fn: str = "swish",
769
+ resnet_groups: int = 32,
770
+ resnet_pre_norm: bool = True,
771
+ attn_num_head_channels=1,
772
+ output_scale_factor=1.0,
773
+ add_downsample=True,
774
+ downsample_padding=1,
775
+ ):
776
+ super().__init__()
777
+ resnets = []
778
+ attentions = []
779
+
780
+ for i in range(num_layers):
781
+ in_channels = in_channels if i == 0 else out_channels
782
+ resnets.append(
783
+ ResnetBlock2D(
784
+ in_channels=in_channels,
785
+ out_channels=out_channels,
786
+ temb_channels=None,
787
+ eps=resnet_eps,
788
+ groups=resnet_groups,
789
+ dropout=dropout,
790
+ time_embedding_norm=resnet_time_scale_shift,
791
+ non_linearity=resnet_act_fn,
792
+ output_scale_factor=output_scale_factor,
793
+ pre_norm=resnet_pre_norm,
794
+ )
795
+ )
796
+ attentions.append(
797
+ AttentionBlock(
798
+ out_channels,
799
+ num_head_channels=attn_num_head_channels,
800
+ rescale_output_factor=output_scale_factor,
801
+ eps=resnet_eps,
802
+ norm_num_groups=resnet_groups,
803
+ )
804
+ )
805
+
806
+ self.attentions = nn.ModuleList(attentions)
807
+ self.resnets = nn.ModuleList(resnets)
808
+
809
+ if add_downsample:
810
+ self.downsamplers = nn.ModuleList(
811
+ [
812
+ Downsample2D(
813
+ in_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
814
+ )
815
+ ]
816
+ )
817
+ else:
818
+ self.downsamplers = None
819
+
820
+ def forward(self, hidden_states):
821
+ for resnet, attn in zip(self.resnets, self.attentions):
822
+ hidden_states = resnet(hidden_states, temb=None)
823
+ hidden_states = attn(hidden_states)
824
+
825
+ if self.downsamplers is not None:
826
+ for downsampler in self.downsamplers:
827
+ hidden_states = downsampler(hidden_states)
828
+
829
+ return hidden_states
830
+
831
+
832
+ class AttnSkipDownBlock2D(nn.Module):
833
+ def __init__(
834
+ self,
835
+ in_channels: int,
836
+ out_channels: int,
837
+ temb_channels: int,
838
+ dropout: float = 0.0,
839
+ num_layers: int = 1,
840
+ resnet_eps: float = 1e-6,
841
+ resnet_time_scale_shift: str = "default",
842
+ resnet_act_fn: str = "swish",
843
+ resnet_pre_norm: bool = True,
844
+ attn_num_head_channels=1,
845
+ attention_type="default",
846
+ output_scale_factor=np.sqrt(2.0),
847
+ downsample_padding=1,
848
+ add_downsample=True,
849
+ ):
850
+ super().__init__()
851
+ self.attentions = nn.ModuleList([])
852
+ self.resnets = nn.ModuleList([])
853
+
854
+ self.attention_type = attention_type
855
+
856
+ for i in range(num_layers):
857
+ in_channels = in_channels if i == 0 else out_channels
858
+ self.resnets.append(
859
+ ResnetBlock2D(
860
+ in_channels=in_channels,
861
+ out_channels=out_channels,
862
+ temb_channels=temb_channels,
863
+ eps=resnet_eps,
864
+ groups=min(in_channels // 4, 32),
865
+ groups_out=min(out_channels // 4, 32),
866
+ dropout=dropout,
867
+ time_embedding_norm=resnet_time_scale_shift,
868
+ non_linearity=resnet_act_fn,
869
+ output_scale_factor=output_scale_factor,
870
+ pre_norm=resnet_pre_norm,
871
+ )
872
+ )
873
+ self.attentions.append(
874
+ AttentionBlock(
875
+ out_channels,
876
+ num_head_channels=attn_num_head_channels,
877
+ rescale_output_factor=output_scale_factor,
878
+ eps=resnet_eps,
879
+ )
880
+ )
881
+
882
+ if add_downsample:
883
+ self.resnet_down = ResnetBlock2D(
884
+ in_channels=out_channels,
885
+ out_channels=out_channels,
886
+ temb_channels=temb_channels,
887
+ eps=resnet_eps,
888
+ groups=min(out_channels // 4, 32),
889
+ dropout=dropout,
890
+ time_embedding_norm=resnet_time_scale_shift,
891
+ non_linearity=resnet_act_fn,
892
+ output_scale_factor=output_scale_factor,
893
+ pre_norm=resnet_pre_norm,
894
+ use_in_shortcut=True,
895
+ down=True,
896
+ kernel="fir",
897
+ )
898
+ self.downsamplers = nn.ModuleList([FirDownsample2D(in_channels, out_channels=out_channels)])
899
+ self.skip_conv = nn.Conv2d(3, out_channels, kernel_size=(1, 1), stride=(1, 1))
900
+ else:
901
+ self.resnet_down = None
902
+ self.downsamplers = None
903
+ self.skip_conv = None
904
+
905
+ def forward(self, hidden_states, temb=None, skip_sample=None):
906
+ output_states = ()
907
+
908
+ for resnet, attn in zip(self.resnets, self.attentions):
909
+ hidden_states = resnet(hidden_states, temb)
910
+ hidden_states = attn(hidden_states)
911
+ output_states += (hidden_states,)
912
+
913
+ if self.downsamplers is not None:
914
+ hidden_states = self.resnet_down(hidden_states, temb)
915
+ for downsampler in self.downsamplers:
916
+ skip_sample = downsampler(skip_sample)
917
+
918
+ hidden_states = self.skip_conv(skip_sample) + hidden_states
919
+
920
+ output_states += (hidden_states,)
921
+
922
+ return hidden_states, output_states, skip_sample
923
+
924
+
925
+ class SkipDownBlock2D(nn.Module):
926
+ def __init__(
927
+ self,
928
+ in_channels: int,
929
+ out_channels: int,
930
+ temb_channels: int,
931
+ dropout: float = 0.0,
932
+ num_layers: int = 1,
933
+ resnet_eps: float = 1e-6,
934
+ resnet_time_scale_shift: str = "default",
935
+ resnet_act_fn: str = "swish",
936
+ resnet_pre_norm: bool = True,
937
+ output_scale_factor=np.sqrt(2.0),
938
+ add_downsample=True,
939
+ downsample_padding=1,
940
+ ):
941
+ super().__init__()
942
+ self.resnets = nn.ModuleList([])
943
+
944
+ for i in range(num_layers):
945
+ in_channels = in_channels if i == 0 else out_channels
946
+ self.resnets.append(
947
+ ResnetBlock2D(
948
+ in_channels=in_channels,
949
+ out_channels=out_channels,
950
+ temb_channels=temb_channels,
951
+ eps=resnet_eps,
952
+ groups=min(in_channels // 4, 32),
953
+ groups_out=min(out_channels // 4, 32),
954
+ dropout=dropout,
955
+ time_embedding_norm=resnet_time_scale_shift,
956
+ non_linearity=resnet_act_fn,
957
+ output_scale_factor=output_scale_factor,
958
+ pre_norm=resnet_pre_norm,
959
+ )
960
+ )
961
+
962
+ if add_downsample:
963
+ self.resnet_down = ResnetBlock2D(
964
+ in_channels=out_channels,
965
+ out_channels=out_channels,
966
+ temb_channels=temb_channels,
967
+ eps=resnet_eps,
968
+ groups=min(out_channels // 4, 32),
969
+ dropout=dropout,
970
+ time_embedding_norm=resnet_time_scale_shift,
971
+ non_linearity=resnet_act_fn,
972
+ output_scale_factor=output_scale_factor,
973
+ pre_norm=resnet_pre_norm,
974
+ use_in_shortcut=True,
975
+ down=True,
976
+ kernel="fir",
977
+ )
978
+ self.downsamplers = nn.ModuleList([FirDownsample2D(in_channels, out_channels=out_channels)])
979
+ self.skip_conv = nn.Conv2d(3, out_channels, kernel_size=(1, 1), stride=(1, 1))
980
+ else:
981
+ self.resnet_down = None
982
+ self.downsamplers = None
983
+ self.skip_conv = None
984
+
985
+ def forward(self, hidden_states, temb=None, skip_sample=None):
986
+ output_states = ()
987
+
988
+ for resnet in self.resnets:
989
+ hidden_states = resnet(hidden_states, temb)
990
+ output_states += (hidden_states,)
991
+
992
+ if self.downsamplers is not None:
993
+ hidden_states = self.resnet_down(hidden_states, temb)
994
+ for downsampler in self.downsamplers:
995
+ skip_sample = downsampler(skip_sample)
996
+
997
+ hidden_states = self.skip_conv(skip_sample) + hidden_states
998
+
999
+ output_states += (hidden_states,)
1000
+
1001
+ return hidden_states, output_states, skip_sample
1002
+
1003
+
1004
+ class AttnUpBlock2D(nn.Module):
1005
+ def __init__(
1006
+ self,
1007
+ in_channels: int,
1008
+ prev_output_channel: int,
1009
+ out_channels: int,
1010
+ temb_channels: int,
1011
+ dropout: float = 0.0,
1012
+ num_layers: int = 1,
1013
+ resnet_eps: float = 1e-6,
1014
+ resnet_time_scale_shift: str = "default",
1015
+ resnet_act_fn: str = "swish",
1016
+ resnet_groups: int = 32,
1017
+ resnet_pre_norm: bool = True,
1018
+ attention_type="default",
1019
+ attn_num_head_channels=1,
1020
+ output_scale_factor=1.0,
1021
+ add_upsample=True,
1022
+ ):
1023
+ super().__init__()
1024
+ resnets = []
1025
+ attentions = []
1026
+
1027
+ self.attention_type = attention_type
1028
+
1029
+ for i in range(num_layers):
1030
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
1031
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
1032
+
1033
+ resnets.append(
1034
+ ResnetBlock2D(
1035
+ in_channels=resnet_in_channels + res_skip_channels,
1036
+ out_channels=out_channels,
1037
+ temb_channels=temb_channels,
1038
+ eps=resnet_eps,
1039
+ groups=resnet_groups,
1040
+ dropout=dropout,
1041
+ time_embedding_norm=resnet_time_scale_shift,
1042
+ non_linearity=resnet_act_fn,
1043
+ output_scale_factor=output_scale_factor,
1044
+ pre_norm=resnet_pre_norm,
1045
+ )
1046
+ )
1047
+ attentions.append(
1048
+ AttentionBlock(
1049
+ out_channels,
1050
+ num_head_channels=attn_num_head_channels,
1051
+ rescale_output_factor=output_scale_factor,
1052
+ eps=resnet_eps,
1053
+ norm_num_groups=resnet_groups,
1054
+ )
1055
+ )
1056
+
1057
+ self.attentions = nn.ModuleList(attentions)
1058
+ self.resnets = nn.ModuleList(resnets)
1059
+
1060
+ if add_upsample:
1061
+ self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
1062
+ else:
1063
+ self.upsamplers = None
1064
+
1065
+ def forward(self, hidden_states, res_hidden_states_tuple, temb=None):
1066
+ for resnet, attn in zip(self.resnets, self.attentions):
1067
+ # pop res hidden states
1068
+ res_hidden_states = res_hidden_states_tuple[-1]
1069
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
1070
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
1071
+
1072
+ hidden_states = resnet(hidden_states, temb)
1073
+ hidden_states = attn(hidden_states)
1074
+
1075
+ if self.upsamplers is not None:
1076
+ for upsampler in self.upsamplers:
1077
+ hidden_states = upsampler(hidden_states)
1078
+
1079
+ return hidden_states
1080
+
1081
+
1082
+ class CrossAttnUpBlock2D(nn.Module):
1083
+ def __init__(
1084
+ self,
1085
+ in_channels: int,
1086
+ out_channels: int,
1087
+ prev_output_channel: int,
1088
+ temb_channels: int,
1089
+ dropout: float = 0.0,
1090
+ num_layers: int = 1,
1091
+ resnet_eps: float = 1e-6,
1092
+ resnet_time_scale_shift: str = "default",
1093
+ resnet_act_fn: str = "swish",
1094
+ resnet_groups: int = 32,
1095
+ resnet_pre_norm: bool = True,
1096
+ attn_num_head_channels=1,
1097
+ cross_attention_dim=1280,
1098
+ attention_type="default",
1099
+ output_scale_factor=1.0,
1100
+ add_upsample=True,
1101
+ ):
1102
+ super().__init__()
1103
+ resnets = []
1104
+ attentions = []
1105
+
1106
+ self.attention_type = attention_type
1107
+ self.attn_num_head_channels = attn_num_head_channels
1108
+
1109
+ for i in range(num_layers):
1110
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
1111
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
1112
+
1113
+ resnets.append(
1114
+ ResnetBlock2D(
1115
+ in_channels=resnet_in_channels + res_skip_channels,
1116
+ out_channels=out_channels,
1117
+ temb_channels=temb_channels,
1118
+ eps=resnet_eps,
1119
+ groups=resnet_groups,
1120
+ dropout=dropout,
1121
+ time_embedding_norm=resnet_time_scale_shift,
1122
+ non_linearity=resnet_act_fn,
1123
+ output_scale_factor=output_scale_factor,
1124
+ pre_norm=resnet_pre_norm,
1125
+ )
1126
+ )
1127
+ attentions.append(
1128
+ Transformer2DModel(
1129
+ attn_num_head_channels,
1130
+ out_channels // attn_num_head_channels,
1131
+ in_channels=out_channels,
1132
+ num_layers=1,
1133
+ cross_attention_dim=cross_attention_dim,
1134
+ norm_num_groups=resnet_groups,
1135
+ )
1136
+ )
1137
+ self.attentions = nn.ModuleList(attentions)
1138
+ self.resnets = nn.ModuleList(resnets)
1139
+
1140
+ if add_upsample:
1141
+ self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
1142
+ else:
1143
+ self.upsamplers = None
1144
+
1145
+ self.gradient_checkpointing = False
1146
+
1147
+ def set_attention_slice(self, slice_size):
1148
+ if slice_size is not None and self.attn_num_head_channels % slice_size != 0:
1149
+ raise ValueError(
1150
+ f"Make sure slice_size {slice_size} is a divisor of "
1151
+ f"the number of heads used in cross_attention {self.attn_num_head_channels}"
1152
+ )
1153
+ if slice_size is not None and slice_size > self.attn_num_head_channels:
1154
+ raise ValueError(
1155
+ f"Chunk_size {slice_size} has to be smaller or equal to "
1156
+ f"the number of heads used in cross_attention {self.attn_num_head_channels}"
1157
+ )
1158
+
1159
+ for attn in self.attentions:
1160
+ attn._set_attention_slice(slice_size)
1161
+
1162
+ self.gradient_checkpointing = False
1163
+
1164
+ def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
1165
+ for attn in self.attentions:
1166
+ attn._set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
1167
+
1168
+ def forward(
1169
+ self,
1170
+ hidden_states,
1171
+ res_hidden_states_tuple,
1172
+ temb=None,
1173
+ encoder_hidden_states=None,
1174
+ upsample_size=None,
1175
+ attn_map=None,
1176
+ attn_shift=False,
1177
+ obj_ids=None,
1178
+ relationship=None
1179
+ ):
1180
+ cross_attn_prob_list = list()
1181
+ for layer_idx, (resnet, attn) in enumerate(zip(self.resnets, self.attentions)):
1182
+ # pop res hidden states
1183
+ res_hidden_states = res_hidden_states_tuple[-1]
1184
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
1185
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
1186
+
1187
+ if self.training and self.gradient_checkpointing:
1188
+
1189
+ def create_custom_forward(module, return_dict=None):
1190
+ def custom_forward(*inputs):
1191
+ if return_dict is not None:
1192
+ return module(*inputs, return_dict=return_dict)
1193
+ else:
1194
+ return module(*inputs)
1195
+
1196
+ return custom_forward
1197
+
1198
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
1199
+ hidden_states = torch.utils.checkpoint.checkpoint(
1200
+ create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states
1201
+ )[0]
1202
+ else:
1203
+ hidden_states = resnet(hidden_states, temb)
1204
+ tmp_hidden_states, cross_attn_prob, save_value = attn(hidden_states, encoder_hidden_states=encoder_hidden_states, attn_map=attn_map[layer_idx] if attn_map is not None else None, attn_shift=attn_shift, obj_ids=obj_ids, relationship=relationship)
1205
+ hidden_states = tmp_hidden_states.sample
1206
+ cross_attn_prob_list.append(cross_attn_prob)
1207
+ if self.upsamplers is not None:
1208
+ for upsampler in self.upsamplers:
1209
+ hidden_states = upsampler(hidden_states, upsample_size)
1210
+
1211
+ return hidden_states, cross_attn_prob_list, save_value
1212
+
1213
+
1214
+ class UpBlock2D(nn.Module):
1215
+ def __init__(
1216
+ self,
1217
+ in_channels: int,
1218
+ prev_output_channel: int,
1219
+ out_channels: int,
1220
+ temb_channels: int,
1221
+ dropout: float = 0.0,
1222
+ num_layers: int = 1,
1223
+ resnet_eps: float = 1e-6,
1224
+ resnet_time_scale_shift: str = "default",
1225
+ resnet_act_fn: str = "swish",
1226
+ resnet_groups: int = 32,
1227
+ resnet_pre_norm: bool = True,
1228
+ output_scale_factor=1.0,
1229
+ add_upsample=True,
1230
+ ):
1231
+ super().__init__()
1232
+ resnets = []
1233
+
1234
+ for i in range(num_layers):
1235
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
1236
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
1237
+
1238
+ resnets.append(
1239
+ ResnetBlock2D(
1240
+ in_channels=resnet_in_channels + res_skip_channels,
1241
+ out_channels=out_channels,
1242
+ temb_channels=temb_channels,
1243
+ eps=resnet_eps,
1244
+ groups=resnet_groups,
1245
+ dropout=dropout,
1246
+ time_embedding_norm=resnet_time_scale_shift,
1247
+ non_linearity=resnet_act_fn,
1248
+ output_scale_factor=output_scale_factor,
1249
+ pre_norm=resnet_pre_norm,
1250
+ )
1251
+ )
1252
+
1253
+ self.resnets = nn.ModuleList(resnets)
1254
+
1255
+ if add_upsample:
1256
+ self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
1257
+ else:
1258
+ self.upsamplers = None
1259
+
1260
+ self.gradient_checkpointing = False
1261
+
1262
+ def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None):
1263
+ for resnet in self.resnets:
1264
+ # pop res hidden states
1265
+ res_hidden_states = res_hidden_states_tuple[-1]
1266
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
1267
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
1268
+
1269
+ if self.training and self.gradient_checkpointing:
1270
+
1271
+ def create_custom_forward(module):
1272
+ def custom_forward(*inputs):
1273
+ return module(*inputs)
1274
+
1275
+ return custom_forward
1276
+
1277
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
1278
+ else:
1279
+ hidden_states = resnet(hidden_states, temb)
1280
+
1281
+ if self.upsamplers is not None:
1282
+ for upsampler in self.upsamplers:
1283
+ hidden_states = upsampler(hidden_states, upsample_size)
1284
+
1285
+ return hidden_states
1286
+
1287
+
1288
+ class UpDecoderBlock2D(nn.Module):
1289
+ def __init__(
1290
+ self,
1291
+ in_channels: int,
1292
+ out_channels: int,
1293
+ dropout: float = 0.0,
1294
+ num_layers: int = 1,
1295
+ resnet_eps: float = 1e-6,
1296
+ resnet_time_scale_shift: str = "default",
1297
+ resnet_act_fn: str = "swish",
1298
+ resnet_groups: int = 32,
1299
+ resnet_pre_norm: bool = True,
1300
+ output_scale_factor=1.0,
1301
+ add_upsample=True,
1302
+ ):
1303
+ super().__init__()
1304
+ resnets = []
1305
+
1306
+ for i in range(num_layers):
1307
+ input_channels = in_channels if i == 0 else out_channels
1308
+
1309
+ resnets.append(
1310
+ ResnetBlock2D(
1311
+ in_channels=input_channels,
1312
+ out_channels=out_channels,
1313
+ temb_channels=None,
1314
+ eps=resnet_eps,
1315
+ groups=resnet_groups,
1316
+ dropout=dropout,
1317
+ time_embedding_norm=resnet_time_scale_shift,
1318
+ non_linearity=resnet_act_fn,
1319
+ output_scale_factor=output_scale_factor,
1320
+ pre_norm=resnet_pre_norm,
1321
+ )
1322
+ )
1323
+
1324
+ self.resnets = nn.ModuleList(resnets)
1325
+
1326
+ if add_upsample:
1327
+ self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
1328
+ else:
1329
+ self.upsamplers = None
1330
+
1331
+ def forward(self, hidden_states):
1332
+ for resnet in self.resnets:
1333
+ hidden_states = resnet(hidden_states, temb=None)
1334
+
1335
+ if self.upsamplers is not None:
1336
+ for upsampler in self.upsamplers:
1337
+ hidden_states = upsampler(hidden_states)
1338
+
1339
+ return hidden_states
1340
+
1341
+
1342
+ class AttnUpDecoderBlock2D(nn.Module):
1343
+ def __init__(
1344
+ self,
1345
+ in_channels: int,
1346
+ out_channels: int,
1347
+ dropout: float = 0.0,
1348
+ num_layers: int = 1,
1349
+ resnet_eps: float = 1e-6,
1350
+ resnet_time_scale_shift: str = "default",
1351
+ resnet_act_fn: str = "swish",
1352
+ resnet_groups: int = 32,
1353
+ resnet_pre_norm: bool = True,
1354
+ attn_num_head_channels=1,
1355
+ output_scale_factor=1.0,
1356
+ add_upsample=True,
1357
+ ):
1358
+ super().__init__()
1359
+ resnets = []
1360
+ attentions = []
1361
+
1362
+ for i in range(num_layers):
1363
+ input_channels = in_channels if i == 0 else out_channels
1364
+
1365
+ resnets.append(
1366
+ ResnetBlock2D(
1367
+ in_channels=input_channels,
1368
+ out_channels=out_channels,
1369
+ temb_channels=None,
1370
+ eps=resnet_eps,
1371
+ groups=resnet_groups,
1372
+ dropout=dropout,
1373
+ time_embedding_norm=resnet_time_scale_shift,
1374
+ non_linearity=resnet_act_fn,
1375
+ output_scale_factor=output_scale_factor,
1376
+ pre_norm=resnet_pre_norm,
1377
+ )
1378
+ )
1379
+ attentions.append(
1380
+ AttentionBlock(
1381
+ out_channels,
1382
+ num_head_channels=attn_num_head_channels,
1383
+ rescale_output_factor=output_scale_factor,
1384
+ eps=resnet_eps,
1385
+ norm_num_groups=resnet_groups,
1386
+ )
1387
+ )
1388
+
1389
+ self.attentions = nn.ModuleList(attentions)
1390
+ self.resnets = nn.ModuleList(resnets)
1391
+
1392
+ if add_upsample:
1393
+ self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
1394
+ else:
1395
+ self.upsamplers = None
1396
+
1397
+ def forward(self, hidden_states):
1398
+ for resnet, attn in zip(self.resnets, self.attentions):
1399
+ hidden_states = resnet(hidden_states, temb=None)
1400
+ hidden_states = attn(hidden_states)
1401
+
1402
+ if self.upsamplers is not None:
1403
+ for upsampler in self.upsamplers:
1404
+ hidden_states = upsampler(hidden_states)
1405
+
1406
+ return hidden_states
1407
+
1408
+
1409
+ class AttnSkipUpBlock2D(nn.Module):
1410
+ def __init__(
1411
+ self,
1412
+ in_channels: int,
1413
+ prev_output_channel: int,
1414
+ out_channels: int,
1415
+ temb_channels: int,
1416
+ dropout: float = 0.0,
1417
+ num_layers: int = 1,
1418
+ resnet_eps: float = 1e-6,
1419
+ resnet_time_scale_shift: str = "default",
1420
+ resnet_act_fn: str = "swish",
1421
+ resnet_pre_norm: bool = True,
1422
+ attn_num_head_channels=1,
1423
+ attention_type="default",
1424
+ output_scale_factor=np.sqrt(2.0),
1425
+ upsample_padding=1,
1426
+ add_upsample=True,
1427
+ ):
1428
+ super().__init__()
1429
+ self.attentions = nn.ModuleList([])
1430
+ self.resnets = nn.ModuleList([])
1431
+
1432
+ self.attention_type = attention_type
1433
+
1434
+ for i in range(num_layers):
1435
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
1436
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
1437
+
1438
+ self.resnets.append(
1439
+ ResnetBlock2D(
1440
+ in_channels=resnet_in_channels + res_skip_channels,
1441
+ out_channels=out_channels,
1442
+ temb_channels=temb_channels,
1443
+ eps=resnet_eps,
1444
+ groups=min((resnet_in_channels + res_skip_channels) // 4, 32),
1445
+ groups_out=min(out_channels // 4, 32),
1446
+ dropout=dropout,
1447
+ time_embedding_norm=resnet_time_scale_shift,
1448
+ non_linearity=resnet_act_fn,
1449
+ output_scale_factor=output_scale_factor,
1450
+ pre_norm=resnet_pre_norm,
1451
+ )
1452
+ )
1453
+
1454
+ self.attentions.append(
1455
+ AttentionBlock(
1456
+ out_channels,
1457
+ num_head_channels=attn_num_head_channels,
1458
+ rescale_output_factor=output_scale_factor,
1459
+ eps=resnet_eps,
1460
+ )
1461
+ )
1462
+
1463
+ self.upsampler = FirUpsample2D(in_channels, out_channels=out_channels)
1464
+ if add_upsample:
1465
+ self.resnet_up = ResnetBlock2D(
1466
+ in_channels=out_channels,
1467
+ out_channels=out_channels,
1468
+ temb_channels=temb_channels,
1469
+ eps=resnet_eps,
1470
+ groups=min(out_channels // 4, 32),
1471
+ groups_out=min(out_channels // 4, 32),
1472
+ dropout=dropout,
1473
+ time_embedding_norm=resnet_time_scale_shift,
1474
+ non_linearity=resnet_act_fn,
1475
+ output_scale_factor=output_scale_factor,
1476
+ pre_norm=resnet_pre_norm,
1477
+ use_in_shortcut=True,
1478
+ up=True,
1479
+ kernel="fir",
1480
+ )
1481
+ self.skip_conv = nn.Conv2d(out_channels, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
1482
+ self.skip_norm = torch.nn.GroupNorm(
1483
+ num_groups=min(out_channels // 4, 32), num_channels=out_channels, eps=resnet_eps, affine=True
1484
+ )
1485
+ self.act = nn.SiLU()
1486
+ else:
1487
+ self.resnet_up = None
1488
+ self.skip_conv = None
1489
+ self.skip_norm = None
1490
+ self.act = None
1491
+
1492
+ def forward(self, hidden_states, res_hidden_states_tuple, temb=None, skip_sample=None):
1493
+ for resnet in self.resnets:
1494
+ # pop res hidden states
1495
+ res_hidden_states = res_hidden_states_tuple[-1]
1496
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
1497
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
1498
+
1499
+ hidden_states = resnet(hidden_states, temb)
1500
+
1501
+ hidden_states = self.attentions[0](hidden_states)
1502
+
1503
+ if skip_sample is not None:
1504
+ skip_sample = self.upsampler(skip_sample)
1505
+ else:
1506
+ skip_sample = 0
1507
+
1508
+ if self.resnet_up is not None:
1509
+ skip_sample_states = self.skip_norm(hidden_states)
1510
+ skip_sample_states = self.act(skip_sample_states)
1511
+ skip_sample_states = self.skip_conv(skip_sample_states)
1512
+
1513
+ skip_sample = skip_sample + skip_sample_states
1514
+
1515
+ hidden_states = self.resnet_up(hidden_states, temb)
1516
+
1517
+ return hidden_states, skip_sample
1518
+
1519
+
1520
+ class SkipUpBlock2D(nn.Module):
1521
+ def __init__(
1522
+ self,
1523
+ in_channels: int,
1524
+ prev_output_channel: int,
1525
+ out_channels: int,
1526
+ temb_channels: int,
1527
+ dropout: float = 0.0,
1528
+ num_layers: int = 1,
1529
+ resnet_eps: float = 1e-6,
1530
+ resnet_time_scale_shift: str = "default",
1531
+ resnet_act_fn: str = "swish",
1532
+ resnet_pre_norm: bool = True,
1533
+ output_scale_factor=np.sqrt(2.0),
1534
+ add_upsample=True,
1535
+ upsample_padding=1,
1536
+ ):
1537
+ super().__init__()
1538
+ self.resnets = nn.ModuleList([])
1539
+
1540
+ for i in range(num_layers):
1541
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
1542
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
1543
+
1544
+ self.resnets.append(
1545
+ ResnetBlock2D(
1546
+ in_channels=resnet_in_channels + res_skip_channels,
1547
+ out_channels=out_channels,
1548
+ temb_channels=temb_channels,
1549
+ eps=resnet_eps,
1550
+ groups=min((resnet_in_channels + res_skip_channels) // 4, 32),
1551
+ groups_out=min(out_channels // 4, 32),
1552
+ dropout=dropout,
1553
+ time_embedding_norm=resnet_time_scale_shift,
1554
+ non_linearity=resnet_act_fn,
1555
+ output_scale_factor=output_scale_factor,
1556
+ pre_norm=resnet_pre_norm,
1557
+ )
1558
+ )
1559
+
1560
+ self.upsampler = FirUpsample2D(in_channels, out_channels=out_channels)
1561
+ if add_upsample:
1562
+ self.resnet_up = ResnetBlock2D(
1563
+ in_channels=out_channels,
1564
+ out_channels=out_channels,
1565
+ temb_channels=temb_channels,
1566
+ eps=resnet_eps,
1567
+ groups=min(out_channels // 4, 32),
1568
+ groups_out=min(out_channels // 4, 32),
1569
+ dropout=dropout,
1570
+ time_embedding_norm=resnet_time_scale_shift,
1571
+ non_linearity=resnet_act_fn,
1572
+ output_scale_factor=output_scale_factor,
1573
+ pre_norm=resnet_pre_norm,
1574
+ use_in_shortcut=True,
1575
+ up=True,
1576
+ kernel="fir",
1577
+ )
1578
+ self.skip_conv = nn.Conv2d(out_channels, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
1579
+ self.skip_norm = torch.nn.GroupNorm(
1580
+ num_groups=min(out_channels // 4, 32), num_channels=out_channels, eps=resnet_eps, affine=True
1581
+ )
1582
+ self.act = nn.SiLU()
1583
+ else:
1584
+ self.resnet_up = None
1585
+ self.skip_conv = None
1586
+ self.skip_norm = None
1587
+ self.act = None
1588
+
1589
+ def forward(self, hidden_states, res_hidden_states_tuple, temb=None, skip_sample=None):
1590
+ for resnet in self.resnets:
1591
+ # pop res hidden states
1592
+ res_hidden_states = res_hidden_states_tuple[-1]
1593
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
1594
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
1595
+
1596
+ hidden_states = resnet(hidden_states, temb)
1597
+
1598
+ if skip_sample is not None:
1599
+ skip_sample = self.upsampler(skip_sample)
1600
+ else:
1601
+ skip_sample = 0
1602
+
1603
+ if self.resnet_up is not None:
1604
+ skip_sample_states = self.skip_norm(hidden_states)
1605
+ skip_sample_states = self.act(skip_sample_states)
1606
+ skip_sample_states = self.skip_conv(skip_sample_states)
1607
+
1608
+ skip_sample = skip_sample + skip_sample_states
1609
+
1610
+ hidden_states = self.resnet_up(hidden_states, temb)
1611
+
1612
+ return hidden_states, skip_sample
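Taken together, the cross-attention down/mid/up blocks in this file keep the diffusers layer layout but change the forward contract: each call also returns the per-layer cross-attention probabilities and saved values so that the UNet wrapper in the next file can collect them. A sketch for the down block, under the same assumptions as the mid-block example above (modified Transformer2DModel available, illustrative shapes):

    import torch
    from my_model.unet_2d_blocks import CrossAttnDownBlock2D

    down = CrossAttnDownBlock2D(in_channels=320, out_channels=320, temb_channels=1280,
                                num_layers=2, attn_num_head_channels=8,
                                cross_attention_dim=768)
    hidden = torch.randn(2, 320, 64, 64)
    temb = torch.randn(2, 1280)
    text = torch.randn(2, 77, 768)
    # attn_map=None skips the attention-injection path entirely.
    hidden, skips, cross_attn_probs, save_value = down(hidden, temb=temb,
                                                       encoder_hidden_states=text,
                                                       attn_map=None)
    # `skips` is the usual tuple of residual states handed to the matching up block.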
my_model/unet_2d_condition.py ADDED
@@ -0,0 +1,389 @@
1
+ # Copyright 2022 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import pdb
15
+ from dataclasses import dataclass
16
+ from typing import Optional, Tuple, Union
17
+
18
+ import torch
19
+ import torch.nn as nn
20
+ import torch.utils.checkpoint
21
+
22
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
23
+ from diffusers.modeling_utils import ModelMixin
24
+ from diffusers.utils import BaseOutput, logging
25
+ from diffusers.models.embeddings import TimestepEmbedding, Timesteps
26
+ from .unet_2d_blocks import (
27
+ CrossAttnDownBlock2D,
28
+ CrossAttnUpBlock2D,
29
+ DownBlock2D,
30
+ UNetMidBlock2DCrossAttn,
31
+ UpBlock2D,
32
+ get_down_block,
33
+ get_up_block,
34
+ )
35
+
36
+
37
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
38
+
39
+
40
+ @dataclass
41
+ class UNet2DConditionOutput(BaseOutput):
42
+ """
43
+ Args:
44
+ sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
45
+ Hidden states conditioned on `encoder_hidden_states` input. Output of last layer of model.
46
+ """
47
+
48
+ sample: torch.FloatTensor
49
+
50
+
51
+ class UNet2DConditionModel(ModelMixin, ConfigMixin):
52
+ r"""
53
+ UNet2DConditionModel is a conditional 2D UNet model that takes in a noisy sample, conditional state, and a timestep
54
+ and returns sample shaped output.
55
+
56
+ This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library
57
+ implements for all the models (such as downloading or saving, etc.)
58
+
59
+ Parameters:
60
+ sample_size (`int`, *optional*): The size of the input sample.
61
+ in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample.
62
+ out_channels (`int`, *optional*, defaults to 4): The number of channels in the output.
63
+ center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
64
+ flip_sin_to_cos (`bool`, *optional*, defaults to `False`):
65
+ Whether to flip the sin to cos in the time embedding.
66
+ freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
67
+ down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
68
+ The tuple of downsample blocks to use.
69
+ up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D",)`):
70
+ The tuple of upsample blocks to use.
71
+ block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
72
+ The tuple of output channels for each block.
73
+ layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
74
+ downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
75
+ mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
76
+ act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
77
+ norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
78
+ norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
79
+ cross_attention_dim (`int`, *optional*, defaults to 1280): The dimension of the cross attention features.
80
+ attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
81
+ """
82
+
83
+ _supports_gradient_checkpointing = True
84
+
85
+ @register_to_config
86
+ def __init__(
87
+ self,
88
+ sample_size: Optional[int] = None,
89
+ in_channels: int = 4,
90
+ out_channels: int = 4,
91
+ center_input_sample: bool = False,
92
+ flip_sin_to_cos: bool = True,
93
+ freq_shift: int = 0,
94
+ down_block_types: Tuple[str] = (
95
+ "CrossAttnDownBlock2D",
96
+ "CrossAttnDownBlock2D",
97
+ "CrossAttnDownBlock2D",
98
+ "DownBlock2D",
99
+ ),
100
+ up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
101
+ block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
102
+ layers_per_block: int = 2,
103
+ downsample_padding: int = 1,
104
+ mid_block_scale_factor: float = 1,
105
+ act_fn: str = "silu",
106
+ norm_num_groups: int = 32,
107
+ norm_eps: float = 1e-5,
108
+ cross_attention_dim: int = 1280,
109
+ attention_head_dim: int = 8,
110
+ ):
111
+ super().__init__()
112
+
113
+ self.sample_size = sample_size
114
+ time_embed_dim = block_out_channels[0] * 4
115
+
116
+ # input
117
+ self.conv_in = nn.Conv2d(in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1))
118
+
119
+ # time
120
+ self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
121
+ timestep_input_dim = block_out_channels[0]
122
+
123
+ self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
124
+
125
+ self.down_blocks = nn.ModuleList([])
126
+ self.mid_block = None
127
+ self.up_blocks = nn.ModuleList([])
128
+
129
+ # down
130
+ output_channel = block_out_channels[0]
131
+ for i, down_block_type in enumerate(down_block_types):
132
+ input_channel = output_channel
133
+ output_channel = block_out_channels[i]
134
+ is_final_block = i == len(block_out_channels) - 1
135
+
136
+ down_block = get_down_block(
137
+ down_block_type,
138
+ num_layers=layers_per_block,
139
+ in_channels=input_channel,
140
+ out_channels=output_channel,
141
+ temb_channels=time_embed_dim,
142
+ add_downsample=not is_final_block,
143
+ resnet_eps=norm_eps,
144
+ resnet_act_fn=act_fn,
145
+ resnet_groups=norm_num_groups,
146
+ cross_attention_dim=cross_attention_dim,
147
+ attn_num_head_channels=attention_head_dim,
148
+ downsample_padding=downsample_padding,
149
+ )
150
+ self.down_blocks.append(down_block)
151
+
152
+ # mid
153
+ self.mid_block = UNetMidBlock2DCrossAttn(
154
+ in_channels=block_out_channels[-1],
155
+ temb_channels=time_embed_dim,
156
+ resnet_eps=norm_eps,
157
+ resnet_act_fn=act_fn,
158
+ output_scale_factor=mid_block_scale_factor,
159
+ resnet_time_scale_shift="default",
160
+ cross_attention_dim=cross_attention_dim,
161
+ attn_num_head_channels=attention_head_dim,
162
+ resnet_groups=norm_num_groups,
163
+ )
164
+
165
+ # count how many layers upsample the images
166
+ self.num_upsamplers = 0
167
+
168
+ # up
169
+ reversed_block_out_channels = list(reversed(block_out_channels))
170
+ output_channel = reversed_block_out_channels[0]
171
+ for i, up_block_type in enumerate(up_block_types):
172
+ is_final_block = i == len(block_out_channels) - 1
173
+
174
+ prev_output_channel = output_channel
175
+ output_channel = reversed_block_out_channels[i]
176
+ input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
177
+
178
+ # add upsample block for all BUT final layer
179
+ if not is_final_block:
180
+ add_upsample = True
181
+ self.num_upsamplers += 1
182
+ else:
183
+ add_upsample = False
184
+
185
+ up_block = get_up_block(
186
+ up_block_type,
187
+ num_layers=layers_per_block + 1,
188
+ in_channels=input_channel,
189
+ out_channels=output_channel,
190
+ prev_output_channel=prev_output_channel,
191
+ temb_channels=time_embed_dim,
192
+ add_upsample=add_upsample,
193
+ resnet_eps=norm_eps,
194
+ resnet_act_fn=act_fn,
195
+ resnet_groups=norm_num_groups,
196
+ cross_attention_dim=cross_attention_dim,
197
+ attn_num_head_channels=attention_head_dim,
198
+ )
199
+ self.up_blocks.append(up_block)
200
+ prev_output_channel = output_channel
201
+
202
+ # out
203
+ self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps)
204
+ self.conv_act = nn.SiLU()
205
+ self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, 3, padding=1)
206
+
207
+ def set_attention_slice(self, slice_size):
208
+ if slice_size is not None and self.config.attention_head_dim % slice_size != 0:
209
+ raise ValueError(
210
+ f"Make sure slice_size {slice_size} is a divisor of "
211
+ f"the number of heads used in cross_attention {self.config.attention_head_dim}"
212
+ )
213
+ if slice_size is not None and slice_size > self.config.attention_head_dim:
214
+ raise ValueError(
215
+ f"Chunk_size {slice_size} has to be smaller or equal to "
216
+ f"the number of heads used in cross_attention {self.config.attention_head_dim}"
217
+ )
218
+
219
+ for block in self.down_blocks:
220
+ if hasattr(block, "attentions") and block.attentions is not None:
221
+ block.set_attention_slice(slice_size)
222
+
223
+ self.mid_block.set_attention_slice(slice_size)
224
+
225
+ for block in self.up_blocks:
226
+ if hasattr(block, "attentions") and block.attentions is not None:
227
+ block.set_attention_slice(slice_size)
228
+
229
+ def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
230
+ for block in self.down_blocks:
231
+ if hasattr(block, "attentions") and block.attentions is not None:
232
+ block.set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
233
+
234
+ self.mid_block.set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
235
+
236
+ for block in self.up_blocks:
237
+ if hasattr(block, "attentions") and block.attentions is not None:
238
+ block.set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
239
+
240
+ def _set_gradient_checkpointing(self, module, value=False):
241
+ if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D, CrossAttnUpBlock2D, UpBlock2D)):
242
+ module.gradient_checkpointing = value
243
+
244
+ def forward(
245
+ self,
246
+ sample: torch.FloatTensor,
247
+ timestep: Union[torch.Tensor, float, int],
248
+ index,
249
+ encoder_hidden_states: torch.Tensor,
250
+ attn_map: Union[torch.Tensor],
251
+ cfg,
252
+ return_dict: bool = True,
253
+ ) -> Union[UNet2DConditionOutput, Tuple]:
254
+ r"""
255
+ Args:
256
+ sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor
257
+ timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps
258
+ encoder_hidden_states (`torch.FloatTensor`): (batch, sequence_length, cross_attention_dim) encoder hidden states
259
+ return_dict (`bool`, *optional*, defaults to `True`):
260
+ Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
261
+
262
+ Returns:
263
+ [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
264
+ [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When
265
+ returning a tuple, the first element is the sample tensor.
266
+ """
267
+ # By default samples have to be AT least a multiple of the overall upsampling factor.
268
+ # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
269
+ # However, the upsampling interpolation output size can be forced to fit any upsampling size
270
+ # on the fly if necessary.
271
+ device = sample.get_device() if sample.is_cuda else 'cpu'
272
+ default_overall_up_factor = 2**self.num_upsamplers
273
+
274
+ # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
275
+ forward_upsample_size = False
276
+ upsample_size = None
277
+
278
+ if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
279
+ logger.info("Forward upsample size to force interpolation output size.")
280
+         forward_upsample_size = True
+
+         # 0. center input if necessary
+         if self.config.center_input_sample:
+             sample = 2 * sample - 1.0
+
+         # 1. time
+         timesteps = timestep
+         if not torch.is_tensor(timesteps):
+             # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+             timesteps = torch.tensor([timesteps], dtype=torch.long, device=sample.device)
+         elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0:
+             timesteps = timesteps[None].to(sample.device)
+
+         # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+         timesteps = timesteps.expand(sample.shape[0])
+
+         t_emb = self.time_proj(timesteps)
+
+         # timesteps does not contain any weights and will always return f32 tensors
+         # but time_embedding might actually be running in fp16. so we need to cast here.
+         # there might be better ways to encapsulate this.
+         t_emb = t_emb.to(dtype=self.dtype)
+         emb = self.time_embedding(t_emb)
+         # attn_map_uncond, attn_map = attn_map_integrated.chunk(2)
+         # 2. pre-process
+         sample = self.conv_in(sample)
+
+         # 3. down
+         attn_down = []
+         value_down = []
+         down_block_res_samples = (sample,)
+         for block_idx, downsample_block in enumerate(self.down_blocks):
+             if hasattr(downsample_block, "attentions") and downsample_block.attentions is not None:
+                 if block_idx < 5:
+                     sample, res_samples, cross_atten_prob, save_value = downsample_block(
+                         hidden_states=sample,
+                         temb=emb,
+                         encoder_hidden_states=encoder_hidden_states,
+                         attn_map=attn_map['attn_down'][index][block_idx] if index < cfg.training.down_attn_map else None,
+                         attn_shift=True if index < cfg.training.down_attn_shift else False,
+                         obj_ids=cfg.inference.obj_ids if 'obj_ids' in cfg.inference else None,
+                         relationship=cfg.inference.relationship if 'relationship' in cfg.inference else None
+                     )
+                 else:
+                     sample, res_samples, cross_atten_prob, save_value = downsample_block(
+                         hidden_states=sample,
+                         temb=emb,
+                         encoder_hidden_states=encoder_hidden_states,
+                         attn_map=None
+                     )
+                 attn_down.append(cross_atten_prob)
+                 value_down.append(save_value)
+             else:
+                 sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
+
+             down_block_res_samples += res_samples
+
+         # 4. mid
+         sample, attn_mid, value_mid = self.mid_block(
+             sample, index, emb, encoder_hidden_states=encoder_hidden_states,
+             attn_map=attn_map['attn_mid'][index] if index < cfg.training.mid_attn_map else None,
+             attn_shift=True if index < cfg.training.mid_attn_shift else False, attn_map_step=cfg.training.mid_attn_map,
+             obj_ids=cfg.inference.obj_ids if 'obj_ids' in cfg.inference else None,
+             relationship=cfg.inference.relationship if 'relationship' in cfg.inference else None
+         )
+
+         # 5. up
+         attn_up = []
+         value_up = []
+         for i, upsample_block in enumerate(self.up_blocks):
+             is_final_block = i == len(self.up_blocks) - 1
+
+             res_samples = down_block_res_samples[-len(upsample_block.resnets):]
+             down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+
+             # if we have not reached the final block and need to forward the
+             # upsample size, we do it here
+             if not is_final_block and forward_upsample_size:
+                 upsample_size = down_block_res_samples[-1].shape[2:]
+
+             if hasattr(upsample_block, "attentions") and upsample_block.attentions is not None:
+                 sample, cross_atten_prob, save_value = upsample_block(
+                     hidden_states=sample,
+                     temb=emb,
+                     res_hidden_states_tuple=res_samples,
+                     encoder_hidden_states=encoder_hidden_states,
+                     upsample_size=upsample_size,
+                     attn_shift=True if index < cfg.training.up_attn_shift else False,
+                     attn_map=attn_map['attn_up'][index][i - 1] if index < cfg.training.up_attn_map else None,
+                     obj_ids=cfg.inference.obj_ids if 'obj_ids' in cfg.inference else None,
+                     relationship=cfg.inference.relationship if 'relationship' in cfg.inference else None
+                 )
+                 attn_up.append(cross_atten_prob)
+                 value_up.append(save_value)
+             else:
+                 sample = upsample_block(
+                     hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size
+                 )
+         # 6. post-process
+         sample = self.conv_norm_out(sample)
+         sample = self.conv_act(sample)
+         sample = self.conv_out(sample)
+
+         if not return_dict:
+             return (sample,)
+
+         return UNet2DConditionOutput(sample=sample), attn_up, attn_mid, attn_down, value_up, value_mid, value_down
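For reference, every entry collected into attn_down, attn_mid, and attn_up above is a cross-attention probability map from the corresponding block, and the compute_ca_loss helper added in utils.py below consumes the mid and up maps. Judging from how that helper indexes the tensors, each map is expected to have shape (2 * batch, H * W, num_text_tokens), with the second half along the batch dimension being the conditional branch of classifier-free guidance. A minimal sketch of that contract, with all concrete sizes chosen purely for illustration:

import math
import torch

batch, num_tokens, res = 2, 77, 16          # illustrative sizes, not taken from the model config
attn_map_integrated = torch.rand(2 * batch, res * res, num_tokens)  # [uncond; cond] stacked on dim 0

attn_map = attn_map_integrated.chunk(2)[1]  # keep the conditional half, as compute_ca_loss does
b, i, j = attn_map.shape
H = W = int(math.sqrt(i))                   # spatial resolution recovered from the flattened axis
token_map = attn_map[:, :, 5].reshape(b, H, W)  # spatial attention map for a single text token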
utils.py ADDED
@@ -0,0 +1,76 @@
+ import torch
+ import math
+ def compute_ca_loss(attn_maps_mid, attn_maps_up, bboxes, object_positions):
+     loss = 0
+     object_number = len(bboxes)
+     if object_number == 0:
+         return torch.tensor(0).float().cuda()
+     for attn_map_integrated in attn_maps_mid:
+         attn_map = attn_map_integrated.chunk(2)[1]
+
+         #
+         b, i, j = attn_map.shape
+         H = W = int(math.sqrt(i))
+         for obj_idx in range(object_number):
+             obj_loss = 0
+             mask = torch.zeros(size=(H, W)).cuda()
+             for obj_box in bboxes[obj_idx]:
+
+                 x_min, y_min, x_max, y_max = int(obj_box[0] * W), \
+                     int(obj_box[1] * H), int(obj_box[2] * W), int(obj_box[3] * H)
+                 mask[y_min: y_max, x_min: x_max] = 1
+
+             for obj_position in object_positions[obj_idx]:
+                 ca_map_obj = attn_map[:, :, obj_position].reshape(b, H, W)
+
+                 activation_value = (ca_map_obj * mask).reshape(b, -1).sum(dim=-1) / ca_map_obj.reshape(b, -1).sum(dim=-1)
+
+                 obj_loss += torch.mean((1 - activation_value) ** 2)
+             loss += (obj_loss / len(object_positions[obj_idx]))
+
+         # compute loss on padding tokens
+         # activation_value = torch.zeros(size=(b, )).cuda()
+         # for obj_idx in range(object_number):
+         #     bbox = bboxes[obj_idx]
+         #     ca_map_obj = attn_map[:, :, padding_start:].reshape(b, H, W, -1)
+         #     activation_value += ca_map_obj[:, int(bbox[0] * H): int(bbox[1] * H),
+         #                         int(bbox[2] * W): int(bbox[3] * W), :].reshape(b, -1).sum(dim=-1) / ca_map_obj.reshape(b, -1).sum(dim=-1)
+         #
+         # loss += torch.mean((1 - activation_value) ** 2)
+
+
+     for attn_map_integrated in attn_maps_up[0]:
+         attn_map = attn_map_integrated.chunk(2)[1]
+         #
+         b, i, j = attn_map.shape
+         H = W = int(math.sqrt(i))
+
+         for obj_idx in range(object_number):
+             obj_loss = 0
+             mask = torch.zeros(size=(H, W)).cuda()
+             for obj_box in bboxes[obj_idx]:
+                 x_min, y_min, x_max, y_max = int(obj_box[0] * W), \
+                     int(obj_box[1] * H), int(obj_box[2] * W), int(obj_box[3] * H)
+                 mask[y_min: y_max, x_min: x_max] = 1
+
+             for obj_position in object_positions[obj_idx]:
+                 ca_map_obj = attn_map[:, :, obj_position].reshape(b, H, W)
+                 # ca_map_obj = attn_map[:, :, object_positions[obj_position]].reshape(b, H, W)
+
+                 activation_value = (ca_map_obj * mask).reshape(b, -1).sum(dim=-1) / ca_map_obj.reshape(b, -1).sum(
+                     dim=-1)
+
+                 obj_loss += torch.mean((1 - activation_value) ** 2)
+             loss += (obj_loss / len(object_positions[obj_idx]))
+
+         # compute loss on padding tokens
+         # activation_value = torch.zeros(size=(b, )).cuda()
+         # for obj_idx in range(object_number):
+         #     bbox = bboxes[obj_idx]
+         #     ca_map_obj = attn_map[:, :, padding_start:].reshape(b, H, W, -1)
+         #     activation_value += ca_map_obj[:, int(bbox[0] * H): int(bbox[1] * H),
+         #                         int(bbox[2] * W): int(bbox[3] * W), :].reshape(b, -1).sum(dim=-1) / ca_map_obj.reshape(b, -1).sum(dim=-1)
+         #
+         # loss += torch.mean((1 - activation_value) ** 2)
+     loss = loss / (object_number * (len(attn_maps_up[0]) + len(attn_maps_mid)))
+     return loss
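To make the expected inputs concrete, here is a minimal, hypothetical invocation of compute_ca_loss with random attention maps. The sizes, box coordinates, and token index are illustrative only, and a CUDA device is required because the function allocates its masks with .cuda():

import torch

from utils import compute_ca_loss  # the helper added above

batch, num_tokens, res = 1, 77, 16   # assumed sizes: CLIP text length 77, 16x16 attention maps

# Doubled batch: first half unconditional, second half conditional (classifier-free guidance layout).
mid_maps = [torch.rand(2 * batch, res * res, num_tokens, device="cuda", requires_grad=True)]
up_maps = [[torch.rand(2 * batch, res * res, num_tokens, device="cuda", requires_grad=True)]]

bboxes = [[[0.1, 0.2, 0.5, 0.8]]]    # one object, one normalized (x_min, y_min, x_max, y_max) box
object_positions = [[2]]             # index of that object's token in the prompt

loss = compute_ca_loss(mid_maps, up_maps, bboxes, object_positions)
loss.backward()                      # gradients flow back into the attention maps
print(loss.item())

In the real pipeline these maps come from the modified UNet forward pass shown earlier, and the gradient of this loss is typically taken with respect to the latents so that attention is steered toward the requested boxes.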