vidit98 committed on
Commit
0d2dd65
1 Parent(s): 9541d96

update code

.gitattributes CHANGED
@@ -32,3 +32,37 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ assets/examples/ian.jpeg filter=lfs diff=lfs merge=lfs -text
+ assets/examples/resized_anm_38.jpg filter=lfs diff=lfs merge=lfs -text
+ assets/examples/anm_8.jpg filter=lfs diff=lfs merge=lfs -text
+ assets/examples/house.jpeg filter=lfs diff=lfs merge=lfs -text
+ assets/examples/door2.jpeg filter=lfs diff=lfs merge=lfs -text
+ assets/examples/door.jpeg filter=lfs diff=lfs merge=lfs -text
+ assets/examples/frn_38.jpg filter=lfs diff=lfs merge=lfs -text
+ assets/examples/park.webp filter=lfs diff=lfs merge=lfs -text
+ assets/examples/car1.webp filter=lfs diff=lfs merge=lfs -text
+ assets/examples/car.jpeg filter=lfs diff=lfs merge=lfs -text
+ assets/examples/house2.jpeg filter=lfs diff=lfs merge=lfs -text
+ assets/examples/Lancia.webp filter=lfs diff=lfs merge=lfs -text
+ assets/examples/obj_11.jpg filter=lfs diff=lfs merge=lfs -text
+ assets/examples/resized_anm_8.jpg filter=lfs diff=lfs merge=lfs -text
+ assets/examples/resized_frn_38.jpg filter=lfs diff=lfs merge=lfs -text
+ assets/examples/resized_obj_11.jpg filter=lfs diff=lfs merge=lfs -text
+ assets/examples/dog.jpeg filter=lfs diff=lfs merge=lfs -text
+ assets/examples/grasslands-national-park.jpeg filter=lfs diff=lfs merge=lfs -text
+ assets/examples/resized_obj_38.jpg filter=lfs diff=lfs merge=lfs -text
+ assets/examples/chair1.jpeg filter=lfs diff=lfs merge=lfs -text
+ assets/examples/chair.jpeg filter=lfs diff=lfs merge=lfs -text
+ assets/examples/obj_38.jpg filter=lfs diff=lfs merge=lfs -text
+ assets/examples/ran.webp filter=lfs diff=lfs merge=lfs -text
+ assets/examples/anm_38.jpg filter=lfs diff=lfs merge=lfs -text
+ assets/examples/carpet2.webp filter=lfs diff=lfs merge=lfs -text
+ assets/ironman.webp filter=lfs diff=lfs merge=lfs -text
+ assets/truck2.jpeg filter=lfs diff=lfs merge=lfs -text
+ assets/truck.png filter=lfs diff=lfs merge=lfs -text
+ assets/ski.jpg filter=lfs diff=lfs merge=lfs -text
+ assets/Teaser_Small.png filter=lfs diff=lfs merge=lfs -text
+ assets/examples filter=lfs diff=lfs merge=lfs -text
+ assets/GIF.gif filter=lfs diff=lfs merge=lfs -text
+ assets/hulk.jpeg filter=lfs diff=lfs merge=lfs -text
+ assets/lava.jpg filter=lfs diff=lfs merge=lfs -text
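Note: every entry added above follows the "<path> filter=lfs diff=lfs merge=lfs -text" pattern that "git lfs track" writes into .gitattributes. As a hedged illustration (not part of this commit), a small helper could register a folder of assets the same way; the directory name and extension list below are assumptions:

# Hypothetical helper (illustration only): register binary assets with Git LFS so
# that .gitattributes gains one "<path> filter=lfs diff=lfs merge=lfs -text" line
# per file, matching the entries added in this commit.
import subprocess
from pathlib import Path

def track_assets(asset_dir: str = "assets/examples") -> None:
    exts = {".jpg", ".jpeg", ".png", ".webp", ".gif"}   # assumed extension list
    for path in sorted(Path(asset_dir).iterdir()):
        if path.suffix.lower() in exts:
            # "git lfs track" appends the pattern to .gitattributes if it is missing.
            subprocess.run(["git", "lfs", "track", str(path)], check=True)

if __name__ == "__main__":
    track_assets()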
app.py CHANGED
@@ -1,429 +1,484 @@
1
-
2
-
3
- import einops
4
  import gradio as gr
5
- import numpy as np
6
- import torch
7
- import random
8
- import os
9
- import subprocess
10
- import shlex
11
-
12
- from huggingface_hub import hf_hub_url, hf_hub_download
13
- from share import *
14
-
15
- from pytorch_lightning import seed_everything
16
- from annotator.util import resize_image, HWC3
17
- from annotator.OneFormer import OneformerSegmenter
18
- from cldm.model import create_model, load_state_dict
19
- from cldm.ddim_hacked import DDIMSamplerSpaCFG
20
- from ldm.models.autoencoder import DiagonalGaussianDistribution
21
-
22
- urls = {
23
- 'shi-labs/oneformer_coco_swin_large': ['150_16_swin_l_oneformer_coco_100ep.pth'],
24
- 'PAIR/PAIR-diffusion-sdv15-coco-finetune': ['pair_diffusion_epoch62.ckpt']
25
- }
26
-
27
- WTS_DICT = {
28
-
29
- }
30
-
31
- if os.path.exists('checkpoints') == False:
32
- os.mkdir('checkpoints')
33
- for repo in urls:
34
- files = urls[repo]
35
- for file in files:
36
- url = hf_hub_url(repo, file)
37
- name_ckp = url.split('/')[-1]
38
- WTS_DICT[repo] = hf_hub_download(repo_id=repo, filename=file, token=os.environ.get("ACCESS_TOKEN"))
39
-
40
- print(WTS_DICT)
41
- apply_segmentor = OneformerSegmenter(WTS_DICT['shi-labs/oneformer_coco_swin_large'])
42
-
43
- model = create_model('./configs/sap_fixed_hintnet_v15.yaml').cpu()
44
- model.load_state_dict(load_state_dict(WTS_DICT['PAIR/PAIR-diffusion-sdv15-coco-finetune'], location='cuda'))
45
- model = model.cuda()
46
- ddim_sampler = DDIMSamplerSpaCFG(model)
47
- _COLORS = []
48
- save_memory = False
49
-
50
- def gen_color():
51
- color = tuple(np.round(np.random.choice(range(256), size=3), 3))
52
- if color not in _COLORS and np.mean(color) != 0.0:
53
- _COLORS.append(color)
54
- else:
55
- gen_color()
56
-
57
-
58
- for _ in range(300):
59
- gen_color()
60
-
61
 
62
- class ImageComp:
63
- def __init__(self, edit_operation):
64
- self.input_img = None
65
- self.input_pmask = None
66
- self.input_segmask = None
67
 
68
- self.ref_img = None
69
- self.ref_pmask = None
70
- self.ref_segmask = None
71
-
72
- self.H = None
73
- self.W = None
74
- self.baseoutput = None
75
- self.kernel = np.ones((5, 5), np.uint8)
76
- self.edit_operation = edit_operation
77
-
78
- def init_input_canvas(self, img):
79
- img = HWC3(img)
80
- img = resize_image(img, 512)
81
- detected_mask = apply_segmentor(img, 'panoptic')[0]
82
- detected_seg = apply_segmentor(img, 'semantic')
83
 
84
- self.input_img = img
85
- self.input_pmask = detected_mask
86
- self.input_segmask = detected_seg
87
- self.H = img.shape[0]
88
- self.W = img.shape[1]
89
-
90
- detected_mask = detected_mask.cpu().numpy()
91
 
92
- uni = np.unique(detected_mask)
93
- color_mask = np.zeros((detected_mask.shape[0], detected_mask.shape[1], 3))
94
- for i in uni:
95
- color_mask[detected_mask == i] = _COLORS[i]
96
 
97
- output = color_mask*0.8 + img * 0.2
98
- self.baseoutput = output.astype(np.uint8)
99
- return self.baseoutput
100
-
101
- def init_ref_canvas(self, img):
102
- img = HWC3(img)
103
- img = resize_image(img, 512)
104
- detected_mask = apply_segmentor(img, 'panoptic')[0]
105
- detected_seg = apply_segmentor(img, 'semantic')
106
-
107
- self.ref_img = img
108
- self.ref_pmask = detected_mask
109
- self.ref_segmask = detected_seg
110
-
111
- detected_mask = detected_mask.cpu().numpy()
112
-
113
- uni = np.unique(detected_mask)
114
- color_mask = np.zeros((detected_mask.shape[0], detected_mask.shape[1], 3))
115
- for i in uni:
116
- color_mask[detected_mask == i] = _COLORS[i]
117
-
118
- output = color_mask*0.8 + img * 0.2
119
- self.baseoutput = output.astype(np.uint8)
120
- return self.baseoutput
121
-
122
- def _process_mask(self, mask, panoptic_mask, segmask):
123
- panoptic_mask_ = panoptic_mask + 1
124
- mask_ = resize_image(mask['mask'][:, :, 0], min(panoptic_mask.shape))
125
- mask_ = torch.tensor(mask_)
126
- maski = torch.zeros_like(mask_).cuda()
127
- maski[mask_ > 127] = 1
128
- mask = maski * panoptic_mask_
129
- unique_ids, counts = torch.unique(mask, return_counts=True)
130
- mask_id = unique_ids[torch.argmax(counts[1:]) + 1]
131
- final_mask = torch.zeros(mask.shape).cuda()
132
- final_mask[panoptic_mask_ == mask_id] = 1
133
-
134
- obj_class = maski * (segmask + 1)
135
- unique_ids, counts = torch.unique(obj_class, return_counts=True)
136
- obj_class = unique_ids[torch.argmax(counts[1:]) + 1] - 1
137
- return final_mask, obj_class
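Note: the removed _process_mask above resolves a rough user scribble to a single panoptic segment by counting how many marked pixels fall into each segment id and keeping the most frequent non-background bucket. A minimal sketch of that selection step, mirroring the logic shown (tensor names are illustrative, not from the repository):

import torch

def dominant_segment(scribble: torch.Tensor, panoptic: torch.Tensor) -> torch.Tensor:
    # scribble: HxW mask drawn by the user (non-zero where marked)
    # panoptic: HxW integer segment ids produced by the segmenter
    ids = (panoptic + 1) * (scribble > 0).long()        # 0 means "outside the scribble"
    uniq, counts = torch.unique(ids, return_counts=True)
    # skip the first bucket (pixels outside the scribble), keep the most covered id
    winner = uniq[torch.argmax(counts[1:]) + 1] - 1
    return (panoptic == winner).float()                 # binary mask of the chosen segment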
138
-
139
-
140
- def _edit_app(self, input_mask, ref_mask, whole_ref):
141
- input_pmask = self.input_pmask
142
- input_segmask = self.input_segmask
143
 
144
- if whole_ref:
145
- reference_mask = torch.ones(self.ref_pmask.shape).cuda()
146
- else:
147
- reference_mask, _ = self._process_mask(ref_mask, self.ref_pmask, self.ref_segmask)
148
 
149
- edit_mask, _ = self._process_mask(input_mask, self.input_pmask, self.input_segmask)
150
- ma = torch.max(input_pmask)
151
- input_pmask[edit_mask == 1] = ma + 1
152
- return reference_mask, input_pmask, input_segmask, edit_mask, ma
153
 
154
-
155
- def _edit(self, input_mask, ref_mask, whole_ref=False, inter=1):
156
- input_img = (self.input_img/127.5 - 1)
157
- input_img = torch.from_numpy(input_img.astype(np.float32)).cuda().unsqueeze(0).permute(0,3,1,2)
158
 
159
- reference_img = (self.ref_img/127.5 - 1)
160
- reference_img = torch.from_numpy(reference_img.astype(np.float32)).cuda().unsqueeze(0).permute(0,3,1,2)
161
 
162
- reference_mask, input_pmask, input_segmask, region_mask, ma = self._edit_app(input_mask, ref_mask, whole_ref)
 
163
 
164
- input_pmask = input_pmask.float().cuda().unsqueeze(0).unsqueeze(1)
165
- _, mean_feat_inpt, one_hot_inpt, empty_mask_flag_inpt = model.get_appearance(input_img, input_pmask, return_all=True)
 
 
 
166
 
167
- reference_mask = reference_mask.float().cuda().unsqueeze(0).unsqueeze(1)
168
- _, mean_feat_ref, _, _ = model.get_appearance(reference_img, reference_mask, return_all=True)
 
 
 
 
169
 
170
- if mean_feat_ref.shape[1] > 1:
171
- mean_feat_inpt[:, ma + 1] = (1 - inter) * mean_feat_inpt[:, ma + 1] + inter*mean_feat_ref[:, 1]
172
 
173
- splatted_feat = torch.einsum('nmc, nmhw->nchw', mean_feat_inpt, one_hot_inpt)
174
- appearance = torch.nn.functional.normalize(splatted_feat) #l2 normaliz
175
- input_segmask = ((input_segmask+1)/ 127.5 - 1.0).cuda().unsqueeze(0).unsqueeze(1)
176
- structure = torch.nn.functional.interpolate(input_segmask, (self.H, self.W))
177
- appearance = torch.nn.functional.interpolate(appearance, (self.H, self.W))
178
 
 
179
 
180
- return structure, appearance, region_mask, input_img
181
 
182
- def process(self, input_mask, ref_mask, prompt, a_prompt, n_prompt,
183
- num_samples, ddim_steps, guess_mode, strength,
184
- scale_s, scale_f, scale_t, seed, eta, masking=True,whole_ref=False,inter=1):
185
- structure, appearance, mask, img = self._edit(input_mask, ref_mask,
186
- whole_ref=whole_ref, inter=inter)
187
-
188
- null_structure = torch.zeros(structure.shape).cuda() - 1
189
- null_appearance = torch.zeros(appearance.shape).cuda()
190
-
191
- null_control = torch.cat([null_structure, null_appearance], dim=1)
192
- structure_control = torch.cat([structure, null_appearance], dim=1)
193
- full_control = torch.cat([structure, appearance], dim=1)
194
-
195
- null_control = torch.cat([null_control for _ in range(num_samples)], dim=0)
196
- structure_control = torch.cat([structure_control for _ in range(num_samples)], dim=0)
197
- full_control = torch.cat([full_control for _ in range(num_samples)], dim=0)
198
-
199
- #Masking for local edit
200
- if not masking:
201
- mask, x0 = None, None
202
- else:
203
- x0 = model.encode_first_stage(img)
204
- x0 = x0.sample() if isinstance(x0, DiagonalGaussianDistribution) else x0 # todo: check if we can set random number
205
- x0 = x0 * model.scale_factor
206
- mask = 1 - torch.tensor(mask).unsqueeze(0).unsqueeze(1).cuda()
207
- mask = torch.nn.functional.interpolate(mask, x0.shape[2:]).float()
208
-
209
- if seed == -1:
210
- seed = random.randint(0, 65535)
211
- seed_everything(seed)
212
 
213
- scale = [scale_s, scale_f, scale_t]
214
- print(scale)
215
- if save_memory:
216
- model.low_vram_shift(is_diffusing=False)
217
- # uc_cross = model.get_unconditional_conditioning(num_samples)
218
- uc_cross = model.get_learned_conditioning([n_prompt] * num_samples)
219
- cond = {"c_concat": [full_control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]}
220
- un_cond = {"c_concat": None if guess_mode else [null_control], "c_crossattn": [uc_cross]}
221
- un_cond_struct = {"c_concat": None if guess_mode else [structure_control], "c_crossattn": [uc_cross]}
222
- un_cond_struct_app = {"c_concat": None if guess_mode else [full_control], "c_crossattn": [uc_cross]}
223
 
224
- shape = (4, self.H // 8, self.W // 8)
 
225
 
226
- if save_memory:
227
- model.low_vram_shift(is_diffusing=True)
228
 
229
- model.control_scales = [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01
230
- samples, _ = ddim_sampler.sample(ddim_steps, num_samples,
231
- shape, cond, verbose=False, eta=eta,
232
- unconditional_guidance_scale=scale, mask=mask, x0=x0,
233
- unconditional_conditioning=[un_cond, un_cond_struct, un_cond_struct_app ])
234
 
235
- if save_memory:
236
- model.low_vram_shift(is_diffusing=False)
237
 
238
- x_samples = (model.decode_first_stage(samples) + 1) * 127.5
239
- x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c')).cpu().numpy().clip(0, 255).astype(np.uint8)
240
 
241
- results = [x_samples[i] for i in range(num_samples)]
242
- return [] + results
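Note: the removed process() passes a list of three guidance scales ([scale_s, scale_f, scale_t]) together with three unconditional conditionings to DDIMSamplerSpaCFG. The sampler itself is not shown in this diff, so the following is only a plausible sketch of how such a nested classifier-free guidance list is commonly combined; the epsilon arguments are hypothetical noise predictions for the successively richer conditions:

def spa_cfg(eps_uncond, eps_struct, eps_struct_app, eps_full, scales):
    # scales = [scale_s, scale_f, scale_t]; each term adds guidance for one extra
    # condition (structure, then appearance, then text) on top of the previous one.
    s_s, s_f, s_t = scales
    return (eps_uncond
            + s_s * (eps_struct - eps_uncond)          # structure guidance
            + s_f * (eps_struct_app - eps_struct)      # appearance guidance
            + s_t * (eps_full - eps_struct_app))       # text guidance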
 
 
 
 
243
 
244
 
245
- def init_input_canvas_wrapper(obj, *args):
246
- return obj.init_input_canvas(*args)
247
 
248
- def init_ref_canvas_wrapper(obj, *args):
249
- return obj.init_ref_canvas(*args)
 
 
 
 
 
250
 
251
- def process_wrapper(obj, *args):
252
- return obj.process(*args)
 
253
 
 
 
 
254
 
 
255
 
256
- css = """
257
- h1 {
258
- text-align: center;
259
- }
260
- .container {
261
- display: flex;
262
- justify-content: space-between
263
- }
264
-
265
- img {
266
- max-width: 100%
267
- padding-right: 100px;
268
- }
269
-
270
- .image {
271
- flex-basis: 40%
272
-
273
- }
274
-
275
- .text {
276
- font-size: 15px;
277
- padding-right: 20px;
278
- padding-left: 0px;
279
- }
280
- """
281
 
282
- def create_app_demo():
283
 
 
284
  with gr.Row():
285
- gr.Markdown("## Object Level Appearance Editing")
286
  with gr.Row():
287
  gr.HTML(
288
  """
289
- <div class="container">
290
- <div class="text">
291
- <h4> Instructions </h4>
292
- <ol>
293
- <li>Upload an Input Image.</li>
294
- <li>Mark one of segmented objects in the <i>Select Object to Edit</i> tab.</li>
295
- <li>Upload an Reference Image.</li>
296
- <li>Mark one of segmented objects in the <i>Select Reference Object</i> tab, for the reference appearance.</li>
297
- <li>Enter a prompt and press <i>Run</i> button. (A very simple would also work) </li>
298
- </ol>
299
- </div>
300
- <div class="image">
301
- <img src="file/assets/GIF.gif" width="400"">
302
- </div>
303
- </div>
304
- """)
305
  with gr.Column():
306
  with gr.Row():
307
  img_edit = gr.State(ImageComp('edit_app'))
308
  with gr.Column():
309
- btn1 = gr.Button("Input Image")
310
  input_image = gr.Image(source='upload', label='Input Image', type="numpy",)
311
  with gr.Column():
312
- btn2 = gr.Button("Select Object to Edit")
313
- input_mask = gr.Image(source="upload", label='Select Object in Input Image', type="numpy", tool="sketch")
314
- input_image.change(fn=init_input_canvas_wrapper, inputs=[img_edit, input_image], outputs=[input_mask], queue=False)
315
-
316
- # with gr.Row():
317
- with gr.Column():
318
- btn3 = gr.Button("Reference Image")
319
- ref_img = gr.Image(source='upload', label='Reference Image', type="numpy")
320
- with gr.Column():
321
- btn4 = gr.Button("Select Reference Object")
322
- reference_mask = gr.Image(source="upload", label='Select Object in Refernce Image', type="numpy", tool="sketch")
323
 
324
- ref_img.change(fn=init_ref_canvas_wrapper, inputs=[img_edit, ref_img], outputs=[reference_mask], queue=False)
325
-
326
  with gr.Row():
327
- prompt = gr.Textbox(label="Prompt", value='A picture of truck')
328
- with gr.Column():
329
- interpolation = gr.Slider(label="Mixing ratio of appearance from reference object", minimum=0.1, maximum=1, value=1.0, step=0.1)
330
- whole_ref = gr.Checkbox(label='Use whole reference Image for appearance (Only useful for style transfers)', value=False)
331
  with gr.Row():
332
  run_button = gr.Button(label="Run")
 
333
 
334
  with gr.Row():
335
  result_gallery = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=4, height='auto')
336
 
337
  with gr.Accordion("Advanced options", open=False):
338
- num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1)
 
339
  strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01)
340
  guess_mode = gr.Checkbox(label='Guess Mode', value=False)
341
  ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1)
342
- scale_t = gr.Slider(label="Guidance Scale Text", minimum=0.1, maximum=30.0, value=9.0, step=0.1)
343
- scale_f = gr.Slider(label="Guidance Scale Appearance", minimum=0.1, maximum=30.0, value=8.0, step=0.1)
344
- scale_s = gr.Slider(label="Guidance Scale Structure", minimum=0.1, maximum=30.0, value=5.0, step=0.1)
 
345
  seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True)
346
  eta = gr.Number(label="eta (DDIM)", value=0.0)
347
  masking = gr.Checkbox(label='Only edit the local region', value=True)
348
  a_prompt = gr.Textbox(label="Added Prompt", value='best quality, extremely detailed')
349
  n_prompt = gr.Textbox(label="Negative Prompt",
350
  value='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality')
351
-
 
 
 
352
  with gr.Column():
353
  gr.Examples(
354
- examples=[['A picture of a truck', 'assets/truck.png','assets/truck2.jpeg', 892905419, 9, 7.6, 4.3],
355
- ['A picture of a ironman', 'assets/ironman.webp','assets/hulk.jpeg', 709736989, 9, 7.7, 8.1],
356
- ['A person skiing', 'assets/ski.jpg','assets/lava.jpg', 917723061, 9, 7.5, 4.4]],
357
- inputs=[prompt, input_image, ref_img, seed, scale_t, scale_f, scale_s],
358
  outputs=None,
359
  fn=None,
360
  cache_examples=False,
361
  )
362
- ips = [input_mask, reference_mask, prompt, a_prompt, n_prompt, num_samples, ddim_steps, guess_mode, strength,
363
- scale_s, scale_f, scale_t, seed, eta, masking, whole_ref, interpolation]
 
 
364
  run_button.click(fn=process_wrapper, inputs=[img_edit, *ips], outputs=[result_gallery])
 
365
 
366
 
367
-
368
- def create_struct_demo():
369
  with gr.Row():
370
- gr.Markdown("## Edit Structure (Comming soon!)")
371
-
372
- def create_both_demo():
373
  with gr.Row():
374
- gr.Markdown("## Edit Structure and Appearance Together (Comming soon!)")
376
 
377
 
378
- block = gr.Blocks(css=css).queue()
379
  with block:
380
  gr.HTML(
381
  """
382
  <div style="text-align: center; max-width: 1200px; margin: 20px auto;">
383
  <h1 style="font-weight: 900; font-size: 3rem; margin: 0rem">
384
- PAIR Diffusion
385
  </h1>
386
- <h2 style="font-weight: 450; font-size: 1rem; margin-top: 0.8rem">
387
- <a href="https://vidit98.github.io/" style="color:blue;">Vidit Goel</a><sup>1*</sup>,
388
- <a href="https://helia95.github.io/" style="color:blue;">Elia Peruzzo</a><sup>1,2*</sup>,
389
- <a href="https://yifanjiang19.github.io/" style="color:blue;">Yifan Jiang</a><sup>3</sup>,
390
- <a href="https://ir1d.github.io/" style="color:blue;">Dejia Xu</a><sup>3</sup>,
391
- <a href="http://disi.unitn.it/~sebe/" style="color:blue;">Nicu Sebe</a><sup>2</sup>, <br>
392
- <a href=" https://people.eecs.berkeley.edu/~trevor/" style="color:blue;">Trevor Darrell</a><sup>4</sup>,
393
- <a href="https://vita-group.github.io/" style="color:blue;">Zhangyang Wang</a><sup>1,3</sup>
394
- and <a href="https://www.humphreyshi.com/home" style="color:blue;">Humphrey Shi</a> <sup>1,5,6</sup> <br>
395
- [<a href="https://arxiv.org/abs/2303.17546" style="color:red;">arXiv</a>]
396
- [<a href="https://github.com/Picsart-AI-Research/PAIR-Diffusion" style="color:red;">GitHub</a>]
397
- </h2>
398
- <h3 style="font-weight: 450; font-size: 1rem; margin: 0rem">
399
- <sup>1</sup>Picsart AI Resarch (PAIR), <sup>2</sup>UTrenton, <sup>3</sup>UT Austin, <sup>4</sup>UC Berkeley, <sup>5</sup>UOregon, <sup>6</sup>UIUC
400
- </h3>
401
  <h2 style="font-weight: 450; font-size: 1rem; margin-top: 0.8rem; margin-bottom: 0.8rem">
402
- We built Structure and Appearance Paired (PAIR) Diffusion that allows reference image-guided appearance manipulation and
403
- structure editing of an image at an object level. PAIR diffusion models an image as composition of multiple objects and enables control
404
- over structure and appearance properties of the object. Describing object appearances using text can be challenging and ambiguous, PAIR Diffusion
405
- enables a user to control the appearance of an object using images. User can further use text as another degree of control for appearance.
406
- Having fine-grained control over appearance and structure at object level can be beneficial for future works in video and 3D beside image editing,
407
- where we need to have consistent appearance across time in case of video or across various viewing positions in case of 3D.
408
  </h2>
409
-
410
  </div>
411
  """)
412
 
413
- gr.HTML("""
414
- <p>For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings.
415
- <br/>
416
- <a href="https://huggingface.co/spaces/PAIR/PAIR-Diffusion?duplicate=true">
417
- <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
418
- </p>""")
419
-
420
  with gr.Tab('Edit Appearance'):
421
  create_app_demo()
422
- with gr.Tab('Edit Structure'):
423
- create_struct_demo()
424
- with gr.Tab('Edit Both'):
425
- create_both_demo()
426
-
 
427
 
428
  block.queue(max_size=20)
429
- block.launch(debug=True)
 
 
 
 
 
1
  import gradio as gr
2
+ from pair_diff_demo import ImageComp
3
 
4
+ # torch.cuda.set_per_process_memory_fraction(0.6)
 
 
 
 
5
 
6
+ def init_input_canvas_wrapper(obj, *args):
7
+ return obj.init_input_canvas(*args)
8
 
9
+ def init_ref_canvas_wrapper(obj, *args):
10
+ return obj.init_ref_canvas(*args)
 
 
 
 
 
11
 
12
+ def select_input_object_wrapper(obj, evt: gr.SelectData):
13
+ return obj.select_input_object(evt)
 
 
14
 
15
+ def select_ref_object_wrapper(obj, evt: gr.SelectData):
16
+ return obj.select_ref_object(evt)
17
 
18
+ def process_wrapper(obj, *args):
19
+ return obj.process(*args)
 
 
20
 
21
+ def set_multi_modal_wrapper(obj, *args):
22
+ return obj.set_multi_modal(*args)
 
 
23
 
24
+ def save_result_wrapper(obj, *args):
25
+ return obj.save_result(*args)
 
 
26
 
27
+ def return_input_img_wrapper(obj):
28
+ return obj.return_input_img()
29
 
30
+ def get_caption_wrapper(obj, *args):
31
+ return obj.get_caption(*args)
32
 
33
+ def multimodal_params(b):
34
+ if b:
35
+ return 10, 3, 6
36
+ else:
37
+ return 6, 8, 9
38
 
39
+ theme = gr.themes.Soft(
40
+ primary_hue="purple",
41
+ font_mono=[gr.themes.GoogleFont("IBM Plex Mono"), "ui-monospace", "Consolas", 'monospace'],
42
+ ).set(
43
+ block_label_background_fill_dark='*neutral_800'
44
+ )
45
 
 
 
46
 
47
+ css = """
48
+ #customized_imbox {
49
+ min-height: 450px;
50
+ }
51
+ #customized_imbox>div[data-testid="image"] {
52
+ min-height: 450px;
53
+ }
54
+ #customized_imbox>div[data-testid="image"]>div {
55
+ min-height: 450px;
56
+ }
57
+ #customized_imbox>div[data-testid="image"]>iframe {
58
+ min-height: 450px;
59
+ }
60
+ #customized_imbox>div.unpadded_box {
61
+ min-height: 450px;
62
+ }
63
+ #myinst {
64
+ font-size: 0.8rem;
65
+ margin: 0rem;
66
+ color: #6B7280;
67
+ }
68
+ #maskinst {
69
+ text-align: justify;
70
+ min-width: 1200px;
71
+ }
72
+ #maskinst>img {
73
+ min-width:399px;
74
+ max-width:450px;
75
+ vertical-align: top;
76
+ display: inline-block;
77
+ }
78
+ #maskinst:after {
79
+ content: "";
80
+ width: 100%;
81
+ display: inline-block;
82
+ }
83
+ """
84
 
85
+ def create_app_demo():
86
 
87
+ with gr.Row():
88
+ gr.Markdown("## Object Level Appearance Editing")
89
+ with gr.Row():
90
+ gr.HTML(
91
+ """
92
+ <div style="text-align: left; max-width: 1200px;">
93
+ <h3 style="font-weight: 450; font-size: 1rem; margin-top: 0.8rem; margin-bottom: 0.8rem">
94
+ Instructions </h3>
95
+ <ol>
96
+ <li>Upload an Input Image.</li>
97
+ <li>Mark one of segmented objects in the <i>Select Object to Edit</i> tab.</li>
98
+ <li>Upload an Reference Image.</li>
99
+ <li>Mark one of segmented objects in the <i>Select Reference Object</i> tab, whose appearance needs to used in the selected input object.</li>
100
+ <li>Enter a prompt and press <i>Run</i> button. (A very simple would also work) </li>
101
+ </ol>
102
+ </ol>
103
+ </div>""")
104
+ with gr.Column():
105
+ with gr.Row():
106
+ img_edit = gr.State(ImageComp('edit_app'))
107
+ with gr.Column():
108
+ input_image = gr.Image(source='upload', label='Input Image', type="numpy",)
109
+ with gr.Column():
110
+ input_mask = gr.Image(source="upload", label='Select Object in Input Image', type="numpy",)
111
+
112
+ with gr.Column():
113
+ ref_img = gr.Image(source='upload', label='Reference Image', type="numpy")
114
+ with gr.Column():
115
+ reference_mask = gr.Image(source="upload", label='Select Object in Refernce Image', type="numpy")
116
 
117
+ with gr.Row():
118
+ with gr.Column():
119
+ prompt = gr.Textbox(label="Prompt", value='A picture of truck')
120
+ mulitmod = gr.Checkbox(label='Multi-Modal', value=False)
121
 
122
+ mulitmod.change(fn=set_multi_modal_wrapper, inputs=[img_edit, mulitmod])
 
124
+ input_image.change(fn=init_input_canvas_wrapper, inputs=[img_edit, input_image], outputs=[input_image], show_progress=True)
125
+ input_image.select(fn=select_input_object_wrapper, inputs=[img_edit], outputs=[input_mask, prompt])
126
 
127
+ ref_img.change(fn=init_ref_canvas_wrapper, inputs=[img_edit, ref_img], outputs=[ref_img], show_progress=True)
128
+ ref_img.select(fn=select_ref_object_wrapper, inputs=[img_edit], outputs=[reference_mask])
129
 
130
+ with gr.Column():
131
+ interpolation = gr.Slider(label="Mixing ratio of appearance from reference object", minimum=0.1, maximum=1, value=1.0, step=0.1)
132
+ whole_ref = gr.Checkbox(label='Use whole reference Image for appearance (Only useful for style transfers)', visible=False)
133
+
134
+ # clear_button.click(fn=img_edit.clear_points, inputs=[], outputs=[input_mask, reference_mask])
135
 
136
+ with gr.Row():
137
+ run_button = gr.Button(label="Run")
138
+ save_button = gr.Button("Save")
139
+
140
+ with gr.Row():
141
+ result_gallery = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=4, height='auto')
142
+
143
+ with gr.Accordion("Advanced options", open=False):
144
+ num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=4, step=1)
145
+ image_resolution = gr.Slider(label="Image Resolution", minimum=512, maximum=512, value=512, step=64)
146
+ strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01)
147
+ guess_mode = gr.Checkbox(label='Guess Mode', value=False)
148
+ ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1)
149
+ scale_t = gr.Slider(label="Guidance Scale Text", minimum=0., maximum=30.0, value=6.0, step=0.1)
150
+ scale_f = gr.Slider(label="Guidance Scale Appearance", minimum=0., maximum=30.0, value=8.0, step=0.1)
151
+ scale_s = gr.Slider(label="Guidance Scale Structure", minimum=0., maximum=30.0, value=9.0, step=0.1)
152
+ seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True)
153
+ eta = gr.Number(label="eta (DDIM)", value=0.0)
154
+ masking = gr.Checkbox(label='Only edit the local region', value=True)
155
+ a_prompt = gr.Textbox(label="Added Prompt", value='best quality, extremely detailed')
156
+ n_prompt = gr.Textbox(label="Negative Prompt",
157
+ value='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality')
158
+ dil = gr.Slider(label="Merging region around Edge", minimum=0, maximum=0, value=0, step=0)
159
+
160
+ with gr.Column():
161
+ gr.Examples(
162
+ examples=[['assets/examples/car.jpeg','assets/examples/ian.jpeg', '', 709736989, 6, 8, 9],
163
+ ['assets/examples/ian.jpeg','assets/examples/car.jpeg', '', 709736989, 6, 8, 9],
164
+ ['assets/examples/car.jpeg','assets/examples/ran.webp', '', 709736989, 6, 8, 9],
165
+ ['assets/examples/car.jpeg','assets/examples/car1.webp', '', 709736989, 6, 8, 9],
166
+ ['assets/examples/car1.webp','assets/examples/car.jpeg', '', 709736989, 6, 8, 9],
167
+ ['assets/examples/chair.jpeg','assets/examples/chair1.jpeg', '', 1106204668, 6, 8, 9],
168
+ ['assets/examples/house.jpeg','assets/examples/house2.jpeg', '', 1106204668, 6, 8, 9],
169
+ ['assets/examples/house2.jpeg','assets/examples/house.jpeg', '', 1106204668, 6, 8, 9],
170
+ ['assets/examples/park.webp','assets/examples/grasslands-national-park.jpeg', '', 1106204668, 6, 8, 9],
171
+ ['assets/examples/door.jpeg','assets/examples/door2.jpeg', '', 709736989, 6, 8, 9]],
172
+ inputs=[input_image, ref_img, prompt, seed, scale_t, scale_f, scale_s],
173
+ cache_examples=False,
174
+ )
175
 
176
+ mulitmod.change(fn=multimodal_params, inputs=[mulitmod], outputs=[scale_t, scale_f, scale_s])
 
177
 
178
+ ips = [input_mask, reference_mask, prompt, a_prompt, n_prompt, num_samples, ddim_steps, guess_mode, strength,
179
+ scale_s, scale_f, scale_t, seed, eta, dil, masking, whole_ref, interpolation]
180
+ ips_save = [input_mask, prompt, a_prompt, n_prompt, ddim_steps,
181
+ scale_s, scale_f, scale_t, seed, dil, interpolation]
182
+ run_button.click(fn=process_wrapper, inputs=[img_edit, *ips], outputs=[result_gallery])
183
+ save_button.click(fn=save_result_wrapper, inputs=[img_edit, *ips_save])
184
 
185
 
186
+ def create_add_obj_demo():
187
+ with gr.Row():
188
+ gr.Markdown("## Add Objects to Image")
189
+ with gr.Row():
190
+ gr.HTML(
191
+ """
192
+ <div style="text-align: left; max-width: 1200px;">
193
+ <h3 style="font-weight: 450; font-size: 1rem; margin-top: 0.8rem; margin-bottom: 0.8rem">
194
+ Instructions </h3>
195
+ <ol>
196
+ <li> Upload an Input Image.</li>
197
+ <li>Draw the precise shape of object in the image where you want to add object in <i>Draw Object</i> tab.</li>
198
+ <li>Upload an Reference Image.</li>
199
+ <li>Click on the object in the Reference Image tab that you want to add in the Input Image.</li>
200
+ <li>Enter a prompt and press <i>Run</i> button. (A very simple would also work) </li>
201
+ </ol>
202
+ </ol>
203
+ </div>""")
204
+ with gr.Column():
205
+ with gr.Row():
206
+ img_edit = gr.State(ImageComp('add_obj'))
207
+ with gr.Column():
208
+ input_image = gr.Image(source='upload', label='Input Image', type="numpy",)
209
+ with gr.Column():
210
+ input_mask = gr.Image(source="upload", label='Draw the desired Object', type="numpy", tool="sketch")
211
 
212
+ input_image.change(fn=init_input_canvas_wrapper, inputs=[img_edit, input_image], outputs=[input_image])
213
+ input_image.change(fn=return_input_img_wrapper, inputs=[img_edit], outputs=[input_mask], queue=False)
214
+
215
+ with gr.Column():
216
+ ref_img = gr.Image(source='upload', label='Reference Image', type="numpy")
217
+ with gr.Column():
218
+ reference_mask = gr.Image(source="upload", label='Selected Object in Refernce Image', type="numpy")
219
 
220
+ ref_img.change(fn=init_ref_canvas_wrapper, inputs=[img_edit, ref_img], outputs=[ref_img], queue=False)
221
+ # ref_img.upload(fn=img_edit.init_ref_canvas, inputs=[ref_img], outputs=[ref_img])
222
+ ref_img.select(fn=select_ref_object_wrapper, inputs=[img_edit], outputs=[reference_mask])
223
 
224
+ with gr.Row():
225
+ prompt = gr.Textbox(label="Prompt", value='A picture of truck')
226
+ mulitmod = gr.Checkbox(label='Multi-Modal', value=False, visible=False)
227
 
228
+ mulitmod.change(fn=set_multi_modal_wrapper, inputs=[img_edit, mulitmod])
229
 
230
+ with gr.Row():
231
+ run_button = gr.Button(label="Run")
232
+ save_button = gr.Button("Save")
233
+
234
+ with gr.Row():
235
+ result_gallery = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=4, height='auto')
236
+
237
+ with gr.Accordion("Advanced options", open=False):
238
+ num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=4, step=1)
239
+ # image_resolution = gr.Slider(label="Image Resolution", minimum=512, maximum=512, value=512, step=64)
240
+ strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01)
241
+ guess_mode = gr.Checkbox(label='Guess Mode', value=False)
242
+ ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1)
243
+ dil = gr.Slider(label="Merging region around Edge", minimum=0, maximum=5, value=2, step=1)
244
+ scale_t = gr.Slider(label="Guidance Scale Text", minimum=0., maximum=30.0, value=6.0, step=0.1)
245
+ scale_f = gr.Slider(label="Guidance Scale Appearance", minimum=0., maximum=30.0, value=8.0, step=0.1)
246
+ scale_s = gr.Slider(label="Guidance Scale Structure", minimum=0., maximum=30.0, value=9.0, step=0.1)
247
+ seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True)
248
+ eta = gr.Number(label="eta (DDIM)", value=0.0)
249
+ masking = gr.Checkbox(label='Only edit the local region', value=True)
250
+ a_prompt = gr.Textbox(label="Added Prompt", value='best quality, extremely detailed')
251
+ n_prompt = gr.Textbox(label="Negative Prompt",
252
+ value='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality')
253
+
254
+ mulitmod.change(fn=multimodal_params, inputs=[mulitmod], outputs=[scale_t, scale_f, scale_s])
255
 
256
+ with gr.Column():
257
+ gr.Examples(
258
+ examples=[['assets/examples/chair.jpeg','assets/examples/carpet2.webp', 'A picture of living room with carpet', 892905419, 6, 8, 9],
259
+ ['assets/examples/chair.jpeg','assets/examples/chair1.jpeg', 'A picture of living room with a orange and white sofa', 892905419, 6, 8, 9],
260
+ ['assets/examples/park.webp','assets/examples/dog.jpeg', 'A picture of dog in the park', 892905419, 6, 8, 9]],
261
+ inputs=[input_image, ref_img, prompt, seed, scale_t, scale_f, scale_s],
262
+ outputs=None,
263
+ fn=None,
264
+ cache_examples=False,
265
+ )
266
+ ips = [input_mask, reference_mask, prompt, a_prompt, n_prompt, num_samples, ddim_steps, guess_mode, strength,
267
+ scale_s, scale_f, scale_t, seed, eta, dil, masking]
268
+ ips_save = [input_mask, prompt, a_prompt, n_prompt, ddim_steps,
269
+ scale_s, scale_f, scale_t, seed, dil]
270
+ run_button.click(fn=process_wrapper, inputs=[img_edit, *ips], outputs=[result_gallery])
271
+ save_button.click(fn=save_result_wrapper, inputs=[img_edit, *ips_save])
272
 
273
+ def create_obj_variation_demo():
274
  with gr.Row():
275
+ gr.Markdown("## Objects Variation")
276
  with gr.Row():
277
  gr.HTML(
278
  """
279
+ <div style="text-align: left; max-width: 1200px;">
280
+ <h3 style="font-weight: 450; font-size: 1rem; margin-top: 0.8rem; margin-bottom: 0.8rem">
281
+ Instructions </h3>
282
+ <ol>
283
+ <li> Upload an Input Image.</li>
284
+ <li>Click on object to have variations</li>
285
+ <li>Press <i>Run</i> button</li>
286
+ </ol>
287
+ </ol>
288
+ </div>""")
289
+
 
 
 
 
 
290
  with gr.Column():
291
  with gr.Row():
292
  img_edit = gr.State(ImageComp('edit_app'))
293
  with gr.Column():
 
294
  input_image = gr.Image(source='upload', label='Input Image', type="numpy",)
295
  with gr.Column():
296
+ input_mask = gr.Image(source="upload", label='Select Object in Input Image', type="numpy",)
297
 
 
 
298
  with gr.Row():
299
+ prompt = gr.Textbox(label="Prompt", value='')
300
+ mulitmod = gr.Checkbox(label='Multi-Modal', value=False)
301
+
302
+
303
+ mulitmod.change(fn=set_multi_modal_wrapper, inputs=[img_edit, mulitmod])
304
+
305
+ input_image.change(fn=init_input_canvas_wrapper, inputs=[img_edit, input_image], outputs=[input_image])
306
+ input_image.select(fn=select_input_object_wrapper, inputs=[img_edit], outputs=[input_mask, prompt])
307
+ input_image.change(fn=init_ref_canvas_wrapper, inputs=[img_edit, input_image], outputs=[], queue=False)
308
+ input_image.select(fn=select_ref_object_wrapper, inputs=[img_edit], outputs=[])
309
+
310
  with gr.Row():
311
  run_button = gr.Button(label="Run")
312
+ save_button = gr.Button("Save")
313
 
314
  with gr.Row():
315
  result_gallery = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=4, height='auto')
316
 
317
  with gr.Accordion("Advanced options", open=False):
318
+ num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=4, step=2)
319
+ # image_resolution = gr.Slider(label="Image Resolution", minimum=512, maximum=512, value=512, step=64)
320
  strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01)
321
  guess_mode = gr.Checkbox(label='Guess Mode', value=False)
322
  ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1)
323
+ dil = gr.Slider(label="Merging region around Edge", minimum=0, maximum=5, value=2, step=1)
324
+ scale_t = gr.Slider(label="Guidance Scale Text", minimum=0.0, maximum=30.0, value=6.0, step=0.1)
325
+ scale_f = gr.Slider(label="Guidance Scale Appearance", minimum=0.0, maximum=30.0, value=8.0, step=0.1)
326
+ scale_s = gr.Slider(label="Guidance Scale Structure", minimum=0.0, maximum=30.0, value=9.0, step=0.1)
327
  seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True)
328
  eta = gr.Number(label="eta (DDIM)", value=0.0)
329
  masking = gr.Checkbox(label='Only edit the local region', value=True)
330
  a_prompt = gr.Textbox(label="Added Prompt", value='best quality, extremely detailed')
331
  n_prompt = gr.Textbox(label="Negative Prompt",
332
  value='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality')
333
+
334
+
335
+ mulitmod.change(fn=multimodal_params, inputs=[mulitmod], outputs=[scale_t, scale_f, scale_s])
336
+
337
  with gr.Column():
338
  gr.Examples(
339
+ examples=[['assets/examples/chair.jpeg' , 892905419, 6, 8, 9],
340
+ ['assets/examples/chair1.jpeg', 892905419, 6, 8, 9],
341
+ ['assets/examples/park.webp', 892905419, 6, 8, 9],
342
+ ['assets/examples/car.jpeg', 709736989, 6, 8, 9],
343
+ ['assets/examples/ian.jpeg', 709736989, 6, 8, 9],
344
+ ['assets/examples/chair.jpeg', 1106204668, 6, 8, 9],
345
+ ['assets/examples/door.jpeg', 709736989, 6, 8, 9],
346
+ ['assets/examples/carpet2.webp', 892905419, 6, 8, 9],
347
+ ['assets/examples/house.jpeg', 709736989, 6, 8, 9],
348
+ ['assets/examples/house2.jpeg', 709736989, 6, 8, 9],],
349
+ inputs=[input_image, seed, scale_t, scale_f, scale_s],
350
  outputs=None,
351
  fn=None,
352
  cache_examples=False,
353
  )
354
+ ips = [input_mask, input_mask, prompt, a_prompt, n_prompt, num_samples, ddim_steps, guess_mode, strength,
355
+ scale_s, scale_f, scale_t, seed, eta, dil, masking]
356
+ ips_save = [input_mask, prompt, a_prompt, n_prompt, ddim_steps,
357
+ scale_s, scale_f, scale_t, seed, dil]
358
  run_button.click(fn=process_wrapper, inputs=[img_edit, *ips], outputs=[result_gallery])
359
+ save_button.click(fn=save_result_wrapper, inputs=[img_edit, *ips_save])
360
 
361
 
362
+ def create_free_form_obj_variation_demo():
 
363
  with gr.Row():
364
+ gr.Markdown("## Objects Variation")
 
 
365
  with gr.Row():
366
+ gr.HTML(
367
+ """
368
+ <div style="text-align: left; max-width: 1200px;">
369
+ <h3 style="font-weight: 450; font-size: 1rem; margin-top: 0.8rem; margin-bottom: 0.8rem">
370
+ Instructions </h3>
371
+ <ol>
372
+ <li> Upload an Input Image.</li>
373
+ <li>Mask the region that you want to have variation</li>
374
+ <li>Press <i>Run</i> button</li>
375
+ </ol>
376
+ </ol>
377
+ </div>""")
378
 
379
+ with gr.Column():
380
+ with gr.Row():
381
+ img_edit = gr.State(ImageComp('edit_app'))
382
+ with gr.Column():
383
+ input_image = gr.Image(source='upload', label='Input Image', type="numpy", )
384
+ with gr.Column():
385
+ input_mask = gr.Image(source="upload", label='Select Object in Input Image', type="numpy", tool="sketch")
386
+
387
+ with gr.Row():
388
+ prompt = gr.Textbox(label="Prompt", value='')
389
+ ignore_structure = gr.Checkbox(label='Ignore Structure (Please provide a good caption)', visible=False)
390
+ mulitmod = gr.Checkbox(label='Multi-Modal', value=False)
391
+
392
+ mulitmod.change(fn=set_multi_modal_wrapper, inputs=[img_edit, mulitmod])
393
+
394
+ input_image.change(fn=init_input_canvas_wrapper, inputs=[img_edit, input_image], outputs=[input_mask])
395
+ input_mask.edit(fn=get_caption_wrapper, inputs=[img_edit, input_mask], outputs=[prompt])
396
+ input_image.change(fn=init_ref_canvas_wrapper, inputs=[img_edit, input_image], outputs=[], queue=False)
397
+ # input_image.select(fn=select_ref_object_wrapper, inputs=[img_edit], outputs=[])
398
+
399
+ # input_image.edit(fn=img_edit.vis_mask, inputs=[input_image], outputs=[input_mask])
400
+
401
+ with gr.Row():
402
+ run_button = gr.Button(label="Run")
403
+ save_button = gr.Button("Save")
404
+
405
+ with gr.Row():
406
+ result_gallery = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=4, height='auto')
407
+
408
+ with gr.Accordion("Advanced options", open=False):
409
+ num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=4, step=2)
410
+ # image_resolution = gr.Slider(label="Image Resolution", minimum=512, maximum=512, value=512, step=64)
411
+ strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01)
412
+ guess_mode = gr.Checkbox(label='Guess Mode', value=False)
413
+ ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1)
414
+ dil = gr.Slider(label="Merging region around Edge", minimum=0, maximum=5, value=2, step=1)
415
+ scale_t = gr.Slider(label="Guidance Scale Text", minimum=0.0, maximum=30.0, value=6.0, step=0.1)
416
+ scale_f = gr.Slider(label="Guidance Scale Appearance", minimum=0.0, maximum=30.0, value=8.0, step=0.1)
417
+ scale_s = gr.Slider(label="Guidance Scale Structure", minimum=0.0, maximum=30.0, value=9.0, step=0.1)
418
+ seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True)
419
+ eta = gr.Number(label="eta (DDIM)", value=0.0)
420
+ masking = gr.Checkbox(label='Only edit the local region', value=True)
421
+ free_form_obj_var = gr.Checkbox(label='', value=True)
422
+ a_prompt = gr.Textbox(label="Added Prompt", value='best quality, extremely detailed')
423
+ n_prompt = gr.Textbox(label="Negative Prompt",
424
+ value='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality')
425
+ interpolation = gr.Slider(label="Mixing ratio of appearance from reference object", minimum=0.0, maximum=0.1, step=0.1)
426
+
427
+ mulitmod.change(fn=multimodal_params, inputs=[mulitmod], outputs=[scale_t, scale_f, scale_s])
428
+
429
+ with gr.Column():
430
+ gr.Examples(
431
+ examples=[['assets/examples/chair.jpeg' , 892905419, 6, 8, 9],
432
+ ['assets/examples/chair1.jpeg', 892905419, 6, 8, 9],
433
+ ['assets/examples/park.webp', 892905419, 6, 8, 9],
434
+ ['assets/examples/car.jpeg', 709736989, 6, 8, 9],
435
+ ['assets/examples/ian.jpeg', 709736989, 6, 8, 9],
436
+ ['assets/examples/chair.jpeg', 1106204668, 6, 8, 9],
437
+ ['assets/examples/door.jpeg', 709736989, 6, 8, 9],
438
+ ['assets/examples/carpet2.webp', 892905419, 6, 8, 9],
439
+ ['assets/examples/house.jpeg', 709736989, 6, 8, 9],
440
+ ['assets/examples/house2.jpeg', 709736989, 6, 8, 9],],
441
+ inputs=[input_image, seed, scale_t, scale_f, scale_s],
442
+ outputs=None,
443
+ fn=None,
444
+ cache_examples=False,
445
+ )
446
+ ips = [input_mask, input_mask, prompt, a_prompt, n_prompt, num_samples, ddim_steps, guess_mode, strength,
447
+ scale_s, scale_f, scale_t, seed, eta, dil, masking, free_form_obj_var, dil, free_form_obj_var, ignore_structure]
448
+ ips_save = [input_mask, prompt, a_prompt, n_prompt, ddim_steps,
449
+ scale_s, scale_f, scale_t, seed, dil, interpolation, free_form_obj_var]
450
+ run_button.click(fn=process_wrapper, inputs=[img_edit, *ips], outputs=[result_gallery])
451
+ save_button.click(fn=save_result_wrapper, inputs=[img_edit, *ips_save])
452
 
453
 
454
+ block = gr.Blocks(css=css, theme=theme).queue()
455
  with block:
456
  gr.HTML(
457
  """
458
  <div style="text-align: center; max-width: 1200px; margin: 20px auto;">
459
  <h1 style="font-weight: 900; font-size: 3rem; margin: 0rem">
460
+ PAIR Diffusion: A Comprehensive Multimodal Object-Level Image Editor
461
  </h1>
462
+ <h3 style="margin-top: 0.6rem; margin-bottom: 1rem">Picsart AI Research</h3>
463
  <h2 style="font-weight: 450; font-size: 1rem; margin-top: 0.8rem; margin-bottom: 0.8rem">
464
+ PAIR diffusion provides comprehensive multi-modal editing capabilities to edit real images without the need of inverting them. The current suite contains
465
+ <span style="color: #01feee;">Object Variation</span>, <span style="color: #4f82d9;">Edit Appearance of any object using a reference image and text</span>,
466
+ <span style="color: #d402bf;">Add any object from a reference image in the input image</span>. This operations can be mixed with each other to
467
+ develop new editing operations in future.
468
+ </ul>
 
469
  </h2>
 
470
  </div>
471
  """)
472
 
 
 
 
 
 
 
 
473
  with gr.Tab('Edit Appearance'):
474
  create_app_demo()
475
+ with gr.Tab('Object Variation Free Form Mask'):
476
+ create_free_form_obj_variation_demo()
477
+ with gr.Tab('Object Variation'):
478
+ create_obj_variation_demo()
479
+ with gr.Tab('Add Objects'):
480
+ create_add_obj_demo()
481
 
482
  block.queue(max_size=20)
483
+ block.launch(share=True)
484
+
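Note: throughout the new app.py, per-session editor state lives in gr.State(ImageComp(...)) and every event handler is a thin module-level wrapper that receives that state object as its first input. A stripped-down sketch of the same pattern, with a hypothetical Editor class standing in for ImageComp:

import gradio as gr

class Editor:
    # Hypothetical stand-in for ImageComp: holds per-session data between callbacks.
    def __init__(self):
        self.image = None

    def set_image(self, img):
        self.image = img
        return img

def set_image_wrapper(editor, img):
    # Gradio passes the gr.State value as an ordinary positional input.
    return editor.set_image(img)

with gr.Blocks() as demo:
    state = gr.State(Editor())            # each browser session gets its own copy
    inp = gr.Image(type="numpy", label="Input")
    out = gr.Image(type="numpy", label="Preview")
    inp.change(fn=set_image_wrapper, inputs=[state, inp], outputs=[out])

# demo.launch()  # uncomment to serve the sketch locally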
assets/GIF.gif CHANGED

Git LFS Details

  • SHA256: e720b8c82526a982014b3eee781ba5d1a42c104e380444c536e4bbee21101a65
  • Pointer size: 131 Bytes
  • Size of remote file: 370 kB
assets/Teaser_Small.png ADDED

Git LFS Details

  • SHA256: dc29a44a9ddd8ec91b114b09b1229b1eb8d0740874a93e1a7d9ff92d7327b0b1
  • Pointer size: 131 Bytes
  • Size of remote file: 862 kB
assets/examples/Lancia.webp ADDED

Git LFS Details

  • SHA256: 628010d440fafc6d5e61691b543e7dd59bc11c76ec0d48b36890a96c22abc8a4
  • Pointer size: 131 Bytes
  • Size of remote file: 148 kB
assets/examples/car.jpeg ADDED

Git LFS Details

  • SHA256: 71a73a4ec6eab9e075eaa59879a884e0f663ad28d548ce8cf2e604166346874e
  • Pointer size: 131 Bytes
  • Size of remote file: 101 kB
assets/examples/car1.webp ADDED

Git LFS Details

  • SHA256: c66b53e2f266d68f964574ba9d51dd70dbab478b63905a993f3784beb67bd3b7
  • Pointer size: 131 Bytes
  • Size of remote file: 122 kB
assets/examples/carpet2.webp ADDED

Git LFS Details

  • SHA256: 0bc055513cfcbfae7320e829fb84697c5a9d649edecd17652ddf54a77522af3c
  • Pointer size: 130 Bytes
  • Size of remote file: 69.4 kB
assets/examples/chair.jpeg ADDED

Git LFS Details

  • SHA256: d9d0040bcd7275bea283432c43abb69cdbe1fc32ff04c3994153d780057791b8
  • Pointer size: 132 Bytes
  • Size of remote file: 1.03 MB
assets/examples/chair1.jpeg ADDED

Git LFS Details

  • SHA256: 55af4d3b00a2ec95638bc4703adbca8409cf58ae19342326d0c0eac191179dcb
  • Pointer size: 129 Bytes
  • Size of remote file: 6.49 kB
assets/examples/dog.jpeg ADDED

Git LFS Details

  • SHA256: aa2d7acb2a06243b753d56306f45e22aae6e5b02bdf966ee7466ba517153cc11
  • Pointer size: 131 Bytes
  • Size of remote file: 278 kB
assets/examples/door.jpeg ADDED

Git LFS Details

  • SHA256: 424cc31f30b29060b8869c2ccb62c2f60010088b0fc3e9d8ea53d04fc21dbfbe
  • Pointer size: 130 Bytes
  • Size of remote file: 46.9 kB
assets/examples/door2.jpeg ADDED

Git LFS Details

  • SHA256: 32022ee30272376935e44df622d478c13d68686071ada7fe60ae36ffe44167da
  • Pointer size: 131 Bytes
  • Size of remote file: 540 kB
assets/examples/grasslands-national-park.jpeg ADDED

Git LFS Details

  • SHA256: 26690739225241d173d04b21661809dda464e59ac2af8da73178883094e508b6
  • Pointer size: 130 Bytes
  • Size of remote file: 66.1 kB
assets/examples/house.jpeg ADDED

Git LFS Details

  • SHA256: 89268b6097908e97cc8a56df3824ac6c589d86e415d2b1954e65255f1eddb595
  • Pointer size: 131 Bytes
  • Size of remote file: 194 kB
assets/examples/house2.jpeg ADDED

Git LFS Details

  • SHA256: 753bbfa58d471be54f23b029d6764bd8682f77ff5c7f375b71ac8a10cb28342b
  • Pointer size: 131 Bytes
  • Size of remote file: 101 kB
assets/examples/ian.jpeg ADDED

Git LFS Details

  • SHA256: b59d82f9b8cd2cc5a7864d85a1b0b51ad381c13370c6b70210b4ad1a267a9478
  • Pointer size: 131 Bytes
  • Size of remote file: 386 kB
assets/examples/park.webp ADDED

Git LFS Details

  • SHA256: d65c17257c64a793fa92311c91e2d9f31ec5d704ca644bbf3fb942de9769526e
  • Pointer size: 131 Bytes
  • Size of remote file: 731 kB
assets/examples/ran.webp ADDED

Git LFS Details

  • SHA256: 8f92f72d0a1286ff77cce5cdbd78fd1455fb5041c91ca5729f00055441bae13a
  • Pointer size: 131 Bytes
  • Size of remote file: 183 kB
assets/hulk.jpeg CHANGED

Git LFS Details

  • SHA256: e7b2163b45349d71b40ac92b24e5dfa8559dcce5449c41740bf344d1a445e287
  • Pointer size: 130 Bytes
  • Size of remote file: 76.5 kB
assets/ironman.webp CHANGED

Git LFS Details

  • SHA256: 005c4adf045975ec4a328664e65258078ed90efe3e30bceba863f3c187404cc4
  • Pointer size: 130 Bytes
  • Size of remote file: 94.9 kB
assets/lava.jpg CHANGED

Git LFS Details

  • SHA256: 16cd431ad032a8058f6d6142e2e24d6cc7848837c44df50465085be875c931b3
  • Pointer size: 131 Bytes
  • Size of remote file: 102 kB
assets/ski.jpg CHANGED

Git LFS Details

  • SHA256: c8f11aa5fdcbf78a3647a56a59c0cb4eb0d6da1dc83fc4ad247ad75e347a7476
  • Pointer size: 131 Bytes
  • Size of remote file: 200 kB
assets/truck.png CHANGED

Git LFS Details

  • SHA256: 86a0fa5c1d24bddd54db9631e717bcf56cac4f083e16d399c2495f4c766e4a9c
  • Pointer size: 130 Bytes
  • Size of remote file: 71.1 kB
assets/truck2.jpeg CHANGED

Git LFS Details

  • SHA256: 79ee52c4ab698b0702d34b7f6216db4899dcf08f5402d3e5f61b8e0f6408821a
  • Pointer size: 131 Bytes
  • Size of remote file: 199 kB
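Note: the "Pointer size" figures above refer to the small text pointer Git stores in place of each LFS-tracked binary. A pointer follows the git-lfs spec v1 format (version line, sha256 oid, size in bytes); the sketch below assembles one from the assets/truck2.jpeg values listed above, with the byte size approximated since the listing only reports 199 kB:

def lfs_pointer(oid: str, size_bytes: int) -> str:
    # Git LFS pointer file format (spec v1): three "key value" lines.
    return (
        "version https://git-lfs.github.com/spec/v1\n"
        f"oid sha256:{oid}\n"
        f"size {size_bytes}\n"
    )

# Approximate example using the assets/truck2.jpeg entry shown above.
print(lfs_pointer("79ee52c4ab698b0702d34b7f6216db4899dcf08f5402d3e5f61b8e0f6408821a", 199_000))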
cldm/appearance_networks.py ADDED
@@ -0,0 +1,75 @@
1
+ """
2
+ Neighborhood Attention Transformer.
3
+ https://arxiv.org/abs/2204.07143
4
+
5
+ This source code is licensed under the license found in the
6
+ LICENSE file in the root directory of this source tree.
7
+ """
8
+ import torch
9
+ import torchvision
10
+ import torch.nn as nn
11
+ from timm.models.layers import trunc_normal_, DropPath
12
+ from timm.models.registry import register_model
13
+
14
+
15
+ IMAGENET_MEAN = [0.485, 0.456, 0.406]
16
+ IMAGENET_STD = [0.229, 0.224, 0.225]
17
+
18
+ class VGGPerceptualLoss(torch.nn.Module):
19
+ def __init__(self, resize=True):
20
+ super(VGGPerceptualLoss, self).__init__()
21
+ blocks = []
22
+ blocks.append(torchvision.models.vgg16(pretrained=True).features[:4].eval())
23
+ blocks.append(torchvision.models.vgg16(pretrained=True).features[4:9].eval())
24
+ blocks.append(torchvision.models.vgg16(pretrained=True).features[9:16].eval())
25
+ blocks.append(torchvision.models.vgg16(pretrained=True).features[16:23].eval())
26
+ for bl in blocks:
27
+ for p in bl.parameters():
28
+ p.requires_grad = False
29
+ self.blocks = torch.nn.ModuleList(blocks)
30
+ self.transform = torch.nn.functional.interpolate
31
+ self.resize = resize
32
+ self.register_buffer("mean", torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1))
33
+ self.register_buffer("std", torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1))
34
+
35
+ def forward(self, input, appearance_layers=[0,1,2,3]):
36
+ if input.shape[1] != 3:
37
+ input = input.repeat(1, 3, 1, 1)
38
39
+ input = (input-self.mean) / self.std
40
+ if self.resize:
41
+ input = self.transform(input, mode='bilinear', size=(224, 224), align_corners=False)
42
+ x = input
43
+ feats = []
44
+ for i, block in enumerate(self.blocks):
45
+ x = block(x)
46
+ if i in appearance_layers:
47
+ feats.append(x)
48
+
49
+ return feats
50
+
51
+
52
+ class DINOv2(torch.nn.Module):
53
+ def __init__(self, resize=True, size=224, model_type='dinov2_vitl14'):
54
+ super(DINOv2, self).__init__()
55
+ self.size=size
56
+ self.resize = resize
57
+ self.transform = torch.nn.functional.interpolate
58
+ self.model = torch.hub.load('facebookresearch/dinov2', model_type)
59
+ self.register_buffer("mean", torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1))
60
+ self.register_buffer("std", torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1))
61
+
62
+ def forward(self, input, appearance_layers=[1,2]):
63
+ if input.shape[1] != 3:
64
+ input = input.repeat(1, 3, 1, 1)
65
66
+
67
+ if self.resize:
68
+ input = self.transform(input, mode='bicubic', size=(self.size, self.size), align_corners=False)
69
+ # mean = torch.tensor(IMAGENET_MEAN).view(1, 3, 1, 1).to(input.device)
70
+ # std = torch.tensor(IMAGENET_STD).view(1, 3, 1, 1).to(input.device)
71
+ input = (input-self.mean) / self.std
72
+ feats = self.model.get_intermediate_layers(input, self.model.n_blocks, reshape=True)
73
+ feats = [f.detach() for f in feats]
74
+
75
+ return feats
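Note: these backbones (VGG-16 blocks or DINOv2 intermediate layers) only return dense feature maps; get_appearance in cldm.py further down in this diff turns them into one appearance vector per panoptic segment by one-hot encoding the mask, mean-pooling with einsum, and splatting the means back onto the mask. A self-contained sketch of that pooling step, independent of the backbone (shapes and names are illustrative):

import torch
import torch.nn.functional as F

def per_segment_appearance(feat: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
    # feat: N x C x H x W feature map; mask: N x 1 x h x w integer segment ids
    mask = F.interpolate(mask.float(), size=feat.shape[2:], mode="nearest").long()
    one_hot = F.one_hot(mask[:, 0]).permute(0, 3, 1, 2).float()     # N x M x H x W
    per_pixel = torch.einsum("nchw,nmhw->nmchw", feat, one_hot)     # features per segment
    area = one_hot.sum(dim=(2, 3)).clamp(min=1)                     # pixels per segment
    mean_feat = per_pixel.sum(dim=(3, 4)) / area[..., None]         # N x M x C
    # splat each segment's mean feature back onto its spatial support
    return torch.einsum("nmc,nmhw->nchw", mean_feat, one_hot)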
cldm/cldm.py CHANGED
@@ -10,7 +10,6 @@ from ldm.modules.diffusionmodules.util import (
10
  zero_module,
11
  timestep_embedding,
12
  )
13
- import torchvision
14
  from einops import rearrange, repeat
15
  from torchvision.utils import make_grid
16
  from ldm.modules.attention import SpatialTransformer
@@ -18,46 +17,9 @@ from ldm.modules.diffusionmodules.openaimodel import UNetModel, TimestepEmbedSeq
18
  from ldm.models.diffusion.ddpm import LatentDiffusion
19
  from ldm.util import log_txt_as_img, exists, instantiate_from_config
20
  from ldm.models.diffusion.ddim import DDIMSampler
 
21
 
22
 
23
- class VGGPerceptualLoss(torch.nn.Module):
24
- def __init__(self, resize=True):
25
- super(VGGPerceptualLoss, self).__init__()
26
- blocks = []
27
- vgg_model = torchvision.models.vgg16(pretrained=True)
28
- print('Loaded VGG weights')
29
- blocks.append(vgg_model.features[:4].eval())
30
- blocks.append(vgg_model.features[4:9].eval())
31
- blocks.append(vgg_model.features[9:16].eval())
32
- blocks.append(vgg_model.features[16:23].eval())
33
-
34
- for bl in blocks:
35
- for p in bl.parameters():
36
- p.requires_grad = False
37
- self.blocks = torch.nn.ModuleList(blocks)
38
- self.transform = torch.nn.functional.interpolate
39
- self.resize = resize
40
- self.register_buffer("mean", torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1))
41
- self.register_buffer("std", torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1))
42
- print('Initialized VGG model')
43
-
44
- def forward(self, input, feature_layers=[0, 1, 2, 3], style_layers=[1,]):
45
- if input.shape[1] != 3:
46
- input = input.repeat(1, 3, 1, 1)
47
- target = target.repeat(1, 3, 1, 1)
48
- input = (input-self.mean) / self.std
49
- if self.resize:
50
- input = self.transform(input, mode='bilinear', size=(224, 224), align_corners=False)
51
- x = input
52
- gram_matrices_all = []
53
- feats = []
54
- for i, block in enumerate(self.blocks):
55
- x = block(x)
56
- if i in style_layers:
57
- feats.append(x)
58
-
59
- return feats
60
-
61
 
62
 
63
  class ControlledUnetModel(UNetModel):
@@ -325,6 +287,7 @@ class ControlNet(nn.Module):
325
  def forward(self, x, hint, timesteps, context, **kwargs):
326
  t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
327
  emb = self.time_embed(t_emb)
 
328
  guided_hint = self.input_hint_block(hint, emb, context, x.shape)
329
 
330
  outs = []
@@ -343,57 +306,6 @@ class ControlNet(nn.Module):
343
  outs.append(self.middle_block_out(h, emb, context))
344
 
345
  return outs
346
-
347
- class Interpolate(nn.Module):
348
- def __init__(self, size, mode):
349
- super(Interpolate, self).__init__()
350
- self.interp = torch.nn.functional.interpolate
351
- self.size = size
352
- self.mode = mode
353
- self.factor = 8
354
-
355
- def forward(self, x):
356
- h,w = x.shape[2]//self.factor, x.shape[3]//self.factor
357
- x = self.interp(x, size=(h,w), mode=self.mode)
358
- return x
359
-
360
- class ControlNetSAP(ControlNet):
361
- def __init__(
362
- self,
363
- hint_channels,
364
- model_channels,
365
- input_hint_block='fixed',
366
- size = 64,
367
- mode='nearest',
368
- *args,
369
- **kwargs
370
- ):
371
- super().__init__( hint_channels=hint_channels, model_channels=model_channels, *args, **kwargs)
372
- #hint channels are atleast 128 dims
373
-
374
- if input_hint_block == 'learnable':
375
- ch = 2 ** (int(math.log2(hint_channels)))
376
- self.input_hint_block = TimestepEmbedSequential(
377
- conv_nd(self.dims, hint_channels, hint_channels, 3, padding=1),
378
- nn.SiLU(),
379
- conv_nd(self.dims, hint_channels, 2*ch, 3, padding=1, stride=2),
380
- nn.SiLU(),
381
- conv_nd(self.dims, 2*ch, 2*ch, 3, padding=1),
382
- nn.SiLU(),
383
- conv_nd(self.dims, 2*ch, 2*ch, 3, padding=1, stride=2),
384
- nn.SiLU(),
385
- conv_nd(self.dims, 2*ch, 2*ch, 3, padding=1),
386
- nn.SiLU(),
387
- conv_nd(self.dims, 2*ch, model_channels, 3, padding=1, stride=2),
388
- nn.SiLU(),
389
- zero_module(conv_nd(self.dims, model_channels, model_channels, 3, padding=1))
390
- )
391
- else:
392
- print("Only interpolation")
393
- self.input_hint_block = TimestepEmbedSequential(
394
- Interpolate(size, mode),
395
- zero_module(conv_nd(self.dims, hint_channels, model_channels, 3, padding=1)))
396
-
397
 
398
  class ControlLDM(LatentDiffusion):
399
 
@@ -420,11 +332,11 @@ class ControlLDM(LatentDiffusion):
420
  diffusion_model = self.model.diffusion_model
421
 
422
  cond_txt = torch.cat(cond['c_crossattn'], 1)
423
-
424
  if cond['c_concat'] is None:
425
  eps = diffusion_model(x=x_noisy, timesteps=t, context=cond_txt, control=None, only_mid_control=self.only_mid_control)
426
  else:
427
- control = self.control_model(x=x_noisy, hint=torch.cat(cond['c_concat'], 1), timesteps=t, context=cond_txt)
 
428
  control = [c * scale for c, scale in zip(control, self.control_scales)]
429
  eps = diffusion_model(x=x_noisy, timesteps=t, context=cond_txt, control=control, only_mid_control=self.only_mid_control)
430
 
@@ -443,7 +355,7 @@ class ControlLDM(LatentDiffusion):
443
  use_ddim = ddim_steps is not None
444
 
445
  log = dict()
446
- z, c = self.get_input(batch, self.first_stage_key, bs=N)
447
  c_cat, c = c["c_concat"][0][:N], c["c_crossattn"][0][:N]
448
  N = min(z.shape[0], N)
449
  n_row = min(z.shape[0], n_row)
@@ -498,8 +410,9 @@ class ControlLDM(LatentDiffusion):
498
  @torch.no_grad()
499
  def sample_log(self, cond, batch_size, ddim, ddim_steps, **kwargs):
500
  ddim_sampler = DDIMSampler(self)
501
- b, c, h, w = cond["c_concat"][0].shape
502
- shape = (self.channels, h // 8, w // 8)
 
503
  samples, intermediates = ddim_sampler.sample(ddim_steps, batch_size, shape, cond, verbose=False, **kwargs)
504
  return samples, intermediates
505
 
@@ -525,24 +438,54 @@ class ControlLDM(LatentDiffusion):
525
  self.cond_stage_model = self.cond_stage_model.cuda()
526
 
527
 
528
- class SAP(ControlLDM):
 
529
  @torch.no_grad()
530
- def __init__(self,control_stage_config, control_key, only_mid_control, *args, **kwargs):
 
531
  super().__init__(control_stage_config=control_stage_config,
532
  control_key=control_key,
533
  only_mid_control=only_mid_control,
534
  *args, **kwargs)
535
- self.appearance_net = VGGPerceptualLoss().to(self.device)
536
- print("Loaded VGG model")
537
 
538
- def get_appearance(self, img, mask, return_all=False):
539
  img = (img + 1) * 0.5
540
- feat = self.appearance_net(img)[0]
541
  empty_mask_flag = torch.sum(mask, dim=(1,2,3)) == 0
542
 
543
 
544
  empty_appearance = torch.zeros(feat.shape).to(self.device)
545
- mask = torch.nn.functional.interpolate(mask.float(), (feat.shape[2:])).long()
546
  one_hot = torch.nn.functional.one_hot(mask[:,0]).permute(0,3,1,2).float()
547
 
548
  feat = torch.einsum('nchw, nmhw->nmchw', feat, one_hot)
@@ -552,32 +495,68 @@ class SAP(ControlLDM):
552
  mean_feat[:, 0] = torch.zeros(mean_feat[:,0].shape).to(self.device) #set edges in panoptic mask to empty appearance feature
553
 
554
  splatted_feat = torch.einsum('nmc, nmhw->nchw', mean_feat, one_hot)
555
- splatted_feat[empty_mask_flag] = empty_appearance[empty_mask_flag]
556
  splatted_feat = torch.nn.functional.normalize(splatted_feat) #l2 normalize on c dim
557
 
558
  if return_all:
559
  return splatted_feat, mean_feat, one_hot, empty_mask_flag
560
-
561
  return splatted_feat
562
-
 
563
  def get_input(self, batch, k, bs=None, *args, **kwargs):
564
  z, c, x_orig, x_recon = super(ControlLDM, self).get_input(batch, self.first_stage_key, return_first_stage_outputs=True , *args, **kwargs)
565
  structure = batch['seg'].unsqueeze(1)
566
  mask = batch['mask'].unsqueeze(1).to(self.device)
567
- appearance = self.get_appearance(x_orig, mask)
 
 
 
568
  if bs is not None:
569
  structure = structure[:bs]
570
- appearance = appearance[:bs]
571
-
572
  structure = structure.to(self.device)
573
- appearance = appearance.to(self.device)
574
  structure = structure.to(memory_format=torch.contiguous_format).float()
575
- appearance = appearance.to(memory_format=torch.contiguous_format).float()
576
- structure = torch.nn.functional.interpolate(structure, x_orig.shape[2:])
577
- appearance = torch.nn.functional.interpolate(appearance, x_orig.shape[2:])
578
- control = torch.cat([structure, appearance], dim=1)
579
- return z, dict(c_crossattn=[c], c_concat=[control])
580
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
581
  @torch.no_grad()
582
  def log_images(self, batch, N=4, n_row=2, sample=False, ddim_steps=50, ddim_eta=0.0, return_keys=None,
583
  quantize_denoised=True, inpaint=True, plot_denoise_rows=False, plot_progressive_rows=False,
@@ -588,11 +567,14 @@ class SAP(ControlLDM):
588
 
589
  log = dict()
590
  z, c = self.get_input(batch, self.first_stage_key, bs=N)
591
- c_cat, c = c["c_concat"][0][:N,], c["c_crossattn"][0][:N]
592
  N = min(z.shape[0], N)
593
  n_row = min(z.shape[0], n_row)
594
  log["reconstruction"] = self.decode_first_stage(z)
595
- log["control"] = c_cat[:, :1]
 
 
 
596
  log["conditioning"] = log_txt_as_img((512, 512), batch[self.cond_stage_key], size=16)
597
 
598
  if plot_diffusion_rows:
@@ -634,7 +616,7 @@ class SAP(ControlLDM):
634
 
635
  if unconditional_guidance_scale > 1.0:
636
  uc_cross = self.get_unconditional_conditioning(N)
637
- uc_cat = c_cat # torch.zeros_like(c_cat)
638
  uc_full = {"c_concat": [uc_cat], "c_crossattn": [uc_cross]}
639
  samples_cfg, _ = self.sample_log(cond={"c_concat": [c_cat], "c_crossattn": [c]},
640
  batch_size=N, ddim=use_ddim,
@@ -646,3 +628,18 @@ class SAP(ControlLDM):
646
  log[f"samples_cfg_scale_{unconditional_guidance_scale:.2f}"] = x_samples_cfg
647
 
648
  return log
10
  zero_module,
11
  timestep_embedding,
12
  )
 
13
  from einops import rearrange, repeat
14
  from torchvision.utils import make_grid
15
  from ldm.modules.attention import SpatialTransformer
 
17
  from ldm.models.diffusion.ddpm import LatentDiffusion
18
  from ldm.util import log_txt_as_img, exists, instantiate_from_config
19
  from ldm.models.diffusion.ddim import DDIMSampler
20
+ from cldm.appearance_networks import VGGPerceptualLoss, DINOv2
21
 
22
23
 
24
 
25
  class ControlledUnetModel(UNetModel):
 
287
  def forward(self, x, hint, timesteps, context, **kwargs):
288
  t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
289
  emb = self.time_embed(t_emb)
290
+ # hint = hint[:,:-1]
291
  guided_hint = self.input_hint_block(hint, emb, context, x.shape)
292
 
293
  outs = []
 
306
  outs.append(self.middle_block_out(h, emb, context))
307
 
308
  return outs
309
 
310
  class ControlLDM(LatentDiffusion):
311
 
 
332
  diffusion_model = self.model.diffusion_model
333
 
334
  cond_txt = torch.cat(cond['c_crossattn'], 1)
 
335
  if cond['c_concat'] is None:
336
  eps = diffusion_model(x=x_noisy, timesteps=t, context=cond_txt, control=None, only_mid_control=self.only_mid_control)
337
  else:
338
+ # control = self.control_model(x=x_noisy, hint=torch.cat(cond['c_concat'], 1), timesteps=t, context=cond_txt)
339
+ control = self.control_model(x=x_noisy, hint=cond['c_concat'][0], timesteps=t, context=cond_txt)
340
  control = [c * scale for c, scale in zip(control, self.control_scales)]
341
  eps = diffusion_model(x=x_noisy, timesteps=t, context=cond_txt, control=control, only_mid_control=self.only_mid_control)
342
 
 
355
  use_ddim = ddim_steps is not None
356
 
357
  log = dict()
358
+ z, c = self.get_input(batch, self.first_stage_key, bs=N, logging=True)
359
  c_cat, c = c["c_concat"][0][:N], c["c_crossattn"][0][:N]
360
  N = min(z.shape[0], N)
361
  n_row = min(z.shape[0], n_row)
 
410
  @torch.no_grad()
411
  def sample_log(self, cond, batch_size, ddim, ddim_steps, **kwargs):
412
  ddim_sampler = DDIMSampler(self)
413
+ b, c, h, w = cond["c_concat"][0][0].shape if isinstance(cond["c_concat"][0], list) else cond["c_concat"][0].shape
414
+ # shape = (self.channels, h // 8, w // 8)
415
+ shape = (self.channels, h, w)
416
  samples, intermediates = ddim_sampler.sample(ddim_steps, batch_size, shape, cond, verbose=False, **kwargs)
417
  return samples, intermediates
418
 
 
438
  self.cond_stage_model = self.cond_stage_model.cuda()
439
 
440
 
441
+
442
+ class PAIRDiffusion(ControlLDM):
443
  @torch.no_grad()
444
+ def __init__(self,control_stage_config, control_key, only_mid_control, app_net='vgg', app_layer_conc=(1,), app_layer_ca=(6,6,18,18),
445
+ appearance_net_locked=True, concat_multi_app=False, train_structure_variation_only=False, instruct=False, *args, **kwargs):
446
  super().__init__(control_stage_config=control_stage_config,
447
  control_key=control_key,
448
  only_mid_control=only_mid_control,
449
  *args, **kwargs)
 
 
450
 
451
+ self.appearance_net_conc = VGGPerceptualLoss().to(self.device)
452
+ self.appearance_net_ca = DINOv2().to(self.device)
453
+ self.appearance_net = VGGPerceptualLoss().to(self.device) # TODO: remove, unused
454
+ self.app_layer_conc = app_layer_conc
455
+ self.app_layer_ca = app_layer_ca
456
+
457
+
458
+ def get_appearance(self, net, layer, img, mask, return_all=False):
459
  img = (img + 1) * 0.5
460
+ feat = net(img)
461
+ splatted_feat = []
462
+ mean_feat = []
463
+ for fe_i in layer:
464
+ v = self.get_appearance_single(feat[fe_i], mask, return_all=return_all)
465
+ if return_all:
466
+ spl, me_f, one_hot, empty_mask = v
467
+ splatted_feat.append(spl)
468
+ mean_feat.append(me_f)
469
+ else:
470
+ splatted_feat.append(v)
471
+
472
+ if len(layer) == 1:
473
+ splatted_feat = splatted_feat[0]
474
+ # mean_feat = mean_feat[0]
475
+
476
+ del feat
477
+
478
+ if return_all:
479
+ return splatted_feat, mean_feat, one_hot, empty_mask
480
+
481
+ return splatted_feat
482
+
483
+ def get_appearance_single(self, feat, mask, return_all):
484
  empty_mask_flag = torch.sum(mask, dim=(1,2,3)) == 0
485
 
486
 
487
  empty_appearance = torch.zeros(feat.shape).to(self.device)
488
+ mask = torch.nn.functional.interpolate(mask.float(), size=(feat.shape[2], feat.shape[3])).long()
489
  one_hot = torch.nn.functional.one_hot(mask[:,0]).permute(0,3,1,2).float()
490
 
491
  feat = torch.einsum('nchw, nmhw->nmchw', feat, one_hot)
 
495
  mean_feat[:, 0] = torch.zeros(mean_feat[:,0].shape).to(self.device) #set edges in panoptic mask to empty appearance feature
496
 
497
  splatted_feat = torch.einsum('nmc, nmhw->nchw', mean_feat, one_hot)
498
+ splatted_feat[empty_mask_flag] = empty_appearance[empty_mask_flag]
499
  splatted_feat = torch.nn.functional.normalize(splatted_feat) #l2 normalize on c dim
500
 
501
  if return_all:
502
  return splatted_feat, mean_feat, one_hot, empty_mask_flag
 
503
  return splatted_feat
504
+
505
+
506
  def get_input(self, batch, k, bs=None, *args, **kwargs):
507
  z, c, x_orig, x_recon = super(ControlLDM, self).get_input(batch, self.first_stage_key, return_first_stage_outputs=True , *args, **kwargs)
508
  structure = batch['seg'].unsqueeze(1)
509
  mask = batch['mask'].unsqueeze(1).to(self.device)
510
+
511
+ appearance_conc = self.get_appearance(self.appearance_net_conc, self.app_layer_conc, x_orig, mask)
512
+ appearance_ca = self.get_appearance(self.appearance_net_ca, self.app_layer_ca, x_orig, mask)
513
+
514
  if bs is not None:
515
  structure = structure[:bs]
 
 
516
  structure = structure.to(self.device)
 
517
  structure = structure.to(memory_format=torch.contiguous_format).float()
518
+ structure = torch.nn.functional.interpolate(structure, z.shape[2:])
519
+
520
+ mask = torch.nn.functional.interpolate(mask.float(), z.shape[2:])
521
+
522
+ def format_appearance(appearance):
523
+ if isinstance(appearance, list):
524
+ if bs is not None:
525
+ appearance = [ap[:bs] for ap in appearance]
526
+ appearance = [ap.to(self.device) for ap in appearance]
527
+ appearance = [ap.to(memory_format=torch.contiguous_format).float() for ap in appearance]
528
+ appearance = [torch.nn.functional.interpolate(ap, z.shape[2:]) for ap in appearance]
529
+
530
+ else:
531
+ if bs is not None:
532
+ appearance = appearance[:bs]
533
+ appearance = appearance.to(self.device)
534
+ appearance = appearance.to(memory_format=torch.contiguous_format).float()
535
+ appearance = torch.nn.functional.interpolate(appearance, z.shape[2:])
536
+
537
+ return appearance
538
+
539
+ appearance_conc = format_appearance(appearance_conc)
540
+ appearance_ca = format_appearance(appearance_ca)
541
+
542
+ if isinstance(appearance_conc, list):
543
+ concat_control = torch.cat(appearance_conc, dim=1)
544
+ concat_control = torch.cat([structure, concat_control, mask], dim=1)
545
+ else:
546
+ concat_control = torch.cat([structure, appearance_conc, mask], dim=1)
547
+
548
+
549
+ if isinstance(appearance_ca, list):
550
+ control = []
551
+ for ap in appearance_ca:
552
+ control.append(torch.cat([structure, ap, mask], dim=1))
553
+ control.append(concat_control)
554
+ return z, dict(c_crossattn=[c], c_concat=[control])
555
+ else:
556
+ control = torch.cat([structure, appearance_ca, mask], dim=1)
557
+ control.append(concat_control)
558
+ return z, dict(c_crossattn=[c], c_concat=[control])
559
+
560
  @torch.no_grad()
561
  def log_images(self, batch, N=4, n_row=2, sample=False, ddim_steps=50, ddim_eta=0.0, return_keys=None,
562
  quantize_denoised=True, inpaint=True, plot_denoise_rows=False, plot_progressive_rows=False,
 
567
 
568
  log = dict()
569
  z, c = self.get_input(batch, self.first_stage_key, bs=N)
570
+ c_cat, c = c["c_concat"][0], c["c_crossattn"][0]
571
  N = min(z.shape[0], N)
572
  n_row = min(z.shape[0], n_row)
573
  log["reconstruction"] = self.decode_first_stage(z)
574
+ log["control"] = batch['mask'].unsqueeze(1)
575
+ if 'aug_mask' in batch:
576
+ log['aug_mask'] = batch['aug_mask'].unsqueeze(1)
577
+
578
  log["conditioning"] = log_txt_as_img((512, 512), batch[self.cond_stage_key], size=16)
579
 
580
  if plot_diffusion_rows:
 
616
 
617
  if unconditional_guidance_scale > 1.0:
618
  uc_cross = self.get_unconditional_conditioning(N)
619
+ uc_cat = list(c_cat) # torch.zeros_like(c_cat)
620
  uc_full = {"c_concat": [uc_cat], "c_crossattn": [uc_cross]}
621
  samples_cfg, _ = self.sample_log(cond={"c_concat": [c_cat], "c_crossattn": [c]},
622
  batch_size=N, ddim=use_ddim,
 
628
  log[f"samples_cfg_scale_{unconditional_guidance_scale:.2f}"] = x_samples_cfg
629
 
630
  return log
631
+
632
+
633
+ def configure_optimizers(self):
634
+ lr = self.learning_rate
635
+
636
+ params = list(self.control_model.parameters())
637
+ if not self.sd_locked:
638
+ params += list(self.model.diffusion_model.output_blocks.parameters())
639
+ params += list(self.model.diffusion_model.out.parameters())
640
+
641
+ opt = torch.optim.AdamW(params, lr=lr)
642
+ return opt
643
+
644
+
645
+
cldm/controlnet.py ADDED
@@ -0,0 +1,306 @@
1
+ import torch
2
+ import torch as th
3
+ import torch.nn as nn
4
+
5
+ from ldm.modules.diffusionmodules.util import (
6
+ conv_nd,
7
+ linear,
8
+ zero_module,
9
+ timestep_embedding,
10
+ )
11
+
12
+ from ldm.modules.attention import SpatialTransformer
13
+ from ldm.modules.diffusionmodules.openaimodel import TimestepEmbedSequential, ResBlock, Downsample, AttentionBlock
14
+ from ldm.util import exists
15
+
16
+ torch.autograd.set_detect_anomaly(True)
17
+
18
+ class Interpolate(nn.Module):
19
+ def __init__(self, mode):
20
+ super(Interpolate, self).__init__()
21
+ self.interp = torch.nn.functional.interpolate
22
+ self.mode = mode
23
+ self.factor = 8
24
+
25
+ def forward(self, x):
26
+ return x
27
+
28
+ class ControlNetPAIR(nn.Module):
29
+ def __init__(
30
+ self,
31
+ image_size,
32
+ in_channels,
33
+ model_channels,
34
+ hint_channels,
35
+ concat_indices,
36
+ num_res_blocks,
37
+ attention_resolutions,
38
+ concat_channels=130,
39
+ dropout=0,
40
+ channel_mult=(1, 2, 4, 8),
41
+ mode='nearest',
42
+ conv_resample=True,
43
+ dims=2,
44
+ use_checkpoint=False,
45
+ use_fp16=False,
46
+ num_heads=-1,
47
+ num_head_channels=-1,
48
+ num_heads_upsample=-1,
49
+ use_scale_shift_norm=False,
50
+ resblock_updown=False,
51
+ use_new_attention_order=False,
52
+ use_spatial_transformer=False, # custom transformer support
53
+ transformer_depth=1, # custom transformer support
54
+ context_dim=None, # custom transformer support
55
+ n_embed=None, # custom support for prediction of discrete ids into codebook of first stage vq model
56
+ legacy=True,
57
+ disable_self_attentions=None,
58
+ num_attention_blocks=None,
59
+ disable_middle_self_attn=False,
60
+ use_linear_in_transformer=False,
61
+ attn_class=['softmax', 'softmax', 'softmax', 'softmax'],
62
+ ):
63
+ super().__init__()
64
+ if use_spatial_transformer:
65
+ assert context_dim is not None, 'Fool!! You forgot to include the dimension of your cross-attention conditioning...'
66
+
67
+ if context_dim is not None:
68
+ assert use_spatial_transformer, 'Fool!! You forgot to use the spatial transformer for your cross-attention conditioning...'
69
+ from omegaconf.listconfig import ListConfig
70
+ if type(context_dim) == ListConfig:
71
+ context_dim = list(context_dim)
72
+
73
+ if num_heads_upsample == -1:
74
+ num_heads_upsample = num_heads
75
+
76
+ if num_heads == -1:
77
+ assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set'
78
+
79
+ if num_head_channels == -1:
80
+ assert num_heads != -1, 'Either num_heads or num_head_channels has to be set'
81
+
82
+ self.dims = dims
83
+ self.image_size = image_size
84
+ self.in_channels = in_channels
85
+ self.model_channels = model_channels
86
+ if isinstance(num_res_blocks, int):
87
+ self.num_res_blocks = len(channel_mult) * [num_res_blocks]
88
+ else:
89
+ if len(num_res_blocks) != len(channel_mult):
90
+ raise ValueError("provide num_res_blocks either as an int (globally constant) or "
91
+ "as a list/tuple (per-level) with the same length as channel_mult")
92
+ self.num_res_blocks = num_res_blocks
93
+ if disable_self_attentions is not None:
94
+ # should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not
95
+ assert len(disable_self_attentions) == len(channel_mult)
96
+ if num_attention_blocks is not None:
97
+ assert len(num_attention_blocks) == len(self.num_res_blocks)
98
+ assert all(map(lambda i: self.num_res_blocks[i] >= num_attention_blocks[i], range(len(num_attention_blocks))))
99
+ print(f"Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. "
100
+ f"This option has LESS priority than attention_resolutions {attention_resolutions}, "
101
+ f"i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, "
102
+ f"attention will still not be set.")
103
+
104
+ self.attention_resolutions = attention_resolutions
105
+ self.dropout = dropout
106
+ self.channel_mult = channel_mult
107
+ self.conv_resample = conv_resample
108
+ self.use_checkpoint = use_checkpoint
109
+ self.dtype = th.float16 if use_fp16 else th.float32
110
+ self.num_heads = num_heads
111
+ self.num_head_channels = num_head_channels
112
+ self.num_heads_upsample = num_heads_upsample
113
+ self.predict_codebook_ids = n_embed is not None
114
+
115
+ time_embed_dim = model_channels * 4
116
+ self.time_embed = nn.Sequential(
117
+ linear(model_channels, time_embed_dim),
118
+ nn.SiLU(),
119
+ linear(time_embed_dim, time_embed_dim),
120
+ )
121
+
122
+ self.input_blocks = nn.ModuleList(
123
+ [
124
+ TimestepEmbedSequential(
125
+ conv_nd(dims, in_channels, model_channels, 3, padding=1)
126
+ )
127
+ ]
128
+ )
129
+ self.zero_convs = nn.ModuleList([self.make_zero_conv(model_channels)])
130
+ self.concat_indices = concat_indices
131
+ self.hint_channels = hint_channels
132
+ h_ch = sum([hint_channels[i] for i in concat_indices ])
133
+
134
+ self.input_hint_block = TimestepEmbedSequential(
135
+ Interpolate('nearest'),
136
+ conv_nd(self.dims, concat_channels, self.model_channels, 3, padding=1),
137
+ nn.SiLU(),
138
+ zero_module(conv_nd(self.dims, self.model_channels, self.model_channels, 3, padding=1)))
139
+
140
+ self._feature_size = model_channels
141
+ input_block_chans = [model_channels]
142
+ ch = model_channels
143
+ ds = 1
144
+ for level, mult in enumerate(channel_mult):
145
+ for nr in range(self.num_res_blocks[level]):
146
+ layers = [
147
+ ResBlock(
148
+ ch,
149
+ time_embed_dim,
150
+ dropout,
151
+ out_channels=mult * model_channels,
152
+ dims=dims,
153
+ use_checkpoint=use_checkpoint,
154
+ use_scale_shift_norm=use_scale_shift_norm,
155
+ )
156
+ ]
157
+ ch = mult * model_channels
158
+ if ds in attention_resolutions:
159
+ if num_head_channels == -1:
160
+ dim_head = ch // num_heads
161
+ else:
162
+ num_heads = ch // num_head_channels
163
+ dim_head = num_head_channels
164
+ if legacy:
165
+ # num_heads = 1
166
+ dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
167
+ if exists(disable_self_attentions):
168
+ disabled_sa = disable_self_attentions[level]
169
+ else:
170
+ disabled_sa = False
171
+
172
+ if not exists(num_attention_blocks) or nr < num_attention_blocks[level]:
173
+ layers.append(
174
+ AttentionBlock(
175
+ ch,
176
+ use_checkpoint=use_checkpoint,
177
+ num_heads=num_heads,
178
+ num_head_channels=dim_head,
179
+ use_new_attention_order=use_new_attention_order,
180
+ ) if not use_spatial_transformer else SpatialTransformer(
181
+ ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
182
+ disable_self_attn=disabled_sa, use_linear=use_linear_in_transformer,
183
+ use_checkpoint=use_checkpoint, attn1_mode=attn_class[level], obj_feat_dim=hint_channels[level]
184
+ )
185
+ )
186
+ self.input_blocks.append(TimestepEmbedSequential(*layers))
187
+ self.zero_convs.append(self.make_zero_conv(ch))
188
+ self._feature_size += ch
189
+ input_block_chans.append(ch)
190
+ if level != len(channel_mult) - 1:
191
+ out_ch = ch
192
+ self.input_blocks.append(
193
+ TimestepEmbedSequential(
194
+ ResBlock(
195
+ ch,
196
+ time_embed_dim,
197
+ dropout,
198
+ out_channels=out_ch,
199
+ dims=dims,
200
+ use_checkpoint=use_checkpoint,
201
+ use_scale_shift_norm=use_scale_shift_norm,
202
+ down=True,
203
+ )
204
+ if resblock_updown
205
+ else Downsample(
206
+ ch, conv_resample, dims=dims, out_channels=out_ch
207
+ )
208
+ )
209
+ )
210
+ ch = out_ch
211
+ input_block_chans.append(ch)
212
+ self.zero_convs.append(self.make_zero_conv(ch))
213
+ ds *= 2
214
+ self._feature_size += ch
215
+
216
+ if num_head_channels == -1:
217
+ dim_head = ch // num_heads
218
+ else:
219
+ num_heads = ch // num_head_channels
220
+ dim_head = num_head_channels
221
+ if legacy:
222
+ # num_heads = 1
223
+ dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
224
+ self.middle_block = TimestepEmbedSequential(
225
+ ResBlock(
226
+ ch,
227
+ time_embed_dim,
228
+ # hint_channels[-1],
229
+ dropout,
230
+ dims=dims,
231
+ use_checkpoint=use_checkpoint,
232
+ use_scale_shift_norm=use_scale_shift_norm,
233
+ ),
234
+ AttentionBlock(
235
+ ch,
236
+ use_checkpoint=use_checkpoint,
237
+ num_heads=num_heads,
238
+ num_head_channels=dim_head,
239
+ use_new_attention_order=use_new_attention_order,
240
+ ) if not use_spatial_transformer else SpatialTransformer( # always uses a self-attn
241
+ ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
242
+ disable_self_attn=disable_middle_self_attn, use_linear=use_linear_in_transformer,
243
+ use_checkpoint=use_checkpoint
244
+ ),
245
+ ResBlock(
246
+ ch,
247
+ time_embed_dim,
248
+ # hint_channels[-1],
249
+ dropout,
250
+ dims=dims,
251
+ use_checkpoint=use_checkpoint,
252
+ use_scale_shift_norm=use_scale_shift_norm,
253
+ ),
254
+ )
255
+ self.middle_block_out = self.make_zero_conv(ch)
256
+ self._feature_size += ch
257
+
258
+ def make_zero_conv(self, channels):
259
+ return TimestepEmbedSequential(zero_module(conv_nd(self.dims, channels, channels, 1, padding=0)))
260
+
261
+ def forward(self, x, hint, timesteps, context, **kwargs):
262
+ hint_list = []
263
+ concat_hint = hint[-1]
264
+ hint_c = hint[:-1]
265
+
266
+ if not isinstance(hint_c, list):
267
+ for _ in range(len(self.channel_mult)):
268
+ hint_list.append(hint_c)
269
+ else:
270
+ hint_list = hint_c
271
+ while len(hint_list) < 4:
272
+ hint_list.append(hint_c[-1])
273
+
274
+ mask = hint_c[0][:,-1].unsqueeze(1) #panoptic
275
+
276
+ t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
277
+ emb = self.time_embed(t_emb)
278
+
279
+ guided_hint = self.input_hint_block(concat_hint, emb, context, x.shape)
280
+ outs = []
281
+
282
+ h = x.type(self.dtype)
283
+
284
+ cnt = self.num_res_blocks[0] + 1
285
+ i = 0
286
+ for module, zero_conv in zip(self.input_blocks, self.zero_convs):
287
+ if guided_hint is not None:
288
+ h = module(h, emb, context, hint_list[i], mask)
289
+ h += guided_hint
290
+ guided_hint = None
291
+ else:
292
+ h = module(h, emb, context, hint_list[i], mask)
293
+ outs.append(zero_conv(h, emb, context))
294
+
295
+ cnt -= 1
296
+ if cnt == 0:
297
+ if i<len(self.num_res_blocks):
298
+ cnt = self.num_res_blocks[i] + 1
299
+ else:
300
+ if (i+1)<len(self.num_res_blocks):
301
+ i += 1
302
+
303
+ h = self.middle_block(h, emb, context, hint_list[-1], mask)
304
+ outs.append(self.middle_block_out(h, emb, context))
305
+
306
+ return outs
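ControlNetPAIR.forward above treats hint as a list: the last entry is the channel-concatenated control (structure + VGG appearance + panoptic mask) fed through input_hint_block, while the earlier entries are the per-level appearance maps routed into the mask-guided attention blocks. A rough sketch of how such a hint list is assembled, mirroring PAIRDiffusion.get_input (all shapes illustrative):

    import torch

    N, H, W = 2, 64, 64                                         # latent-resolution spatial size (illustrative)
    structure = torch.randn(N, 1, H, W)
    mask      = torch.randn(N, 1, H, W)
    vgg_app   = torch.randn(N, 128, H, W)                       # appearance used in the concat branch
    dino_app  = [torch.randn(N, 1024, H, W) for _ in range(4)]  # per-level cross-attention appearance

    concat_hint = torch.cat([structure, vgg_app, mask], dim=1)                  # -> concat_channels (e.g. 130)
    ca_hints    = [torch.cat([structure, a, mask], dim=1) for a in dino_app]    # -> hint_channels (e.g. 1026)
    hint = ca_hints + [concat_hint]  # forward() splits this into hint_c = hint[:-1] and concat_hint = hint[-1]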
cldm/ddim_hacked.py CHANGED
@@ -316,7 +316,6 @@ class DDIMSampler(object):
316
  return x_dec
317
 
318
 
319
-
320
  class DDIMSamplerSpaCFG(DDIMSampler):
321
  @torch.no_grad()
322
  def p_sample_ddim(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
@@ -332,8 +331,8 @@ class DDIMSamplerSpaCFG(DDIMSampler):
332
  model_uncond = self.model.apply_model(x, t, unconditional_conditioning[0])
333
  model_struct = self.model.apply_model(x, t, unconditional_conditioning[1])
334
  model_struct_app = self.model.apply_model(x, t, unconditional_conditioning[2])
335
- sT, sS, sF = unconditional_guidance_scale
336
- model_output = model_uncond + sS * (model_struct - model_uncond) + sF * (model_struct_app - model_struct) + sT * (model_t - model_struct_app)
337
 
338
  if self.model.parameterization == "v":
339
  e_t = self.model.predict_eps_from_z_and_v(x, t, model_output)
 
316
  return x_dec
317
 
318
 
 
319
  class DDIMSamplerSpaCFG(DDIMSampler):
320
  @torch.no_grad()
321
  def p_sample_ddim(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
 
331
  model_uncond = self.model.apply_model(x, t, unconditional_conditioning[0])
332
  model_struct = self.model.apply_model(x, t, unconditional_conditioning[1])
333
  model_struct_app = self.model.apply_model(x, t, unconditional_conditioning[2])
334
+ sS, sF, sT = unconditional_guidance_scale
335
+ model_output = model_uncond + sS * (model_struct - model_uncond) + sF * (model_struct_app - model_struct) + sT * (model_t - model_uncond)
336
 
337
  if self.model.parameterization == "v":
338
  e_t = self.model.predict_eps_from_z_and_v(x, t, model_output)
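The corrected DDIMSamplerSpaCFG line unpacks the guidance scales as (sS, sF, sT) and anchors the text term on the unconditional prediction, giving a factored classifier-free guidance over structure, appearance and text. A compact restatement (function and argument names are illustrative; the eps_* values stand for the four model predictions computed just above):

    def spa_cfg(eps_uncond, eps_struct, eps_struct_app, eps_full, scales):
        # Factored guidance: structure, then appearance on top of structure, then text.
        sS, sF, sT = scales
        return (eps_uncond
                + sS * (eps_struct - eps_uncond)
                + sF * (eps_struct_app - eps_struct)
                + sT * (eps_full - eps_uncond))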
cldm/logger.py CHANGED
@@ -114,16 +114,16 @@ class SetupCallback(Callback):
114
  OmegaConf.save(OmegaConf.create({"lightning": self.lightning_config}),
115
  os.path.join(self.cfgdir, "{}-lightning.yaml".format(self.now)))
116
 
117
- else:
118
- # ModelCheckpoint callback created log directory --- remove it
119
- if not self.resume and os.path.exists(self.logdir):
120
- dst, name = os.path.split(self.logdir)
121
- dst = os.path.join(dst, "child_runs", name)
122
- os.makedirs(os.path.split(dst)[0], exist_ok=True)
123
- try:
124
- os.rename(self.logdir, dst)
125
- except FileNotFoundError:
126
- pass
127
 
128
 
129
  class ImageLogger(Callback):
 
114
  OmegaConf.save(OmegaConf.create({"lightning": self.lightning_config}),
115
  os.path.join(self.cfgdir, "{}-lightning.yaml".format(self.now)))
116
 
117
+ # else:
118
+ # # ModelCheckpoint callback created log directory --- remove it
119
+ # if not self.resume and os.path.exists(self.logdir):
120
+ # dst, name = os.path.split(self.logdir)
121
+ # dst = os.path.join(dst, "child_runs", name)
122
+ # os.makedirs(os.path.split(dst)[0], exist_ok=True)
123
+ # try:
124
+ # os.rename(self.logdir, dst)
125
+ # except FileNotFoundError:
126
+ # pass
127
 
128
 
129
  class ImageLogger(Callback):
configs/{sap_fixed_hintnet_v15.yaml → pair_diff.yaml} RENAMED
@@ -1,9 +1,9 @@
1
  model:
2
- target: cldm.cldm.SAP
3
  learning_rate: 1.5e-05
4
  sd_locked: True
5
  only_mid_control: False
6
- init_ckpt: './models/sap_sd15_ini_fixed.ckpt'
7
  params:
8
  linear_start: 0.00085
9
  linear_end: 0.0120
@@ -21,14 +21,17 @@ model:
21
  scale_factor: 0.18215
22
  use_ema: False
23
  only_mid_control: False
 
 
24
 
25
  control_stage_config:
26
- target: cldm.cldm.ControlNetSAP
27
  params:
28
- input_hint_block: 'fixed'
29
  image_size: 32 # unused
30
  in_channels: 4
31
- hint_channels: 129 #(128 + 1)
 
 
32
  model_channels: 320
33
  attention_resolutions: [ 4, 2, 1 ]
34
  num_res_blocks: 2
@@ -39,6 +42,7 @@ model:
39
  context_dim: 768
40
  use_checkpoint: True
41
  legacy: False
 
42
 
43
  unet_config:
44
  target: cldm.cldm.ControlledUnetModel
@@ -87,16 +91,25 @@ model:
87
  data:
88
  target: cldm.data.DataModuleFromConfig
89
  params:
90
- batch_size: 4
91
  wrap: True
 
92
  train:
93
  target: dataset.txtseg.COCOTrain
94
  params:
 
 
 
 
95
  size: 512
96
  validation:
97
  target: dataset.txtseg.COCOValidation
98
  params:
99
  size: 512
 
 
 
 
100
 
101
 
102
  lightning:
@@ -111,4 +124,4 @@ lightning:
111
 
112
  trainer:
113
  benchmark: True
114
- accumulate_grad_batches: 4
 
1
  model:
2
+ target: cldm.cldm.PAIRDiffusion
3
  learning_rate: 1.5e-05
4
  sd_locked: True
5
  only_mid_control: False
6
+ init_ckpt: './models/pair_diff_init.ckpt'
7
  params:
8
  linear_start: 0.00085
9
  linear_end: 0.0120
 
21
  scale_factor: 0.18215
22
  use_ema: False
23
  only_mid_control: False
24
+ appearance_net_locked: True
25
+ app_net: 'DINO'
26
 
27
  control_stage_config:
28
+ target: cldm.controlnet.ControlNetPAIR
29
  params:
 
30
  image_size: 32 # unused
31
  in_channels: 4
32
+ concat_indices: [0,1]
33
+ concat_channels: 130
34
+ hint_channels: [1026, 1026, -1, -1] #(1024 + 2)
35
  model_channels: 320
36
  attention_resolutions: [ 4, 2, 1 ]
37
  num_res_blocks: 2
 
42
  context_dim: 768
43
  use_checkpoint: True
44
  legacy: False
45
+ attn_class: ['maskguided', 'maskguided', 'softmax', 'softmax']
46
 
47
  unet_config:
48
  target: cldm.cldm.ControlledUnetModel
 
91
  data:
92
  target: cldm.data.DataModuleFromConfig
93
  params:
94
+ batch_size: 2
95
  wrap: True
96
+ num_workers: 4
97
  train:
98
  target: dataset.txtseg.COCOTrain
99
  params:
100
+ image_dir:
101
+ caption_file:
102
+ panoptic_mask_dir:
103
+ seg_dir:
104
  size: 512
105
  validation:
106
  target: dataset.txtseg.COCOValidation
107
  params:
108
  size: 512
109
+ image_dir:
110
+ caption_file:
111
+ panoptic_mask_dir:
112
+ seg_dir:
113
 
114
 
115
  lightning:
 
124
 
125
  trainer:
126
  benchmark: True
127
+ accumulate_grad_batches: 2
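The renamed config now targets cldm.cldm.PAIRDiffusion and cldm.controlnet.ControlNetPAIR; the COCO paths (image_dir, caption_file, panoptic_mask_dir, seg_dir) are deliberately left blank for the user to fill in. A minimal sketch of loading a model from this config, mirroring the demo script added below (the checkpoint path is a placeholder):

    from cldm.model import create_model, load_state_dict

    model = create_model('configs/pair_diff.yaml').cpu()
    model.load_state_dict(load_state_dict('/path/to/pair_diff_checkpoint.ckpt', location='cuda'))
    model = model.cuda()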
ldm/ldm/util.py ADDED
@@ -0,0 +1,197 @@
1
+ import importlib
2
+
3
+ import torch
4
+ from torch import optim
5
+ import numpy as np
6
+
7
+ from inspect import isfunction
8
+ from PIL import Image, ImageDraw, ImageFont
9
+
10
+
11
+ def log_txt_as_img(wh, xc, size=10):
12
+ # wh a tuple of (width, height)
13
+ # xc a list of captions to plot
14
+ b = len(xc)
15
+ txts = list()
16
+ for bi in range(b):
17
+ txt = Image.new("RGB", wh, color="white")
18
+ draw = ImageDraw.Draw(txt)
19
+ font = ImageFont.truetype('font/DejaVuSans.ttf', size=size)
20
+ nc = int(40 * (wh[0] / 256))
21
+ lines = "\n".join(xc[bi][start:start + nc] for start in range(0, len(xc[bi]), nc))
22
+
23
+ try:
24
+ draw.text((0, 0), lines, fill="black", font=font)
25
+ except UnicodeEncodeError:
26
+ print("Cant encode string for logging. Skipping.")
27
+
28
+ txt = np.array(txt).transpose(2, 0, 1) / 127.5 - 1.0
29
+ txts.append(txt)
30
+ txts = np.stack(txts)
31
+ txts = torch.tensor(txts)
32
+ return txts
33
+
34
+
35
+ def ismap(x):
36
+ if not isinstance(x, torch.Tensor):
37
+ return False
38
+ return (len(x.shape) == 4) and (x.shape[1] > 3)
39
+
40
+
41
+ def isimage(x):
42
+ if not isinstance(x,torch.Tensor):
43
+ return False
44
+ return (len(x.shape) == 4) and (x.shape[1] == 3 or x.shape[1] == 1)
45
+
46
+
47
+ def exists(x):
48
+ return x is not None
49
+
50
+
51
+ def default(val, d):
52
+ if exists(val):
53
+ return val
54
+ return d() if isfunction(d) else d
55
+
56
+
57
+ def mean_flat(tensor):
58
+ """
59
+ https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/nn.py#L86
60
+ Take the mean over all non-batch dimensions.
61
+ """
62
+ return tensor.mean(dim=list(range(1, len(tensor.shape))))
63
+
64
+
65
+ def count_params(model, verbose=False):
66
+ total_params = sum(p.numel() for p in model.parameters())
67
+ if verbose:
68
+ print(f"{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.")
69
+ return total_params
70
+
71
+
72
+ def instantiate_from_config(config):
73
+ if not "target" in config:
74
+ if config == '__is_first_stage__':
75
+ return None
76
+ elif config == "__is_unconditional__":
77
+ return None
78
+ raise KeyError("Expected key `target` to instantiate.")
79
+ return get_obj_from_str(config["target"])(**config.get("params", dict()))
80
+
81
+
82
+ def get_obj_from_str(string, reload=False):
83
+ module, cls = string.rsplit(".", 1)
84
+ if reload:
85
+ module_imp = importlib.import_module(module)
86
+ importlib.reload(module_imp)
87
+ return getattr(importlib.import_module(module, package=None), cls)
88
+
89
+
90
+ class AdamWwithEMAandWings(optim.Optimizer):
91
+ # credit to https://gist.github.com/crowsonkb/65f7265353f403714fce3b2595e0b298
92
+ def __init__(self, params, lr=1.e-3, betas=(0.9, 0.999), eps=1.e-8, # TODO: check hyperparameters before using
93
+ weight_decay=1.e-2, amsgrad=False, ema_decay=0.9999, # ema decay to match previous code
94
+ ema_power=1., param_names=()):
95
+ """AdamW that saves EMA versions of the parameters."""
96
+ if not 0.0 <= lr:
97
+ raise ValueError("Invalid learning rate: {}".format(lr))
98
+ if not 0.0 <= eps:
99
+ raise ValueError("Invalid epsilon value: {}".format(eps))
100
+ if not 0.0 <= betas[0] < 1.0:
101
+ raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
102
+ if not 0.0 <= betas[1] < 1.0:
103
+ raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
104
+ if not 0.0 <= weight_decay:
105
+ raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
106
+ if not 0.0 <= ema_decay <= 1.0:
107
+ raise ValueError("Invalid ema_decay value: {}".format(ema_decay))
108
+ defaults = dict(lr=lr, betas=betas, eps=eps,
109
+ weight_decay=weight_decay, amsgrad=amsgrad, ema_decay=ema_decay,
110
+ ema_power=ema_power, param_names=param_names)
111
+ super().__init__(params, defaults)
112
+
113
+ def __setstate__(self, state):
114
+ super().__setstate__(state)
115
+ for group in self.param_groups:
116
+ group.setdefault('amsgrad', False)
117
+
118
+ @torch.no_grad()
119
+ def step(self, closure=None):
120
+ """Performs a single optimization step.
121
+ Args:
122
+ closure (callable, optional): A closure that reevaluates the model
123
+ and returns the loss.
124
+ """
125
+ loss = None
126
+ if closure is not None:
127
+ with torch.enable_grad():
128
+ loss = closure()
129
+
130
+ for group in self.param_groups:
131
+ params_with_grad = []
132
+ grads = []
133
+ exp_avgs = []
134
+ exp_avg_sqs = []
135
+ ema_params_with_grad = []
136
+ state_sums = []
137
+ max_exp_avg_sqs = []
138
+ state_steps = []
139
+ amsgrad = group['amsgrad']
140
+ beta1, beta2 = group['betas']
141
+ ema_decay = group['ema_decay']
142
+ ema_power = group['ema_power']
143
+
144
+ for p in group['params']:
145
+ if p.grad is None:
146
+ continue
147
+ params_with_grad.append(p)
148
+ if p.grad.is_sparse:
149
+ raise RuntimeError('AdamW does not support sparse gradients')
150
+ grads.append(p.grad)
151
+
152
+ state = self.state[p]
153
+
154
+ # State initialization
155
+ if len(state) == 0:
156
+ state['step'] = 0
157
+ # Exponential moving average of gradient values
158
+ state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
159
+ # Exponential moving average of squared gradient values
160
+ state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
161
+ if amsgrad:
162
+ # Maintains max of all exp. moving avg. of sq. grad. values
163
+ state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
164
+ # Exponential moving average of parameter values
165
+ state['param_exp_avg'] = p.detach().float().clone()
166
+
167
+ exp_avgs.append(state['exp_avg'])
168
+ exp_avg_sqs.append(state['exp_avg_sq'])
169
+ ema_params_with_grad.append(state['param_exp_avg'])
170
+
171
+ if amsgrad:
172
+ max_exp_avg_sqs.append(state['max_exp_avg_sq'])
173
+
174
+ # update the steps for each param group update
175
+ state['step'] += 1
176
+ # record the step after step update
177
+ state_steps.append(state['step'])
178
+
179
+ optim._functional.adamw(params_with_grad,
180
+ grads,
181
+ exp_avgs,
182
+ exp_avg_sqs,
183
+ max_exp_avg_sqs,
184
+ state_steps,
185
+ amsgrad=amsgrad,
186
+ beta1=beta1,
187
+ beta2=beta2,
188
+ lr=group['lr'],
189
+ weight_decay=group['weight_decay'],
190
+ eps=group['eps'],
191
+ maximize=False)
192
+
193
+ cur_ema_decay = min(ema_decay, 1 - state['step'] ** -ema_power)
194
+ for param, ema_param in zip(params_with_grad, ema_params_with_grad):
195
+ ema_param.mul_(cur_ema_decay).add_(param.float(), alpha=1 - cur_ema_decay)
196
+
197
+ return loss
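instantiate_from_config above resolves the dotted target string and forwards params as keyword arguments; this is the mechanism the YAML config relies on. A tiny usage sketch (the target shown is only an illustration):

    from ldm.util import instantiate_from_config

    cfg = {"target": "torch.nn.Linear", "params": {"in_features": 8, "out_features": 4}}
    layer = instantiate_from_config(cfg)   # same as torch.nn.Linear(in_features=8, out_features=4)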
ldm/models/diffusion/ddim.py CHANGED
@@ -194,9 +194,19 @@ class DDIMSampler(object):
194
  c_in = dict()
195
  for k in c:
196
  if isinstance(c[k], list):
197
- c_in[k] = [torch.cat([
198
- unconditional_conditioning[k][i],
199
- c[k][i]]) for i in range(len(c[k]))]
 
 
 
 
 
 
 
 
 
 
200
  else:
201
  c_in[k] = torch.cat([
202
  unconditional_conditioning[k],
@@ -333,4 +343,5 @@ class DDIMSampler(object):
333
  unconditional_guidance_scale=unconditional_guidance_scale,
334
  unconditional_conditioning=unconditional_conditioning)
335
  if callback: callback(i)
336
- return x_dec
 
 
194
  c_in = dict()
195
  for k in c:
196
  if isinstance(c[k], list):
197
+ c_in[k] = []
198
+ if isinstance(c[k][0], list):
199
+ for i in range(len(c[k])):
200
+ c_ = []
201
+ for j in range(len(c[k][i])):
202
+ c_.append(torch.cat([
203
+ unconditional_conditioning[k][i][j],
204
+ c[k][i][j]]) )
205
+ c_in[k].append(c_)
206
+ else:
207
+ c_in[k] = [torch.cat([
208
+ unconditional_conditioning[k][i],
209
+ c[k][i]]) for i in range(len(c[k]))]
210
  else:
211
  c_in[k] = torch.cat([
212
  unconditional_conditioning[k],
 
343
  unconditional_guidance_scale=unconditional_guidance_scale,
344
  unconditional_conditioning=unconditional_conditioning)
345
  if callback: callback(i)
346
+ return x_dec
347
+
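The extra branch above lets DDIMSampler batch unconditional and conditional controls even when an entry of c_concat is itself a list (one tensor per resolution level plus the concat hint): it concatenates leaf-wise instead of assuming a flat list of tensors. A compact equivalent of that behaviour (illustrative, not the repository's code):

    import torch

    def cat_leafwise(uncond, cond):
        # Concatenate unconditional and conditional controls along the batch
        # dimension, recursing through nested lists (multi-level hints).
        if isinstance(cond, list):
            return [cat_leafwise(u, c) for u, c in zip(uncond, cond)]
        return torch.cat([uncond, cond])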
ldm/modules/attention.py CHANGED
@@ -42,7 +42,7 @@ def init_(tensor):
42
  dim = tensor.shape[-1]
43
  std = 1 / math.sqrt(dim)
44
  tensor.uniform_(-std, std)
45
- return tensor
46
 
47
 
48
  # feedforward
@@ -143,7 +143,7 @@ class SpatialSelfAttention(nn.Module):
143
 
144
 
145
  class CrossAttention(nn.Module):
146
- def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.):
147
  super().__init__()
148
  inner_dim = dim_head * heads
149
  context_dim = default(context_dim, query_dim)
@@ -160,7 +160,7 @@ class CrossAttention(nn.Module):
160
  nn.Dropout(dropout)
161
  )
162
 
163
- def forward(self, x, context=None, mask=None):
164
  h = self.heads
165
 
166
  q = self.to_q(x)
@@ -194,6 +194,34 @@ class CrossAttention(nn.Module):
194
  return self.to_out(out)
195
 
196
197
  class MemoryEfficientCrossAttention(nn.Module):
198
  # https://github.com/MatthieuTPHR/diffusers/blob/d80b531ff8060ec1ea982b65a1b8df70f73aa67c/src/diffusers/models/attention.py#L223
199
  def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.0):
@@ -246,17 +274,19 @@ class MemoryEfficientCrossAttention(nn.Module):
246
  class BasicTransformerBlock(nn.Module):
247
  ATTENTION_MODES = {
248
  "softmax": CrossAttention, # vanilla attention
249
- "softmax-xformers": MemoryEfficientCrossAttention
 
250
  }
251
  def __init__(self, dim, n_heads, d_head, dropout=0., context_dim=None, gated_ff=True, checkpoint=True,
252
- disable_self_attn=False):
253
  super().__init__()
254
  attn_mode = "softmax-xformers" if XFORMERS_IS_AVAILBLE else "softmax"
255
  assert attn_mode in self.ATTENTION_MODES
256
  attn_cls = self.ATTENTION_MODES[attn_mode]
 
257
  self.disable_self_attn = disable_self_attn
258
- self.attn1 = attn_cls(query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout,
259
- context_dim=context_dim if self.disable_self_attn else None) # is a self-attention if not self.disable_self_attn
260
  self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
261
  self.attn2 = attn_cls(query_dim=dim, context_dim=context_dim,
262
  heads=n_heads, dim_head=d_head, dropout=dropout) # is self-attn if context is none
@@ -265,11 +295,17 @@ class BasicTransformerBlock(nn.Module):
265
  self.norm3 = nn.LayerNorm(dim)
266
  self.checkpoint = checkpoint
267
 
268
- def forward(self, x, context=None):
269
- return checkpoint(self._forward, (x, context), self.parameters(), self.checkpoint)
270
 
271
- def _forward(self, x, context=None):
272
- x = self.attn1(self.norm1(x), context=context if self.disable_self_attn else None) + x
 
273
  x = self.attn2(self.norm2(x), context=context) + x
274
  x = self.ff(self.norm3(x)) + x
275
  return x
@@ -287,7 +323,7 @@ class SpatialTransformer(nn.Module):
287
  def __init__(self, in_channels, n_heads, d_head,
288
  depth=1, dropout=0., context_dim=None,
289
  disable_self_attn=False, use_linear=False,
290
- use_checkpoint=True):
291
  super().__init__()
292
  if exists(context_dim) and not isinstance(context_dim, list):
293
  context_dim = [context_dim]
@@ -305,7 +341,8 @@ class SpatialTransformer(nn.Module):
305
 
306
  self.transformer_blocks = nn.ModuleList(
307
  [BasicTransformerBlock(inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim[d],
308
- disable_self_attn=disable_self_attn, checkpoint=use_checkpoint)
 
309
  for d in range(depth)]
310
  )
311
  if not use_linear:
@@ -318,11 +355,20 @@ class SpatialTransformer(nn.Module):
318
  self.proj_out = zero_module(nn.Linear(in_channels, inner_dim))
319
  self.use_linear = use_linear
320
 
321
- def forward(self, x, context=None):
322
  # note: if no context is given, cross-attention defaults to self-attention
323
  if not isinstance(context, list):
324
  context = [context]
325
  b, c, h, w = x.shape
326
  x_in = x
327
  x = self.norm(x)
328
  if not self.use_linear:
@@ -331,7 +377,7 @@ class SpatialTransformer(nn.Module):
331
  if self.use_linear:
332
  x = self.proj_in(x)
333
  for i, block in enumerate(self.transformer_blocks):
334
- x = block(x, context=context[i])
335
  if self.use_linear:
336
  x = self.proj_out(x)
337
  x = rearrange(x, 'b (h w) c -> b c h w', h=h, w=w).contiguous()
 
42
  dim = tensor.shape[-1]
43
  std = 1 / math.sqrt(dim)
44
  tensor.uniform_(-std, std)
45
+ return tensor
46
 
47
 
48
  # feedforward
 
143
 
144
 
145
  class CrossAttention(nn.Module):
146
+ def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0., **kargs):
147
  super().__init__()
148
  inner_dim = dim_head * heads
149
  context_dim = default(context_dim, query_dim)
 
160
  nn.Dropout(dropout)
161
  )
162
 
163
+ def forward(self, x, context=None, mask=None, **kargs):
164
  h = self.heads
165
 
166
  q = self.to_q(x)
 
194
  return self.to_out(out)
195
 
196
 
197
+ class MaskGuidedSelfAttention(nn.Module):
198
+ def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0., obj_feat_dim=1024):
199
+ super().__init__()
200
+ #here context dim is for object features coming from image encoder
201
+ inner_dim = dim_head * heads
202
+ self.heads = heads
203
+
204
+ self.obj_feats_map = nn.Linear(obj_feat_dim, inner_dim)
205
+ self.to_v = nn.Linear(inner_dim, inner_dim, bias=False)
206
+
207
+ self.to_out = nn.Sequential(
208
+ nn.Linear(inner_dim, query_dim),
209
+ nn.Dropout(dropout)
210
+ )
211
+
212
+ self.scale = dim_head ** -0.5
213
+
214
+ def forward(self, x, context=None, mask=None, obj_mask=None, obj_feat=None):
215
+ _, _, ht, wd = obj_feat.shape
216
+ obj_feat = rearrange(obj_feat, 'b c h w -> b (h w) c').contiguous()
217
+ obj_feat = self.obj_feats_map(obj_feat)
218
+ v = self.to_v(obj_feat)
219
+ return self.to_out(v)
220
+
221
+
222
+
223
+
224
+
225
  class MemoryEfficientCrossAttention(nn.Module):
226
  # https://github.com/MatthieuTPHR/diffusers/blob/d80b531ff8060ec1ea982b65a1b8df70f73aa67c/src/diffusers/models/attention.py#L223
227
  def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.0):
 
274
  class BasicTransformerBlock(nn.Module):
275
  ATTENTION_MODES = {
276
  "softmax": CrossAttention, # vanilla attention
277
+ "softmax-xformers": MemoryEfficientCrossAttention,
278
+ "maskguided": MaskGuidedSelfAttention
279
  }
280
  def __init__(self, dim, n_heads, d_head, dropout=0., context_dim=None, gated_ff=True, checkpoint=True,
281
+ disable_self_attn=False, attn1_mode="softmax", obj_feat_dim=1024):
282
  super().__init__()
283
  attn_mode = "softmax-xformers" if XFORMERS_IS_AVAILBLE else "softmax"
284
  assert attn_mode in self.ATTENTION_MODES
285
  attn_cls = self.ATTENTION_MODES[attn_mode]
286
+ attn1_cls = self.ATTENTION_MODES[attn1_mode]
287
  self.disable_self_attn = disable_self_attn
288
+ self.attn1 = attn1_cls(query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout,
289
+ context_dim=context_dim if self.disable_self_attn else None, obj_feat_dim=obj_feat_dim) # is a self-attention if not self.disable_self_attn
290
  self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
291
  self.attn2 = attn_cls(query_dim=dim, context_dim=context_dim,
292
  heads=n_heads, dim_head=d_head, dropout=dropout) # is self-attn if context is none
 
295
  self.norm3 = nn.LayerNorm(dim)
296
  self.checkpoint = checkpoint
297
 
298
+ # self.ff_text_obj_feat = FeedForward(context_dim, dim_out=dim, mult=1, dropout=dropout, glu=gated_ff)
299
+
300
+ def forward(self, x, context=None, obj_mask=None, obj_feat=None):
301
+ if obj_mask is None:
302
+ # return self._forward(x, context, obj_mask, obj_feat)
303
+ return checkpoint(self._forward, (x, context), self.parameters(), self.checkpoint)
304
+ return checkpoint(self._forward, (x, context, obj_mask, obj_feat), self.parameters(), self.checkpoint)
305
 
306
+ def _forward(self, x, context=None, obj_mask=None, obj_feat=None):
307
+ x = self.attn1(self.norm1(x), context=context if self.disable_self_attn else None,
308
+ obj_mask=obj_mask, obj_feat=obj_feat) + x
309
  x = self.attn2(self.norm2(x), context=context) + x
310
  x = self.ff(self.norm3(x)) + x
311
  return x
 
323
  def __init__(self, in_channels, n_heads, d_head,
324
  depth=1, dropout=0., context_dim=None,
325
  disable_self_attn=False, use_linear=False,
326
+ use_checkpoint=True,attn1_mode='softmax',obj_feat_dim=None):
327
  super().__init__()
328
  if exists(context_dim) and not isinstance(context_dim, list):
329
  context_dim = [context_dim]
 
341
 
342
  self.transformer_blocks = nn.ModuleList(
343
  [BasicTransformerBlock(inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim[d],
344
+ disable_self_attn=disable_self_attn, checkpoint=use_checkpoint, attn1_mode=attn1_mode,
345
+ obj_feat_dim=obj_feat_dim)
346
  for d in range(depth)]
347
  )
348
  if not use_linear:
 
355
  self.proj_out = zero_module(nn.Linear(in_channels, inner_dim))
356
  self.use_linear = use_linear
357
 
358
+ def forward(self, x, context=None, obj_masks=None, obj_feats=None):
359
  # note: if no context is given, cross-attention defaults to self-attention
360
  if not isinstance(context, list):
361
  context = [context]
362
+ if not isinstance(obj_masks, list):
363
+ obj_masks = [obj_masks]
364
+ if not isinstance(obj_feats, list):
365
+ obj_feats = [obj_feats]
366
+
367
  b, c, h, w = x.shape
368
+ if obj_feats[0] is not None:
369
+ obj_feats = [torch.nn.functional.interpolate(ofe, [h,w]) for ofe in obj_feats]
370
+ obj_masks = [torch.nn.functional.interpolate(om, [h,w]) for om in obj_masks]
371
+
372
  x_in = x
373
  x = self.norm(x)
374
  if not self.use_linear:
 
377
  if self.use_linear:
378
  x = self.proj_in(x)
379
  for i, block in enumerate(self.transformer_blocks):
380
+ x = block(x, context=context[i], obj_mask=obj_masks[i], obj_feat=obj_feats[i])
381
  if self.use_linear:
382
  x = self.proj_out(x)
383
  x = rearrange(x, 'b (h w) c -> b c h w', h=h, w=w).contiguous()
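With these changes the first attention in each BasicTransformerBlock is chosen by attn1_mode; pair_diff.yaml sets attn_class: ['maskguided', 'maskguided', 'softmax', 'softmax'], so the two highest-resolution ControlNet levels replace self-attention with MaskGuidedSelfAttention, which projects the per-object appearance map (obj_feat) into token space instead of attending over x. A rough smoke test of that dispatch (shapes illustrative; assumes the repository is importable as-is):

    import torch
    from ldm.modules.attention import BasicTransformerBlock

    block = BasicTransformerBlock(dim=320, n_heads=8, d_head=40, context_dim=768,
                                  attn1_mode='maskguided', obj_feat_dim=1026)
    x        = torch.randn(2, 64 * 64, 320)   # flattened latent tokens
    context  = torch.randn(2, 77, 768)        # text conditioning
    obj_feat = torch.randn(2, 1026, 64, 64)   # per-object appearance map (structure + DINO + mask)
    obj_mask = torch.randn(2, 1, 64, 64)
    out = block(x, context=context, obj_mask=obj_mask, obj_feat=obj_feat)   # (2, 4096, 320)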
ldm/modules/diffusionmodules/openaimodel.py CHANGED
@@ -69,19 +69,31 @@ class TimestepBlock(nn.Module):
69
  Apply the module to `x` given `emb` timestep embeddings.
70
  """
71
 
 
 
 
 
 
 
 
 
 
 
72
 
73
- class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
74
  """
75
  A sequential module that passes timestep embeddings to the children that
76
  support it as an extra input.
77
  """
78
 
79
- def forward(self, x, emb, context=None, *args):
80
  for layer in self:
81
  if isinstance(layer, TimestepBlock):
82
  x = layer(x, emb)
83
  elif isinstance(layer, SpatialTransformer):
84
- x = layer(x, context)
 
 
85
  else:
86
  x = layer(x)
87
  return x
@@ -783,4 +795,4 @@ class UNetModel(nn.Module):
783
  if self.predict_codebook_ids:
784
  return self.id_predictor(h)
785
  else:
786
- return self.out(h)
 
69
  Apply the module to `x` given `emb` timestep embeddings.
70
  """
71
 
72
+ class TimestepBlockSpa(nn.Module):
73
+ """
74
+ Any module where forward() takes timestep embeddings as a second argument.
75
+ """
76
+
77
+ @abstractmethod
78
+ def forward(self, x, emb, obj_feat):
79
+ """
80
+ Apply the module to `x` given `emb` timestep embeddings.
81
+ """
82
 
83
+ class TimestepEmbedSequential(nn.Sequential, TimestepBlock, TimestepBlockSpa):
84
  """
85
  A sequential module that passes timestep embeddings to the children that
86
  support it as an extra input.
87
  """
88
 
89
+ def forward(self, x, emb, context=None, obj_feat=None,obj_masks=None, *args):
90
  for layer in self:
91
  if isinstance(layer, TimestepBlock):
92
  x = layer(x, emb)
93
  elif isinstance(layer, SpatialTransformer):
94
+ x = layer(x, context, obj_masks=obj_masks, obj_feats=obj_feat)
95
+ elif isinstance(layer, TimestepBlockSpa):
96
+ x = layer(x, emb, obj_feat)
97
  else:
98
  x = layer(x)
99
  return x
 
795
  if self.predict_codebook_ids:
796
  return self.id_predictor(h)
797
  else:
798
+ return self.out(h)
ldm/modules/diffusionmodules/util.py CHANGED
@@ -215,9 +215,10 @@ class SiLU(nn.Module):
215
 
216
 
217
  class GroupNorm32(nn.GroupNorm):
218
- def forward(self, x):
219
  return super().forward(x.float()).type(x.dtype)
220
 
 
221
  def conv_nd(dims, *args, **kwargs):
222
  """
223
  Create a 1D, 2D, or 3D convolution module.
 
215
 
216
 
217
  class GroupNorm32(nn.GroupNorm):
218
+ def forward(self, x, *args):
219
  return super().forward(x.float()).type(x.dtype)
220
 
221
+
222
  def conv_nd(dims, *args, **kwargs):
223
  """
224
  Create a 1D, 2D, or 3D convolution module.
ldm/modules/encoders/modules.py CHANGED
@@ -114,14 +114,14 @@ class FrozenCLIPEmbedder(AbstractEncoder):
114
  for param in self.parameters():
115
  param.requires_grad = False
116
 
117
- def forward(self, text):
118
  batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
119
  return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
120
  tokens = batch_encoding["input_ids"].to(self.device)
121
  outputs = self.transformer(input_ids=tokens, output_hidden_states=self.layer=="hidden")
122
- if self.layer == "last":
123
  z = outputs.last_hidden_state
124
- elif self.layer == "pooled":
125
  z = outputs.pooler_output[:, None, :]
126
  else:
127
  z = outputs.hidden_states[self.layer_idx]
 
114
  for param in self.parameters():
115
  param.requires_grad = False
116
 
117
+ def forward(self, text, layer='last'):
118
  batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
119
  return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
120
  tokens = batch_encoding["input_ids"].to(self.device)
121
  outputs = self.transformer(input_ids=tokens, output_hidden_states=self.layer=="hidden")
122
+ if layer == "last":
123
  z = outputs.last_hidden_state
124
+ elif layer == "pooled":
125
  z = outputs.pooler_output[:, None, :]
126
  else:
127
  z = outputs.hidden_states[self.layer_idx]
pair_diff_demo.py ADDED
@@ -0,0 +1,516 @@
1
+ import cv2
2
+ import einops
3
+ import gradio as gr
4
+ import numpy as np
5
+ import torch
6
+ import random
7
+ import os
8
+ import json
9
+ import datetime
10
+ from huggingface_hub import hf_hub_url, hf_hub_download
11
+
12
+ from pytorch_lightning import seed_everything
13
+ from annotator.util import resize_image, HWC3
14
+ from annotator.OneFormer import OneformerSegmenter
15
+ from cldm.model import create_model, load_state_dict
16
+ from cldm.ddim_hacked import DDIMSamplerSpaCFG
17
+ from ldm.models.autoencoder import DiagonalGaussianDistribution
18
+
19
+
20
+ SEGMENT_MODEL_DICT = {
21
+ 'Oneformer': OneformerSegmenter,
22
+ }
23
+
24
+ MASK_MODEL_DICT = {
25
+ 'Oneformer': OneformerSegmenter,
26
+ }
27
+
28
+ urls = {
29
+ 'shi-labs/oneformer_coco_swin_large': ['150_16_swin_l_oneformer_coco_100ep.pth'],
30
+ 'PAIR/PAIR-diffusion-sdv15-coco-finetune': ['model_e91.ckpt']
31
+ }
32
+
33
+ WTS_DICT = {
34
+
35
+ }
36
+
37
+ if os.path.exists('checkpoints') == False:
38
+ os.mkdir('checkpoints')
39
+ for repo in urls:
40
+ files = urls[repo]
41
+ for file in files:
42
+ url = hf_hub_url(repo, file)
43
+ name_ckp = url.split('/')[-1]
44
+
45
+ WTS_DICT[repo] = hf_hub_download(repo_id=repo, filename=file)
46
+
47
+
48
+ #main model
49
+ model = create_model('configs/pair_diff.yaml').cpu()
50
+ model.load_state_dict(load_state_dict(WTS_DICT['PAIR/PAIR-diffusion-sdv15-coco-finetune'], location='cuda'))
51
+
52
+ save_dir = 'results/'
53
+
54
+ model = model.cuda()
55
+ ddim_sampler = DDIMSamplerSpaCFG(model)
56
+ save_memory = False
57
+
58
+
59
+ class ImageComp:
60
+ def __init__(self, edit_operation):
61
+ self.input_img = None
62
+ self.input_pmask = None
63
+ self.input_segmask = None
64
+ self.input_mask = None
65
+ self.input_points = []
66
+ self.input_scale = 1
67
+
68
+ self.ref_img = None
69
+ self.ref_pmask = None
70
+ self.ref_segmask = None
71
+ self.ref_mask = None
72
+ self.ref_points = []
73
+ self.ref_scale = 1
74
+
75
+ self.multi_modal = False
76
+
77
+ self.H = None
78
+ self.W = None
79
+ self.kernel = np.ones((5, 5), np.uint8)
80
+ self.edit_operation = edit_operation
81
+ self.init_segmentation_model()
82
+ os.makedirs(save_dir, exist_ok=True)
83
+
84
+ self.base_prompt = 'A picture of {}'
85
+
86
+ def init_segmentation_model(self, mask_model='Oneformer', segment_model='Oneformer'):
87
+ self.segment_model_name = segment_model
88
+ self.mask_model_name = mask_model
89
+
90
+ self.segment_model = SEGMENT_MODEL_DICT[segment_model](WTS_DICT['shi-labs/oneformer_coco_swin_large'])
91
+
92
+ if mask_model == 'Oneformer' and segment_model == 'Oneformer':
93
+ self.mask_model_inp = self.segment_model
94
+ self.mask_model_ref = self.segment_model
95
+ else:
96
+ self.mask_model_inp = MASK_MODEL_DICT[mask_model]()
97
+ self.mask_model_ref = MASK_MODEL_DICT[mask_model]()
98
+
99
+ print(f"Segmentation Models initialized with {mask_model} as mask and {segment_model} as segment")
100
+
101
+ def init_input_canvas(self, img):
102
+
103
+ img = HWC3(img)
104
+ img = resize_image(img, 512)
105
+ if self.segment_model_name == 'Oneformer':
106
+ detected_seg = self.segment_model(img, 'semantic')
107
+ elif self.segment_model_name == 'SAM':
108
+ raise NotImplementedError
109
+
110
+ if self.mask_model_name == 'Oneformer':
111
+ detected_mask = self.mask_model_inp(img, 'panoptic')[0]
112
+ elif self.mask_model_name == 'SAM':
113
+ detected_mask = self.mask_model_inp(img)
114
+
115
+ self.input_points = []
116
+ self.input_img = img
117
+ self.input_pmask = detected_mask
118
+ self.input_segmask = detected_seg
119
+ self.H = img.shape[0]
120
+ self.W = img.shape[1]
121
+
122
+ return img
123
+
124
+ def init_ref_canvas(self, img):
125
+
126
+ img = HWC3(img)
127
+ img = resize_image(img, 512)
128
+ if self.segment_model_name == 'Oneformer':
129
+ detected_seg = self.segment_model(img, 'semantic')
130
+ elif self.segment_model_name == 'SAM':
131
+ raise NotImplementedError
132
+
133
+ if self.mask_model_name == 'Oneformer':
134
+ detected_mask = self.mask_model_ref(img, 'panoptic')[0]
135
+ elif self.mask_model_name == 'SAM':
136
+ detected_mask = self.mask_model_ref(img)
137
+
138
+ self.ref_points = []
139
+ print("Initialized ref", img.shape)
140
+ self.ref_img = img
141
+ self.ref_pmask = detected_mask
142
+ self.ref_segmask = detected_seg
143
+
144
+ return img
145
+
146
+ def select_input_object(self, evt: gr.SelectData):
147
+ idx = list(np.array(evt.index) * self.input_scale)
148
+ self.input_points.append(idx)
149
+ if self.mask_model_name == 'Oneformer':
150
+ mask = self._get_mask_from_panoptic(np.array(self.input_points), self.input_pmask)
151
+ else:
152
+ mask = self.mask_model_inp(self.input_img, self.input_points)
153
+
154
+ c_ids = self.input_segmask[np.array(self.input_points)[:,1], np.array(self.input_points)[:,0]]
155
+ unique_ids, counts = torch.unique(c_ids, return_counts=True)
156
+ c_id = int(unique_ids[torch.argmax(counts)].cpu().detach().numpy())
157
+ category = self.segment_model.metadata.stuff_classes[c_id]
158
+ # print(self.segment_model.metadata.stuff_classes)
159
+
160
+ self.input_mask = mask
161
+ mask = mask.cpu().numpy()
162
+ output = mask[:,:,None] * self.input_img + (1 - mask[:,:,None]) * self.input_img * 0.2
163
+ return output.astype(np.uint8), self.base_prompt.format(category)
164
+
165
+ def select_ref_object(self, evt: gr.SelectData):
166
+ idx = list(np.array(evt.index) * self.ref_scale)
167
+ self.ref_points.append(idx)
168
+ if self.mask_model_name == 'Oneformer':
169
+ mask = self._get_mask_from_panoptic(np.array(self.ref_points), self.ref_pmask)
170
+ else:
171
+ mask = self.mask_model_ref(self.ref_img, self.ref_points)
172
+ c_ids = self.ref_segmask[np.array(self.ref_points)[:,1], np.array(self.ref_points)[:,0]]
173
+ unique_ids, counts = torch.unique(c_ids, return_counts=True)
174
+ c_id = int(unique_ids[torch.argmax(counts)].cpu().detach().numpy())
175
+ category = self.segment_model.metadata.stuff_classes[c_id]
176
+ print("Category of reference object is:", category)
177
+
178
+ self.ref_mask = mask
179
+ mask = mask.cpu().numpy()
180
+ output = mask[:,:,None] * self.ref_img + (1 - mask[:,:,None]) * self.ref_img * 0.2
181
+ return output.astype(np.uint8)
182
+
183
+ def clear_points(self):
184
+ self.input_points = []
185
+ self.ref_points = []
186
+ zeros_inp = np.zeros(self.input_img.shape)
187
+ zeros_ref = np.zeros(self.ref_img.shape)
188
+ return zeros_inp, zeros_ref
189
+
190
+ def return_input_img(self):
191
+ return self.input_img
192
+
193
+
194
+ def _get_mask_from_panoptic(self, points, panoptic_mask):
195
+ panoptic_mask_ = panoptic_mask + 1
196
+ ids = panoptic_mask_[points[:,1], points[:,0]]
197
+ unique_ids, counts = torch.unique(ids, return_counts=True)
198
+ mask_id = unique_ids[torch.argmax(counts)]
199
+ final_mask = torch.zeros(panoptic_mask.shape).cuda()
200
+ final_mask[panoptic_mask_ == mask_id] = 1
201
+
202
+ return final_mask
203
+
204
+
205
+ def _process_mask(self, mask, panoptic_mask, segmask):
206
+ obj_class = mask * (segmask + 1)
207
+ unique_ids, counts = torch.unique(obj_class, return_counts=True)
208
+ obj_class = unique_ids[torch.argmax(counts[1:]) + 1] - 1
209
+ return mask, obj_class
210
+
211
+
212
+ def _edit_app(self, whole_ref):
213
+ """
214
+ Manipulates the panoptic mask of the input image to change its appearance
215
+ """
216
+ input_pmask = self.input_pmask
217
+ input_segmask = self.input_segmask
218
+
219
+ if whole_ref:
220
+ reference_mask = torch.ones(self.ref_pmask.shape).cuda()
221
+ else:
222
+ reference_mask, _ = self._process_mask(self.ref_mask, self.ref_pmask, self.ref_segmask)
223
+
224
+ edit_mask, _ = self._process_mask(self.input_mask, self.input_pmask, self.input_segmask)
225
+ # tmp = cv2.dilate(edit_mask.squeeze().cpu().numpy(), self.kernel, iterations = 2)
226
+ # region_mask = torch.tensor(tmp).cuda()
227
+ region_mask = edit_mask
228
+ ma = torch.max(input_pmask)
229
+
230
+ input_pmask[edit_mask == 1] = ma + 1
231
+ return reference_mask, input_pmask, input_segmask, region_mask, ma
232
+
233
+ def _add_object(self, input_mask, dilation_fac):
234
+ """
235
+ Manipulates the panoptic mask of the input image for adding objects
236
+ Args:
237
+ input_mask (numpy array): Region where the new object needs to be added
238
+ dilation_fac (float): Controls the edge-merging region when adding objects
239
+
240
+ """
241
+ input_pmask = self.input_pmask
242
+ input_segmask = self.input_segmask
243
+ reference_mask, obj_class = self._process_mask(self.ref_mask, self.ref_pmask, self.ref_segmask)
244
+
245
+ tmp = cv2.dilate(input_mask['mask'][:, :, 0], self.kernel, iterations = int(dilation_fac))
246
+ region = torch.tensor(tmp)
247
+ region_mask = torch.zeros_like(region).cuda()
248
+ region_mask[region > 127] = 1
249
+
250
+ mask_ = torch.tensor(input_mask['mask'][:, :, 0])
251
+ edit_mask = torch.zeros_like(mask_).cuda()
252
+ edit_mask[mask_ > 127] = 1
253
+ ma = torch.max(input_pmask)
254
+ input_pmask[edit_mask == 1] = ma + 1
255
+ print(obj_class)
256
+ input_segmask[edit_mask == 1] = obj_class.long()
257
+
258
+ return reference_mask, input_pmask, input_segmask, region_mask, ma
259
+
260
+ def _edit(self, input_mask, ref_mask, dilation_fac=1, whole_ref=False, inter=1):
261
+ """
262
+ Entry point for the appearance-editing and object-addition operations. The function manipulates the
263
+ appearance vectors and the structure based on user input
264
+ Args:
265
+ input_mask (numpy array): Region in the input image that needs to be edited
266
+ dilation_fac (float): Controls the edge-merging region when adding objects
267
+ whole_ref (bool): Whether the complete reference image should be used as the appearance source
268
+ inter (float): Interpolation factor between the reference appearance and the input appearance
269
+ """
270
+ input_img = (self.input_img/127.5 - 1)
271
+ input_img = torch.from_numpy(input_img.astype(np.float32)).cuda().unsqueeze(0).permute(0,3,1,2)
272
+
273
+ reference_img = (self.ref_img/127.5 - 1)
274
+ reference_img = torch.from_numpy(reference_img.astype(np.float32)).cuda().unsqueeze(0).permute(0,3,1,2)
275
+
276
+ if self.edit_operation == 'add_obj':
277
+ reference_mask, input_pmask, input_segmask, region_mask, ma = self._add_object(input_mask, dilation_fac)
278
+ elif self.edit_operation == 'edit_app':
279
+ reference_mask, input_pmask, input_segmask, region_mask, ma = self._edit_app(whole_ref)
280
+
281
+ # concat features
282
+ input_pmask = input_pmask.float().cuda().unsqueeze(0).unsqueeze(1)
283
+ _, mean_feat_inpt_conc, one_hot_inpt_conc, _ = model.get_appearance(model.appearance_net_conc, model.app_layer_conc, input_img, input_pmask, return_all=True)
284
+
285
+ reference_mask = reference_mask.float().cuda().unsqueeze(0).unsqueeze(1)
286
+ _, mean_feat_ref_conc, _, _ = model.get_appearance(model.appearance_net_conc, model.app_layer_conc, reference_img, reference_mask, return_all=True)
287
+
288
+ # if mean_feat_ref.shape[1] > 1:
289
+ if isinstance(mean_feat_inpt_conc, list):
290
+ appearance_conc = []
291
+ for i in range(len(mean_feat_inpt_conc)):
292
+ mean_feat_inpt_conc[i][:, ma + 1] = (1 - inter) * mean_feat_inpt_conc[i][:, ma + 1] + inter*mean_feat_ref_conc[i][:, 1]
293
+ splatted_feat_conc = torch.einsum('nmc, nmhw->nchw', mean_feat_inpt_conc[i], one_hot_inpt_conc)
294
+ splatted_feat_conc = torch.nn.functional.normalize(splatted_feat_conc)
295
+ splatted_feat_conc = torch.nn.functional.interpolate(splatted_feat_conc, (self.H//8, self.W//8))
296
+ appearance_conc.append(splatted_feat_conc)
297
+ appearance_conc = torch.cat(appearance_conc, dim=1)
298
+ else:
299
+ print("manipulating")
300
+ mean_feat_inpt_conc[:, ma + 1] = (1 - inter) * mean_feat_inpt_conc[:, ma + 1] + inter*mean_feat_ref_conc[:, 1]
301
+
302
+ splatted_feat_conc = torch.einsum('nmc, nmhw->nchw', mean_feat_inpt_conc, one_hot_inpt_conc)
303
+ appearance_conc = torch.nn.functional.normalize(splatted_feat_conc) # l2 normalize
304
+ appearance_conc = torch.nn.functional.interpolate(appearance_conc, (self.H//8, self.W//8))
305
+
306
+ #cross attention features
307
+ _, mean_feat_inpt_ca, one_hot_inpt_ca, _ = model.get_appearance(model.appearance_net_ca, model.app_layer_ca, input_img, input_pmask, return_all=True)
308
+
309
+ _, mean_feat_ref_ca, _, _ = model.get_appearance(model.appearance_net_ca, model.app_layer_ca, reference_img, reference_mask, return_all=True)
310
+
311
+ # if mean_feat_ref.shape[1] > 1:
312
+ if isinstance(mean_feat_inpt_ca, list):
313
+ appearance_ca = []
314
+ for i in range(len(mean_feat_inpt_ca)):
315
+ mean_feat_inpt_ca[i][:, ma + 1] = (1 - inter) * mean_feat_inpt_ca[i][:, ma + 1] + inter*mean_feat_ref_ca[i][:, 1]
316
+ splatted_feat_ca = torch.einsum('nmc, nmhw->nchw', mean_feat_inpt_ca[i], one_hot_inpt_ca)
317
+ splatted_feat_ca = torch.nn.functional.normalize(splatted_feat_ca)
318
+ splatted_feat_ca = torch.nn.functional.interpolate(splatted_feat_ca, (self.H//8, self.W//8))
319
+ appearance_ca.append(splatted_feat_ca)
320
+ else:
321
+ print("manipulating")
322
+ mean_feat_inpt_ca[:, ma + 1] = (1 - inter) * mean_feat_inpt_ca[:, ma + 1] + inter*mean_feat_ref_ca[:, 1]
323
+
324
+ splatted_feat_ca = torch.einsum('nmc, nmhw->nchw', mean_feat_inpt_ca, one_hot_inpt_ca)
325
+ appearance_ca = torch.nn.functional.normalize(splatted_feat_ca) # l2 normalize
326
+ appearance_ca = torch.nn.functional.interpolate(appearance_ca, (self.H//8, self.W//8))
327
+
328
+
329
+
330
+ input_segmask = ((input_segmask+1)/ 127.5 - 1.0).cuda().unsqueeze(0).unsqueeze(1)
331
+ structure = torch.nn.functional.interpolate(input_segmask, (self.H//8, self.W//8))
332
+
333
+
334
+ return structure, appearance_conc, appearance_ca, region_mask, input_img
335
+
336
+ def _edit_obj_var(self, input_mask, ignore_structure):
337
+ input_img = (self.input_img/127.5 - 1)
338
+ input_img = torch.from_numpy(input_img.astype(np.float32)).cuda().unsqueeze(0).permute(0,3,1,2)
339
+
340
+
341
+ input_pmask = self.input_pmask
342
+ input_segmask = self.input_segmask
343
+
344
+ ma = torch.max(input_pmask)
345
+ mask_ = torch.tensor(input_mask['mask'][:, :, 0])
346
+ edit_mask = torch.zeros_like(mask_).cuda()
347
+ edit_mask[mask_ > 127] = 1
348
+ tmp = edit_mask * (input_pmask + ma + 1)
349
+ if ignore_structure:
350
+ tmp = edit_mask
351
+
352
+ input_pmask = tmp * edit_mask + (1 - edit_mask) * input_pmask
353
+
354
+ input_pmask = input_pmask.float().cuda().unsqueeze(0).unsqueeze(1)
355
+
356
+ mask_ca_feat = self.input_pmask.float().cuda().unsqueeze(0).unsqueeze(1) if ignore_structure else input_pmask
357
+ print(torch.unique(mask_ca_feat))
358
+
359
+ appearance_conc,_,_,_ = model.get_appearance(model.appearance_net_conc, model.app_layer_conc, input_img, input_pmask, return_all=True)
360
+ appearance_ca = model.get_appearance(model.appearance_net_ca, model.app_layer_ca, input_img, mask_ca_feat)
361
+
362
+ appearance_conc = torch.nn.functional.interpolate(appearance_conc, (self.H//8, self.W//8))
363
+ appearance_ca = [torch.nn.functional.interpolate(ap, (self.H//8, self.W//8)) for ap in appearance_ca]
364
+
365
+ input_segmask = ((input_segmask+1)/ 127.5 - 1.0).cuda().unsqueeze(0).unsqueeze(1)
366
+ structure = torch.nn.functional.interpolate(input_segmask, (self.H//8, self.W//8))
367
+
368
+
369
+ tmp = input_mask['mask'][:, :, 0]
370
+ region = torch.tensor(tmp)
371
+ mask = torch.zeros_like(region).cuda()
372
+ mask[region > 127] = 1
373
+
374
+ return structure, appearance_conc, appearance_ca, mask, input_img
375
+
376
+ def get_caption(self, mask):
377
+ """
378
+ Generates a caption for the selected region from a fixed template
379
+ Args:
380
+ mask (numpy array): Region of the image for which the caption should be generated
381
+ """
382
+ mask = mask['mask'][:, :, 0]
383
+ region = torch.tensor(mask).cuda()
384
+ mask = torch.zeros_like(region)
385
+ mask[region > 127] = 1
386
+
387
+ if torch.sum(mask) == 0:
388
+ return ""
389
+
390
+ c_ids = self.input_segmask * mask
391
+ unique_ids, counts = torch.unique(c_ids, return_counts=True)
392
+ c_id = int(unique_ids[torch.argmax(counts[1:]) + 1].cpu().detach().numpy())
393
+ category = self.segment_model.metadata.stuff_classes[c_id]
394
+
395
+ return self.base_prompt.format(category)
396
+
397
+ def save_result(self, input_mask, prompt, a_prompt, n_prompt,
398
+ ddim_steps, scale_s, scale_f, scale_t, seed, dilation_fac=1,inter=1,
399
+ free_form_obj_var=False, ignore_structure=False):
400
+ """
401
+ Saves the current results along with all the metadata
402
+ """
403
+
404
+ meta_data = {}
405
+ meta_data['prompt'] = prompt
406
+ meta_data['a_prompt'] = a_prompt
407
+ meta_data['n_prompt'] = n_prompt
408
+ meta_data['seed'] = seed
409
+ meta_data['ddim_steps'] = ddim_steps
410
+ meta_data['scale_s'] = scale_s
411
+ meta_data['scale_f'] = scale_f
412
+ meta_data['scale_t'] = scale_t
413
+ meta_data['inter'] = inter
414
+ meta_data['dilation_fac'] = dilation_fac
415
+ meta_data['edit_operation'] = self.edit_operation
416
+
417
+ uuid = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
418
+ os.makedirs(f'{save_dir}/{uuid}')
419
+
420
+ with open(f'{save_dir}/{uuid}/meta.json', "w") as outfile:
421
+ json.dump(meta_data, outfile)
422
+ cv2.imwrite(f'{save_dir}/{uuid}/input.png', self.input_img[:,:,::-1])
423
+ cv2.imwrite(f'{save_dir}/{uuid}/ref.png', self.ref_img[:,:,::-1])
424
+ if self.ref_mask is not None:
425
+ cv2.imwrite(f'{save_dir}/{uuid}/ref_mask.png', self.ref_mask.cpu().squeeze().numpy() * 200)
426
+ for i in range(len(self.results)):
427
+ cv2.imwrite(f'{save_dir}/{uuid}/edit{i}.png', self.results[i][:,:,::-1])
428
+
429
+ if self.edit_operation == 'add_obj' or free_form_obj_var:
430
+ cv2.imwrite(f'{save_dir}/{uuid}/input_mask.png', input_mask['mask'] * 200)
431
+ else:
432
+ cv2.imwrite(f'{save_dir}/{uuid}/input_mask.png', self.input_mask.cpu().squeeze().numpy() * 200)
433
+
434
+ print("Saved results at", f'{save_dir}/{uuid}')
435
+
436
+
437
+ def process(self, input_mask, ref_mask, prompt, a_prompt, n_prompt,
438
+ num_samples, ddim_steps, guess_mode, strength,
439
+ scale_s, scale_f, scale_t, seed, eta, dilation_fac=1,masking=True,whole_ref=False,inter=1,
440
+ free_form_obj_var=False, ignore_structure=False):
441
+
442
+ print(prompt)
443
+ if free_form_obj_var:
444
+ print("Free form")
445
+ structure, appearance_conc, appearance_ca, mask, img = self._edit_obj_var(input_mask, ignore_structure)
446
+ else:
447
+ structure, appearance_conc, appearance_ca, mask, img = self._edit(input_mask, ref_mask, dilation_fac=dilation_fac,
448
+ whole_ref=whole_ref, inter=inter)
449
+
450
+ input_pmask = torch.nn.functional.interpolate(self.input_pmask.cuda().unsqueeze(0).unsqueeze(1).float(), (self.H//8, self.W//8))
451
+ input_pmask = input_pmask.to(memory_format=torch.contiguous_format)
452
+
453
+
454
+ if isinstance(appearance_ca, list):
455
+ null_appearance_ca = [torch.zeros(a.shape).cuda() for a in appearance_ca]
456
+ null_appearance_conc = torch.zeros(appearance_conc.shape).cuda()
457
+ null_structure = torch.zeros(structure.shape).cuda() - 1
458
+
459
+ null_control = [torch.cat([null_structure, napp, input_pmask * 0], dim=1) for napp in null_appearance_ca]
460
+ structure_control = [torch.cat([structure, napp, input_pmask], dim=1) for napp in null_appearance_ca]
461
+ full_control = [torch.cat([structure, napp, input_pmask], dim=1) for napp in appearance_ca]
462
+
463
+ null_control.append(torch.cat([null_structure, null_appearance_conc, null_structure * 0], dim=1))
464
+ structure_control.append(torch.cat([structure, null_appearance_conc, null_structure], dim=1))
465
+ full_control.append(torch.cat([structure, appearance_conc, input_pmask], dim=1))
466
+
467
+ null_control = [torch.cat([nc for _ in range(num_samples)], dim=0) for nc in null_control]
468
+ structure_control = [torch.cat([sc for _ in range(num_samples)], dim=0) for sc in structure_control]
469
+ full_control = [torch.cat([fc for _ in range(num_samples)], dim=0) for fc in full_control]
470
+
471
+ #Masking for local edit
472
+ if not masking:
473
+ mask, x0 = None, None
474
+ else:
475
+ x0 = model.encode_first_stage(img)
476
+ x0 = x0.sample() if isinstance(x0, DiagonalGaussianDistribution) else x0 # todo: check if we can set random number
477
+ x0 = x0 * model.scale_factor
478
+ mask = 1 - torch.tensor(mask).unsqueeze(0).unsqueeze(1).cuda()
479
+ mask = torch.nn.functional.interpolate(mask.float(), x0.shape[2:]).float()
480
+
481
+ if seed == -1:
482
+ seed = random.randint(0, 65535)
483
+ seed_everything(seed)
484
+
485
+ scale = [scale_s, scale_f, scale_t]
486
+ print(scale)
487
+ if save_memory:
488
+ model.low_vram_shift(is_diffusing=False)
489
+
490
+ uc_cross = model.get_learned_conditioning([n_prompt] * num_samples)
491
+ c_cross = model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)
492
+ cond = {"c_concat": [null_control], "c_crossattn": [c_cross]}
493
+ un_cond = {"c_concat": None if guess_mode else [null_control], "c_crossattn": [uc_cross]}
494
+ un_cond_struct = {"c_concat": None if guess_mode else [structure_control], "c_crossattn": [uc_cross]}
495
+ un_cond_struct_app = {"c_concat": None if guess_mode else [full_control], "c_crossattn": [uc_cross]}
496
+
497
+ shape = (4, self.H // 8, self.W // 8)
498
+
499
+ if save_memory:
500
+ model.low_vram_shift(is_diffusing=True)
501
+
502
+ model.control_scales = [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01
503
+ samples, _ = ddim_sampler.sample(ddim_steps, num_samples,
504
+ shape, cond, verbose=False, eta=eta,
505
+ unconditional_guidance_scale=scale, mask=mask, x0=x0,
506
+ unconditional_conditioning=[un_cond, un_cond_struct, un_cond_struct_app ])
507
+
508
+ if save_memory:
509
+ model.low_vram_shift(is_diffusing=False)
510
+
511
+ x_samples = (model.decode_first_stage(samples) + 1) * 127.5
512
+ x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c')).cpu().numpy().clip(0, 255).astype(np.uint8)
513
+
514
+ results = [x_samples[i] for i in range(num_samples)]
515
+ self.results = results
516
+ return [] + results
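pair_diff_demo.py defines the editing backend (segmentation, object selection, appearance manipulation, sampling) but contains no UI wiring of its own. A minimal sketch of how ImageComp could be driven from a Gradio interface is below; the component names, layout, and the sampler settings passed to process are illustrative assumptions, and the actual interface shipped with this Space (app.py) differs:

import gradio as gr

editor = ImageComp(edit_operation='edit_app')

with gr.Blocks() as demo:
    with gr.Row():
        inp = gr.Image(label='Input image')
        ref = gr.Image(label='Reference image')
    masked_inp = gr.Image(label='Selected input object')
    masked_ref = gr.Image(label='Selected reference object')
    prompt = gr.Textbox(label='Prompt')
    out = gr.Gallery(label='Edited results')
    seg_btn = gr.Button('Segment images')
    run_btn = gr.Button('Edit appearance')

    # Segment both images first; afterwards a click on each image selects the object.
    seg_btn.click(editor.init_input_canvas, inputs=inp, outputs=inp)
    seg_btn.click(editor.init_ref_canvas, inputs=ref, outputs=ref)
    inp.select(editor.select_input_object, inputs=None, outputs=[masked_inp, prompt])
    ref.select(editor.select_ref_object, inputs=None, outputs=masked_ref)

    # Example sampler settings: 1 sample, 20 DDIM steps, guidance scales (5, 5, 7.5), random seed.
    run_btn.click(lambda p: editor.process(None, None, p, '', '', 1, 20, False, 1.0,
                                           5.0, 5.0, 7.5, -1, 0.0),
                  inputs=prompt, outputs=out)

demo.launch()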
requirements.txt CHANGED
@@ -9,6 +9,7 @@ omegaconf==2.3.0
9
  open-clip-torch==2.0.2
10
  opencv-contrib-python==4.3.0.36
11
  opencv-python-headless==4.7.0.72
 
12
  prettytable==3.6.0
13
  pytorch-lightning==1.5.0
14
  safetensors==0.2.7
@@ -44,4 +45,4 @@ diffdist
44
  gdown
45
  huggingface_hub
46
  tqdm
47
- wget
 
9
  open-clip-torch==2.0.2
10
  opencv-contrib-python==4.3.0.36
11
  opencv-python-headless==4.7.0.72
12
+ pillow==9.4.0
13
  prettytable==3.6.0
14
  pytorch-lightning==1.5.0
15
  safetensors==0.2.7
 
45
  gdown
46
  huggingface_hub
47
  tqdm
48
+ wget