pawlo2013 committed
Commit 0b2b0ab · Parent: 27a598c

init commit

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.st filter=lfs diff=lfs merge=lfs -text
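
The added rule routes safetensors checkpoints (*.st) through Git LFS. Assuming Git LFS is installed, the same rule could have been appended with the standard CLI:

    git lfs track "*.st"
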
.gitignore ADDED
@@ -0,0 +1,40 @@
+#.model.pth
+
+
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+
+# vim swp files
+*.swp
+# caffe/pytorch model files
+*.pth
+
+*.pt
+# json
+*.json
+
+*.bin
+
+*.st
+
+.models/model-epoch_80.st
+.history/
+
+dataset/
+
+wandb/
+
+
+.vscode/
+https://github.com/higumax/sketchKeras-pytorch.git
+
+.startup.sh
+
+startup.sh
+
+
+
+
+
app.py ADDED
@@ -0,0 +1,136 @@
+import gradio as gr
+from PIL import Image
+import numpy as np
+from torchvision import transforms
+from load_model import sample
+import torch
+import random
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+device = "mps" if torch.backends.mps.is_available() else device
+
+image_size = 128
+upscale = False
+clicked = False
+
+
+transform = transforms.Compose(
+    [
+        transforms.Resize((image_size, image_size)),
+        transforms.ToTensor(),
+        transforms.Lambda(lambda t: (t * 2) - 1),
+    ]
+)
+
+
+def make_scribbles(sketch, scribbles):
+    # pixels equal to the grey background value (127/255) carry no scribble and are filled in from the sketch
+    sketch = transforms.Resize((image_size, image_size))(sketch)
+    scribbles = transforms.Resize((image_size, image_size))(scribbles)
+
+    grey_tensor = torch.tensor(0.49803922, device=device)
+
+    grey_tensor = grey_tensor.expand(3, image_size, image_size)
+
+    sketch = transforms.ToTensor()(sketch).to(device)
+    scribbles = transforms.ToTensor()(scribbles).to(device)
+
+    scribble_where_grey_mask = torch.eq(scribbles, grey_tensor)
+
+    merged = torch.where(scribble_where_grey_mask, sketch, scribbles)
+
+    return transforms.Lambda(lambda t: (t * 2) - 1)(sketch), transforms.Lambda(
+        lambda t: (t * 2) - 1
+    )(merged)
+
+
+def process_images(sketch, scribbles, sampling_steps, is_scribbles, seed_nr, upscale):
+    global clicked
+    clicked = True
+    w, h = sketch.size
+
+    if is_scribbles:
+        sketch, scribbles = make_scribbles(sketch, scribbles)
+
+    else:
+        sketch = transform(sketch.convert("RGB"))
+        scribbles = transform(scribbles.convert("RGB"))
+
+    if upscale:
+        output = transforms.Resize((h, w))(
+            sample(sketch, scribbles, sampling_steps, seed_nr)
+        )
+        clicked = False
+        return output
+    else:
+        output = sample(sketch, scribbles, sampling_steps, seed_nr)
+        clicked = False
+        return output
+
+
+theme = gr.themes.Monochrome()
+
+
+with gr.Blocks(theme=theme) as demo:
+    with gr.Row():
+        gr.Markdown(
+            "<h1 style='text-align: center; font-size: 30px;'>Image Inpainting with Conditional Diffusion by MedicAI</h1>"
+        )
+
+    with gr.Row():
+        with gr.Column():
+            sketch_input = gr.Image(type="pil", label="Sketch", height=500)
+        with gr.Column():
+            scribbles_input = gr.Image(type="pil", label="Scribbles", height=500)
+            info = gr.Markdown(
+                "<p style='text-align: center; font-size: 12px;'>"
+                "By default the scribbles are assumed to be merged with the sketch; if they appear on a grey background, check the box below. "
+                "</p>"
+            )
+            is_scribbles = gr.Checkbox(label="Is Scribbles", value=False)
+        with gr.Column():
+            output = gr.Image(type="pil", label="Output")
+            upscale_info = gr.Markdown(
+                "<p style='text-align: center; font-size: 12px;'>"
+                f"To stretch the downloadable output back to the input size, check the box below; the network's native output is {image_size}x{image_size}. "
+                "</p>"
+            )
+            upscale_button = gr.Checkbox(label="Stretch", value=False)
+    with gr.Row():
+        with gr.Column():
+            seed_slider = gr.Number(
+                label="Random Seed 🎲",
+                value=random.randint(
+                    1,
+                    1000,
+                ),
+            )
+
+        with gr.Column():
+            sampling_slider = gr.Slider(
+                minimum=1,
+                maximum=250,
+                step=1,
+                label="DDPM Sampling Steps 🔄",
+                value=50,
+            )
+
+    with gr.Row():
+        generate_button = gr.Button(value="Generate", interactive=not clicked)
+
+    generate_button.click(
+        process_images,
+        inputs=[
+            sketch_input,
+            scribbles_input,
+            sampling_slider,
+            is_scribbles,
+            seed_slider,
+            upscale_button,
+        ],
+        outputs=output,
+        show_progress=True,
+    )
+
+
+demo.launch(server_port=3000, max_threads=1)
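
The "Is Scribbles" path relies on make_scribbles treating the exact grey value 127/255 (≈ 0.49803922 after ToTensor) as empty background. A minimal sketch of preparing a compatible scribble canvas with PIL (file name hypothetical):

    from PIL import Image, ImageDraw

    # grey background that make_scribbles interprets as "no scribble"
    canvas = Image.new("RGB", (512, 512), (127, 127, 127))
    draw = ImageDraw.Draw(canvas)
    # a red colour hint; any non-grey pixels survive the merge
    draw.line((60, 60, 200, 200), fill=(255, 0, 0), width=8)
    canvas.save("scribbles.png")
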
load_model.py ADDED
@@ -0,0 +1,110 @@
+from models.structure.Unet_3 import Unet
+from diffusers import DDPMScheduler
+import torch
+import os
+import glob
+from tqdm import tqdm
+from torchvision import transforms
+import pathlib
+from torchvision.utils import save_image
+from safetensors.torch import load_model, save_model
+
+
+denoising_timesteps = 4000
+image_size = 128
+channels = 3
+
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+device = "mps" if torch.backends.mps.is_available() else device
+
+model = Unet(
+    dim=image_size,
+    channels=channels,
+    dim_mults=(1, 2, 4, 8),
+    use_convnext=False,
+).to(device)
+
+results_folder = pathlib.Path("models")
+
+
+checkpoint_files_st = glob.glob(str(results_folder / "model-epoch_*.st"))
+checkpoint_files_pt = glob.glob(str(results_folder / "model-epoch_*.pt"))
+
+if checkpoint_files_st:
+    # sort the matching safetensor checkpoints by modification time (newest first)
+    checkpoint_files_st.sort(key=lambda x: os.path.getmtime(x), reverse=True)
+    # select the newest file
+    checkpoint_files = checkpoint_files_st[0]
+    # checkpoint_files now points at the newest checkpoint
+    load_model(model, checkpoint_files)
+    model.eval()
+    print("Loaded model from checkpoint", checkpoint_files)
+
+elif checkpoint_files_pt:
+    # sort the matching PyTorch checkpoints by modification time (newest first)
+    checkpoint_files_pt.sort(key=lambda x: os.path.getmtime(x), reverse=True)
+    # select the newest file
+    checkpoint_files = checkpoint_files_pt[0]
+    # checkpoint_files now points at the newest checkpoint
+    checkpoint = torch.load(checkpoint_files, map_location=device)
+    model.load_state_dict(checkpoint["model_state_dict"])
+    epoch = checkpoint["epoch"]
+    model.eval()
+    print("Loaded model from checkpoint", checkpoint_files)
+
+    if not glob.glob(str(results_folder / "model-epoch_*.st")):  # Path.exists() would not expand the glob
+        save_model(model, results_folder / "model-epoch_{}.st".format(epoch))
+        print("Saved model as a safetensor in", results_folder)
+
+else:
+    raise Exception("No model files found in the folder.")
+
+
+def sample(sketch, scribbles, sampling_steps, seed_nr):
+    torch.manual_seed(seed_nr)
+
+    noise_scheduler = DDPMScheduler(
+        num_train_timesteps=denoising_timesteps, beta_schedule="squaredcos_cap_v2"
+    )
+    noise_scheduler.set_timesteps(sampling_steps, device=device)
+
+    sketch = sketch.to(device)
+    scribbles = scribbles.to(device)
+
+    sketch = sketch.unsqueeze(0)
+    scribbles = scribbles.unsqueeze(0)
+
+    with torch.no_grad():
+        b = sketch.shape[0]
+
+        noise_for_plain = torch.randn_like(sketch, device=device)
+
+        for i, t in tqdm(
+            enumerate(noise_scheduler.timesteps),
+            total=len(noise_scheduler.timesteps),
+        ):
+            noise_for_plain = noise_scheduler.scale_model_input(noise_for_plain, t).to(
+                device
+            )
+
+            time = t.expand(
+                b,
+            ).to(device)
+
+            plain_noise_pred = model(
+                x=noise_for_plain,
+                time=time,
+                implicit_conditioning=scribbles,
+                explicit_conditioning=sketch,
+            )
+
+            noise_for_plain = noise_scheduler.step(
+                plain_noise_pred,
+                t.long(),
+                noise_for_plain,
+            ).prev_sample
+
+    sample = torch.clamp((noise_for_plain / 2) + 0.5, 0, 1)
+
+    return transforms.ToPILImage()(sample[0].cpu())
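
sample expects [-1, 1]-normalized CHW tensors and returns a PIL image; note that importing the module loads the newest checkpoint from models/ as a side effect. A minimal usage sketch (file names hypothetical, assuming a checkpoint is present):

    from PIL import Image
    from torchvision import transforms
    from load_model import sample, image_size

    to_model = transforms.Compose(
        [
            transforms.Resize((image_size, image_size)),
            transforms.ToTensor(),
            transforms.Lambda(lambda t: (t * 2) - 1),  # map [0, 1] -> [-1, 1]
        ]
    )

    sketch = to_model(Image.open("sketch.png").convert("RGB"))
    scribbles = to_model(Image.open("scribbles.png").convert("RGB"))
    result = sample(sketch, scribbles, sampling_steps=50, seed_nr=0)
    result.save("inpainted.png")
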
models/structure/Advanced_Network_Helpers.py ADDED
@@ -0,0 +1,255 @@
+import math
+from inspect import isfunction
+from functools import partial
+import matplotlib.pyplot as plt
+from tqdm.auto import tqdm
+from einops import rearrange
+import torch
+from torch import nn, einsum
+import torch.nn.functional as F
+
+
+def exists(x):
+    return x is not None
+
+
+def default(val, d):
+    if exists(val):
+        return val
+    return d() if isfunction(d) else d
+
+
+class Residual(nn.Module):
+    def __init__(self, fn):
+        super().__init__()
+        self.fn = fn
+
+    def forward(self, x, *args, **kwargs):
+        return self.fn(x, *args, **kwargs) + x
+
+
+def Upsample(dim):
+    return nn.ConvTranspose2d(dim, dim, 4, 2, 1)
+
+
+def Downsample(dim):
+    return nn.Conv2d(dim, dim, 4, 2, 1)
+
+
+class SinusoidalPositionEmbeddings(nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, time):
+        device = time.device
+        half_dim = self.dim // 2
+        embeddings = math.log(10000) / (half_dim - 1)
+        embeddings = torch.exp(torch.arange(half_dim, device=device) * -embeddings)
+        embeddings = time[:, None] * embeddings[None, :]
+        embeddings = torch.cat((embeddings.sin(), embeddings.cos()), dim=-1)
+        return embeddings
+
+
+class Block(nn.Module):
+    def __init__(self, dim, dim_out, groups=8):
+        super().__init__()
+        self.proj = nn.Conv2d(dim, dim_out, 3, padding=1)
+        self.norm = nn.GroupNorm(groups, dim_out)
+        self.act = nn.SiLU()
+
+    def forward(self, x, scale_shift=None):
+        x = self.proj(x)
+        x = self.norm(x)
+
+        if exists(scale_shift):
+            scale, shift = scale_shift
+            x = x * (scale + 1) + shift
+
+        x = self.act(x)
+        return x
+
+
+class ResnetBlock(nn.Module):
+    """https://arxiv.org/abs/1512.03385"""
+
+    def __init__(self, dim, dim_out, *, time_emb_dim=None, groups=8):
+        super().__init__()
+        self.mlp = (
+            nn.Sequential(nn.SiLU(), nn.Linear(time_emb_dim, dim_out))
+            if exists(time_emb_dim)
+            else None
+        )
+
+        self.block1 = Block(dim, dim_out, groups=groups)
+        self.block2 = Block(dim_out, dim_out, groups=groups)
+        self.res_conv = nn.Conv2d(dim, dim_out, 1) if dim != dim_out else nn.Identity()
+
+    def forward(self, x, time_emb=None):
+        h = self.block1(x)
+
+        if exists(self.mlp) and exists(time_emb):
+            time_emb = self.mlp(time_emb)
+            h = rearrange(time_emb, "b c -> b c 1 1") + h
+
+        h = self.block2(h)
+        return h + self.res_conv(x)
+
+
+class ConvNextBlock(nn.Module):
+    """https://arxiv.org/abs/2201.03545"""
+
+    def __init__(self, dim, dim_out, *, time_emb_dim=None, mult=2, norm=True):
+        super().__init__()
+        self.mlp = (
+            nn.Sequential(nn.GELU(), nn.Linear(time_emb_dim, dim))
+            if exists(time_emb_dim)
+            else None
+        )
+
+        self.ds_conv = nn.Conv2d(dim, dim, 7, padding=3, groups=dim)
+
+        self.net = nn.Sequential(
+            nn.GroupNorm(1, dim) if norm else nn.Identity(),
+            nn.Conv2d(dim, dim_out * mult, 3, padding=1),
+            nn.GELU(),
+            nn.GroupNorm(1, dim_out * mult),
+            nn.Conv2d(dim_out * mult, dim_out, 3, padding=1),
+        )
+
+        self.res_conv = nn.Conv2d(dim, dim_out, 1) if dim != dim_out else nn.Identity()
+
+    def forward(self, x, time_emb=None):
+        h = self.ds_conv(x)
+
+        if exists(self.mlp) and exists(time_emb):
+            assert exists(time_emb), "time embedding must be passed in"
+            condition = self.mlp(time_emb)
+            h = h + rearrange(condition, "b c -> b c 1 1")
+
+        h = self.net(h)
+        return h + self.res_conv(x)
+
+
+class Attention(nn.Module):
+    def __init__(self, dim, heads=4, dim_head=32):
+        super().__init__()
+        self.scale = dim_head**-0.5
+        self.heads = heads
+        hidden_dim = dim_head * heads
+        self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias=False)
+        self.to_q = nn.Conv2d(dim, hidden_dim, 1, bias=False)
+        self.to_k = nn.Conv2d(dim, hidden_dim, 1, bias=False)
+        self.to_v = nn.Conv2d(dim, hidden_dim, 1, bias=False)
+        self.to_out = nn.Conv2d(hidden_dim, dim, 1)
+
+    def forward(self, x, cross_attend=None):
+        b, c, h, w = x.shape
+
+        if cross_attend is not None:
+            assert cross_attend.shape == x.shape
+
+            q_att = self.to_q(x)
+            k_att = self.to_k(cross_attend)
+            v_att = self.to_v(cross_attend)
+            q = rearrange(q_att, "b (h c) x y -> b h c (x y)", h=self.heads)
+            k = rearrange(k_att, "b (h c) x y -> b h c (x y)", h=self.heads)
+            v = rearrange(v_att, "b (h c) x y -> b h c (x y)", h=self.heads)
+        else:
+            qkv = self.to_qkv(x).chunk(3, dim=1)
+            q, k, v = map(
+                lambda t: rearrange(t, "b (h c) x y -> b h c (x y)", h=self.heads), qkv
+            )
+        q = q * self.scale
+
+        sim = einsum("b h d i, b h d j -> b h i j", q, k)
+        sim = sim - sim.amax(dim=-1, keepdim=True).detach()
+        attn = sim.softmax(dim=-1)
+
+        out = einsum("b h i j, b h d j -> b h i d", attn, v)
+        out = rearrange(out, "b h (x y) d -> b (h d) x y", x=h, y=w)
+
+        return self.to_out(out)
+
+
+class LinearCrossAttention(nn.Module):
+    def __init__(self, dim, heads=12, dim_head=128) -> None:
+        super().__init__()
+        self.scale = dim_head**-0.5
+        self.heads = heads
+        hidden_dim = dim_head * heads
+        self.to_kv = nn.Conv2d(dim, hidden_dim * 2, 1, bias=False)
+        self.to_q = nn.Conv2d(dim, hidden_dim, 1, bias=False)
+        self.out = nn.Conv2d(hidden_dim, dim, 1)
+
+    def forward(self, x, cross_attend):
+        b, c, h, w = x.shape
+        q = self.to_q(x)
+        k, v = self.to_kv(cross_attend).chunk(2, dim=1)
+        q = rearrange(q, "b (h c) x y -> b h c (x y)", h=self.heads)
+        k = rearrange(k, "b (h c) x y -> b h c (x y)", h=self.heads)
+        v = rearrange(v, "b (h c) x y -> b h c (x y)", h=self.heads)
+        q = q * self.scale
+        sim = einsum("b h d i, b h d j -> b h i j", q, k)
+        sim = sim - sim.amax(dim=-1, keepdim=True).detach()
+        attn = sim.softmax(dim=-1)
+        out = einsum("b h i j, b h d j -> b h i d", attn, v)
+        out = rearrange(out, "b h (x y) d -> b (h d) x y", x=h, y=w)
+        return self.out(out)
+
+
+class LinearAttention(nn.Module):
+    def __init__(self, dim, heads=4, dim_head=32):
+        super().__init__()
+        self.scale = dim_head**-0.5
+        self.heads = heads
+        hidden_dim = dim_head * heads
+        self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias=False)
+        self.to_q = nn.Conv2d(dim, hidden_dim, 1, bias=False)
+        self.to_k = nn.Conv2d(dim, hidden_dim, 1, bias=False)
+        self.to_v = nn.Conv2d(dim, hidden_dim, 1, bias=False)
+        self.to_out = nn.Sequential(nn.Conv2d(hidden_dim, dim, 1), nn.GroupNorm(1, dim))
+
+    def forward(self, x, cross_attend=None):
+        b, c, h, w = x.shape
+        if cross_attend is not None:
+            assert (
+                cross_attend.shape == x.shape
+            ), f"cross_attend must have the same shape as x; got {cross_attend.shape} and {x.shape}"
+
+            q_att = self.to_q(x)
+            k_att = self.to_k(cross_attend)
+            v_att = self.to_v(cross_attend)
+            q = rearrange(q_att, "b (h c) x y -> b h c (x y)", h=self.heads)
+            k = rearrange(k_att, "b (h c) x y -> b h c (x y)", h=self.heads)
+            v = rearrange(v_att, "b (h c) x y -> b h c (x y)", h=self.heads)
+
+        else:
+            qkv = self.to_qkv(x).chunk(3, dim=1)
+            q, k, v = map(
+                lambda t: rearrange(t, "b (h c) x y -> b h c (x y)", h=self.heads), qkv
+            )
+        # linear attention: softmax q over the feature dimension (softmax of q^T along its last dim)
+        q = q.softmax(dim=-2)
+        # softmax k over the spatial dimension
+        k = k.softmax(dim=-1)
+        # scale the queries
+        q = q * self.scale
+        # aggregate k and v first to form a (feature x feature) context
+        context = torch.einsum("b h d n, b h e n -> b h d e", k, v)
+        # project the context back through q
+        out = torch.einsum("b h d e, b h d n -> b h e n", context, q)
+        # restore the (b, c, h, w) layout expected by the conv output head
+        out = rearrange(out, "b h c (x y) -> b (h c) x y", h=self.heads, x=h, y=w)
+        return self.to_out(out)
+
+
+class PreNorm(nn.Module):
+    def __init__(self, dim, fn):
+        super().__init__()
+        self.fn = fn
+        self.norm = nn.GroupNorm(1, dim)
+
+    def forward(self, x, *args, **kwargs):
+        x = self.norm(x)
+        return self.fn(x, *args, **kwargs)
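
All attention modules in this file are shape-preserving over (batch, dim, height, width) feature maps; with this file's defaults, LinearCrossAttention uses 12 heads of width 128. A quick shape sanity check (dimensions chosen arbitrarily):

    import torch
    from models.structure.Advanced_Network_Helpers import LinearAttention, LinearCrossAttention

    x = torch.randn(1, 64, 16, 16)     # (batch, dim, height, width)
    cond = torch.randn(1, 64, 16, 16)  # conditioning features, same shape

    print(LinearAttention(dim=64)(x).shape)             # torch.Size([1, 64, 16, 16])
    print(LinearCrossAttention(dim=64)(x, cond).shape)  # torch.Size([1, 64, 16, 16])
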
models/structure/Advanced_Network_Helpers_2.py ADDED
@@ -0,0 +1,232 @@
+import math
+from inspect import isfunction
+from functools import partial
+import matplotlib.pyplot as plt
+from tqdm.auto import tqdm
+from einops import rearrange
+import torch
+from torch import nn, einsum
+import torch.nn.functional as F
+
+
+def exists(x):
+    return x is not None
+
+
+def default(val, d):
+    if exists(val):
+        return val
+    return d() if isfunction(d) else d
+
+
+class Residual(nn.Module):
+    def __init__(self, fn):
+        super().__init__()
+        self.fn = fn
+
+    def forward(self, x, *args, **kwargs):
+        return self.fn(x, *args, **kwargs) + x
+
+
+def Upsample(dim):
+    return nn.ConvTranspose2d(dim, dim, 4, 2, 1)
+
+
+def Downsample(dim):
+    return nn.Conv2d(dim, dim, 4, 2, 1)
+
+
+class SinusoidalPositionEmbeddings(nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, time):
+        device = time.device
+        half_dim = self.dim // 2
+        embeddings = math.log(10000) / (half_dim - 1)
+        embeddings = torch.exp(torch.arange(half_dim, device=device) * -embeddings)
+        embeddings = time[:, None] * embeddings[None, :]
+        embeddings = torch.cat((embeddings.sin(), embeddings.cos()), dim=-1)
+        return embeddings
+
+
+class Block(nn.Module):
+    def __init__(self, dim, dim_out, groups=8):
+        super().__init__()
+        self.proj = nn.Conv2d(dim, dim_out, 3, padding=1)
+        self.norm = nn.GroupNorm(groups, dim_out)
+        self.act = nn.SiLU()
+
+    def forward(self, x, scale_shift=None):
+        x = self.proj(x)
+        x = self.norm(x)
+
+        if exists(scale_shift):
+            scale, shift = scale_shift
+            x = x * (scale + 1) + shift
+
+        x = self.act(x)
+        return x
+
+
+class ResnetBlock(nn.Module):
+    """https://arxiv.org/abs/1512.03385"""
+
+    def __init__(self, dim, dim_out, *, time_emb_dim=None, groups=8):
+        super().__init__()
+        self.mlp = (
+            nn.Sequential(nn.SiLU(), nn.Linear(time_emb_dim, dim_out))
+            if exists(time_emb_dim)
+            else None
+        )
+
+        self.block1 = Block(dim, dim_out, groups=groups)
+        self.block2 = Block(dim_out, dim_out, groups=groups)
+        self.res_conv = nn.Conv2d(dim, dim_out, 1) if dim != dim_out else nn.Identity()
+
+    def forward(self, x, time_emb=None):
+        h = self.block1(x)
+
+        if exists(self.mlp) and exists(time_emb):
+            time_emb = self.mlp(time_emb)
+            h = rearrange(time_emb, "b c -> b c 1 1") + h
+
+        h = self.block2(h)
+        return h + self.res_conv(x)
+
+
+class ConvNextBlock(nn.Module):
+    """https://arxiv.org/abs/2201.03545"""
+
+    def __init__(self, dim, dim_out, *, time_emb_dim=None, mult=2, norm=True):
+        super().__init__()
+        self.mlp = (
+            nn.Sequential(nn.GELU(), nn.Linear(time_emb_dim, dim))
+            if exists(time_emb_dim)
+            else None
+        )
+
+        self.ds_conv = nn.Conv2d(dim, dim, 7, padding=3, groups=dim)
+
+        self.net = nn.Sequential(
+            nn.GroupNorm(1, dim) if norm else nn.Identity(),
+            nn.Conv2d(dim, dim_out * mult, 3, padding=1),
+            nn.GELU(),
+            nn.GroupNorm(1, dim_out * mult),
+            nn.Conv2d(dim_out * mult, dim_out, 3, padding=1),
+        )
+
+        self.res_conv = nn.Conv2d(dim, dim_out, 1) if dim != dim_out else nn.Identity()
+
+    def forward(self, x, time_emb=None):
+        h = self.ds_conv(x)
+
+        if exists(self.mlp) and exists(time_emb):
+            assert exists(time_emb), "time embedding must be passed in"
+            condition = self.mlp(time_emb)
+            h = h + rearrange(condition, "b c -> b c 1 1")
+
+        h = self.net(h)
+        return h + self.res_conv(x)
+
+
+class Attention(nn.Module):
+    def __init__(self, dim, heads=4, dim_head=32):
+        super().__init__()
+        self.scale = dim_head**-0.5
+        self.heads = heads
+        hidden_dim = dim_head * heads
+        self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias=False)
+        self.to_q = nn.Conv2d(dim, hidden_dim, 1, bias=False)
+        self.to_k = nn.Conv2d(dim, hidden_dim, 1, bias=False)
+        self.to_v = nn.Conv2d(dim, hidden_dim, 1, bias=False)
+        self.to_out = nn.Conv2d(hidden_dim, dim, 1)
+
+    def forward(self, x):
+        b, c, h, w = x.shape
+
+        qkv = self.to_qkv(x).chunk(3, dim=1)
+        q, k, v = map(
+            lambda t: rearrange(t, "b (h c) x y -> b h c (x y)", h=self.heads), qkv
+        )
+        q = q * self.scale
+
+        sim = einsum("b h d i, b h d j -> b h i j", q, k)
+        sim = sim - sim.amax(dim=-1, keepdim=True).detach()
+        attn = sim.softmax(dim=-1)
+
+        out = einsum("b h i j, b h d j -> b h i d", attn, v)
+        out = rearrange(out, "b h (x y) d -> b (h d) x y", x=h, y=w)
+
+        return self.to_out(out)
+
+
+class LinearCrossAttention(nn.Module):
+    def __init__(self, dim, heads=4, dim_head=32) -> None:
+        super().__init__()
+        self.scale = dim_head**-0.5
+        self.heads = heads
+        hidden_dim = dim_head * heads
+        self.to_kv = nn.Conv2d(dim, hidden_dim * 2, 1, bias=False)
+        self.to_q = nn.Conv2d(dim, hidden_dim, 1, bias=False)
+        self.out = nn.Conv2d(hidden_dim, dim, 1)
+
+    def forward(self, x, cross_attend):
+        b, c, h, w = x.shape
+        q = self.to_q(x)
+        k, v = self.to_kv(cross_attend).chunk(2, dim=1)
+        q = rearrange(q, "b (h c) x y -> b h c (x y)", h=self.heads)
+        k = rearrange(k, "b (h c) x y -> b h c (x y)", h=self.heads)
+        v = rearrange(v, "b (h c) x y -> b h c (x y)", h=self.heads)
+        q = q * self.scale
+        sim = einsum("b h d i, b h d j -> b h i j", q, k)
+        sim = sim - sim.amax(dim=-1, keepdim=True).detach()
+        attn = sim.softmax(dim=-1)
+        out = einsum("b h i j, b h d j -> b h i d", attn, v)
+        out = rearrange(out, "b h (x y) d -> b (h d) x y", x=h, y=w)
+        return self.out(out)
+
+
+class LinearAttention(nn.Module):
+    def __init__(self, dim, heads=4, dim_head=32):
+        super().__init__()
+        self.scale = dim_head**-0.5
+        self.heads = heads
+        hidden_dim = dim_head * heads
+        self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias=False)
+        self.to_q = nn.Conv2d(dim, hidden_dim, 1, bias=False)
+        self.to_k = nn.Conv2d(dim, hidden_dim, 1, bias=False)
+        self.to_v = nn.Conv2d(dim, hidden_dim, 1, bias=False)
+        self.to_out = nn.Sequential(nn.Conv2d(hidden_dim, dim, 1), nn.GroupNorm(1, dim))
+
+    def forward(self, x):
+        b, c, h, w = x.shape
+        qkv = self.to_qkv(x).chunk(3, dim=1)
+        q, k, v = map(
+            lambda t: rearrange(t, "b (h c) x y -> b h c (x y)", h=self.heads), qkv
+        )
+        # linear attention: softmax q over the feature dimension (softmax of q^T along its last dim)
+        q = q.softmax(dim=-2)
+        # softmax k over the spatial dimension
+        k = k.softmax(dim=-1)
+        # scale the queries
+        q = q * self.scale
+        # aggregate k and v first to form a (feature x feature) context
+        context = torch.einsum("b h d n, b h e n -> b h d e", k, v)
+        # project the context back through q
+        out = torch.einsum("b h d e, b h d n -> b h e n", context, q)
+        # restore the (b, c, h, w) layout expected by the conv output head
+        out = rearrange(out, "b h c (x y) -> b (h c) x y", h=self.heads, x=h, y=w)
+        return self.to_out(out)
+
+
+class PreNorm(nn.Module):
+    def __init__(self, dim, fn):
+        super().__init__()
+        self.fn = fn
+        self.norm = nn.GroupNorm(1, dim)
+
+    def forward(self, x, *args, **kwargs):
+        x = self.norm(x)
+        return self.fn(x, *args, **kwargs)
models/structure/Advanced_Network_Helpers_3.py ADDED
@@ -0,0 +1,232 @@
+import math
+from inspect import isfunction
+from functools import partial
+import matplotlib.pyplot as plt
+from tqdm.auto import tqdm
+from einops import rearrange
+import torch
+from torch import nn, einsum
+import torch.nn.functional as F
+
+
+def exists(x):
+    return x is not None
+
+
+def default(val, d):
+    if exists(val):
+        return val
+    return d() if isfunction(d) else d
+
+
+class Residual(nn.Module):
+    def __init__(self, fn):
+        super().__init__()
+        self.fn = fn
+
+    def forward(self, x, *args, **kwargs):
+        return self.fn(x, *args, **kwargs) + x
+
+
+def Upsample(dim):
+    return nn.ConvTranspose2d(dim, dim, 4, 2, 1)
+
+
+def Downsample(dim):
+    return nn.Conv2d(dim, dim, 4, 2, 1)
+
+
+class SinusoidalPositionEmbeddings(nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, time):
+        device = time.device
+        half_dim = self.dim // 2
+        embeddings = math.log(10000) / (half_dim - 1)
+        embeddings = torch.exp(torch.arange(half_dim, device=device) * -embeddings)
+        embeddings = time[:, None] * embeddings[None, :]
+        embeddings = torch.cat((embeddings.sin(), embeddings.cos()), dim=-1)
+        return embeddings
+
+
+class Block(nn.Module):
+    def __init__(self, dim, dim_out, groups=8):
+        super().__init__()
+        self.proj = nn.Conv2d(dim, dim_out, 3, padding=1)
+        self.norm = nn.GroupNorm(groups, dim_out)
+        self.act = nn.SiLU()
+
+    def forward(self, x, scale_shift=None):
+        x = self.proj(x)
+        x = self.norm(x)
+
+        if exists(scale_shift):
+            scale, shift = scale_shift
+            x = x * (scale + 1) + shift
+
+        x = self.act(x)
+        return x
+
+
+class ResnetBlock(nn.Module):
+    """https://arxiv.org/abs/1512.03385"""
+
+    def __init__(self, dim, dim_out, *, time_emb_dim=None, groups=8):
+        super().__init__()
+        self.mlp = (
+            nn.Sequential(nn.SiLU(), nn.Linear(time_emb_dim, dim_out))
+            if exists(time_emb_dim)
+            else None
+        )
+
+        self.block1 = Block(dim, dim_out, groups=groups)
+        self.block2 = Block(dim_out, dim_out, groups=groups)
+        self.res_conv = nn.Conv2d(dim, dim_out, 1) if dim != dim_out else nn.Identity()
+
+    def forward(self, x, time_emb=None):
+        h = self.block1(x)
+
+        if exists(self.mlp) and exists(time_emb):
+            time_emb = self.mlp(time_emb)
+            h = rearrange(time_emb, "b c -> b c 1 1") + h
+
+        h = self.block2(h)
+        return h + self.res_conv(x)
+
+
+class ConvNextBlock(nn.Module):
+    """https://arxiv.org/abs/2201.03545"""
+
+    def __init__(self, dim, dim_out, *, time_emb_dim=None, mult=2, norm=True):
+        super().__init__()
+        self.mlp = (
+            nn.Sequential(nn.GELU(), nn.Linear(time_emb_dim, dim))
+            if exists(time_emb_dim)
+            else None
+        )
+
+        self.ds_conv = nn.Conv2d(dim, dim, 7, padding=3, groups=dim)
+
+        self.net = nn.Sequential(
+            nn.GroupNorm(1, dim) if norm else nn.Identity(),
+            nn.Conv2d(dim, dim_out * mult, 3, padding=1),
+            nn.GELU(),
+            nn.GroupNorm(1, dim_out * mult),
+            nn.Conv2d(dim_out * mult, dim_out, 3, padding=1),
+        )
+
+        self.res_conv = nn.Conv2d(dim, dim_out, 1) if dim != dim_out else nn.Identity()
+
+    def forward(self, x, time_emb=None):
+        h = self.ds_conv(x)
+
+        if exists(self.mlp) and exists(time_emb):
+            assert exists(time_emb), "time embedding must be passed in"
+            condition = self.mlp(time_emb)
+            h = h + rearrange(condition, "b c -> b c 1 1")
+
+        h = self.net(h)
+        return h + self.res_conv(x)
+
+
+class Attention(nn.Module):
+    def __init__(self, dim, heads=4, dim_head=32):
+        super().__init__()
+        self.scale = dim_head**-0.5
+        self.heads = heads
+        hidden_dim = dim_head * heads
+        self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias=False)
+        self.to_q = nn.Conv2d(dim, hidden_dim, 1, bias=False)
+        self.to_k = nn.Conv2d(dim, hidden_dim, 1, bias=False)
+        self.to_v = nn.Conv2d(dim, hidden_dim, 1, bias=False)
+        self.to_out = nn.Conv2d(hidden_dim, dim, 1)
+
+    def forward(self, x):
+        b, c, h, w = x.shape
+
+        qkv = self.to_qkv(x).chunk(3, dim=1)
+        q, k, v = map(
+            lambda t: rearrange(t, "b (h c) x y -> b h c (x y)", h=self.heads), qkv
+        )
+        q = q * self.scale
+
+        sim = einsum("b h d i, b h d j -> b h i j", q, k)
+        sim = sim - sim.amax(dim=-1, keepdim=True).detach()
+        attn = sim.softmax(dim=-1)
+
+        out = einsum("b h i j, b h d j -> b h i d", attn, v)
+        out = rearrange(out, "b h (x y) d -> b (h d) x y", x=h, y=w)
+
+        return self.to_out(out)
+
+
+class LinearCrossAttention(nn.Module):
+    def __init__(self, dim, heads=4, dim_head=32) -> None:
+        super().__init__()
+        self.scale = dim_head**-0.5
+        self.heads = heads
+        hidden_dim = dim_head * heads
+        self.to_kv = nn.Conv2d(dim, hidden_dim * 2, 1, bias=False)
+        self.to_q = nn.Conv2d(dim, hidden_dim, 1, bias=False)
+        self.out = nn.Conv2d(hidden_dim, dim, 1)
+
+    def forward(self, x, cross_attend):
+        b, c, h, w = x.shape
+        q = self.to_q(x)
+        k, v = self.to_kv(cross_attend).chunk(2, dim=1)
+        q = rearrange(q, "b (h c) x y -> b h c (x y)", h=self.heads)
+        k = rearrange(k, "b (h c) x y -> b h c (x y)", h=self.heads)
+        v = rearrange(v, "b (h c) x y -> b h c (x y)", h=self.heads)
+        q = q * self.scale
+        sim = einsum("b h d i, b h d j -> b h i j", q, k)
+        sim = sim - sim.amax(dim=-1, keepdim=True).detach()
+        attn = sim.softmax(dim=-1)
+        out = einsum("b h i j, b h d j -> b h i d", attn, v)
+        out = rearrange(out, "b h (x y) d -> b (h d) x y", x=h, y=w)
+        return self.out(out)
+
+
+class LinearAttention(nn.Module):
+    def __init__(self, dim, heads=4, dim_head=32):
+        super().__init__()
+        self.scale = dim_head**-0.5
+        self.heads = heads
+        hidden_dim = dim_head * heads
+        self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias=False)
+        self.to_q = nn.Conv2d(dim, hidden_dim, 1, bias=False)
+        self.to_k = nn.Conv2d(dim, hidden_dim, 1, bias=False)
+        self.to_v = nn.Conv2d(dim, hidden_dim, 1, bias=False)
+        self.to_out = nn.Sequential(nn.Conv2d(hidden_dim, dim, 1), nn.GroupNorm(1, dim))
+
+    def forward(self, x):
+        b, c, h, w = x.shape
+        qkv = self.to_qkv(x).chunk(3, dim=1)
+        q, k, v = map(
+            lambda t: rearrange(t, "b (h c) x y -> b h c (x y)", h=self.heads), qkv
+        )
+        # linear attention: softmax q over the feature dimension (softmax of q^T along its last dim)
+        q = q.softmax(dim=-2)
+        # softmax k over the spatial dimension
+        k = k.softmax(dim=-1)
+        # scale the queries
+        q = q * self.scale
+        # aggregate k and v first to form a (feature x feature) context
+        context = torch.einsum("b h d n, b h e n -> b h d e", k, v)
+        # project the context back through q
+        out = torch.einsum("b h d e, b h d n -> b h e n", context, q)
+        # restore the (b, c, h, w) layout expected by the conv output head
+        out = rearrange(out, "b h c (x y) -> b (h c) x y", h=self.heads, x=h, y=w)
+        return self.to_out(out)
+
+
+class PreNorm(nn.Module):
+    def __init__(self, dim, fn):
+        super().__init__()
+        self.fn = fn
+        self.norm = nn.GroupNorm(1, dim)
+
+    def forward(self, x, *args, **kwargs):
+        x = self.norm(x)
+        return self.fn(x, *args, **kwargs)
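
SinusoidalPositionEmbeddings maps a batch of scalar diffusion timesteps to dim-wide vectors, half sine and half cosine features. For instance:

    import torch
    from models.structure.Advanced_Network_Helpers_3 import SinusoidalPositionEmbeddings

    emb = SinusoidalPositionEmbeddings(dim=128)
    t = torch.tensor([0, 10, 3999])  # diffusion timesteps
    print(emb(t).shape)              # torch.Size([3, 128])
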
models/structure/Unet.py ADDED
@@ -0,0 +1,152 @@
+import math
+from inspect import isfunction
+from functools import partial
+import matplotlib.pyplot as plt
+from tqdm.auto import tqdm
+from einops import rearrange
+import torch
+from torch import nn, einsum
+import torch.nn.functional as F
+from .Advanced_Network_Helpers import *
+
+
+class Unet(nn.Module):
+    def __init__(
+        self,
+        dim,
+        init_dim=None,
+        out_dim=None,
+        dim_mults=(1, 2, 4, 8),
+        channels=3,
+        with_time_emb=True,
+        resnet_block_groups=8,
+        use_convnext=True,
+        convnext_mult=2,
+    ):
+        super().__init__()
+
+        # determine dimensions
+        self.channels = channels  # since we are concatenating the images and the conditionings along the channel dimension
+
+        init_dim = default(init_dim, dim // 3 * 2)
+        self.init_conv = nn.Conv2d(self.channels * 2, init_dim, 7, padding=3)
+        self.conditioning_init = nn.Conv2d(self.channels * 2, init_dim, 7, padding=3)
+        dims = [init_dim, *map(lambda m: dim * m, dim_mults)]
+        in_out = list(zip(dims[:-1], dims[1:]))
+        self.in_out = in_out
+
+        if use_convnext:
+            block_klass = partial(ConvNextBlock, mult=convnext_mult)
+        else:
+            block_klass = partial(ResnetBlock, groups=resnet_block_groups)
+
+        # time embeddings
+        if with_time_emb:
+            time_dim = dim * 4
+            self.time_mlp = nn.Sequential(
+                SinusoidalPositionEmbeddings(dim),
+                nn.Linear(dim, time_dim),
+                nn.GELU(),
+                nn.Linear(time_dim, time_dim),
+            )
+        else:
+            time_dim = None
+            self.time_mlp = None
+
+        # layers
+        self.downs = nn.ModuleList([])
+        self.ups = nn.ModuleList([])
+        self.conditioning_encoder = nn.ModuleList([])
+        num_resolutions = len(in_out)
+        self.num_resolutions = num_resolutions
+
+        # conditioning encoder
+        for ind, (dim_in, dim_out) in enumerate(in_out):
+            is_last = ind >= (num_resolutions - 1)
+
+            self.conditioning_encoder.append(
+                nn.ModuleList(
+                    [
+                        block_klass(dim_in, dim_out),
+                        Residual(PreNorm(dim_out, LinearAttention(dim_out))),
+                        Downsample(dim_out) if not is_last else nn.Identity(),
+                    ]
+                )
+            )
+
+        for ind, (dim_in, dim_out) in enumerate(in_out):
+            is_last = ind >= (num_resolutions - 1)
+
+            self.downs.append(
+                nn.ModuleList(
+                    [
+                        block_klass(dim_in, dim_out, time_emb_dim=time_dim),
+                        block_klass(dim_out, dim_out, time_emb_dim=time_dim),
+                        Residual(PreNorm(dim_out, LinearAttention(dim_out))),
+                        Downsample(dim_out) if not is_last else nn.Identity(),
+                    ]
+                )
+            )
+
+        mid_dim = dims[-1]
+
+        self.mid_block1 = block_klass(mid_dim, mid_dim, time_emb_dim=time_dim)
+        self.cross_attention = Residual(PreNorm(mid_dim, LinearCrossAttention(mid_dim)))
+        self.mid_block2 = block_klass(mid_dim, mid_dim, time_emb_dim=time_dim)
+
+        for ind, (dim_in, dim_out) in enumerate(reversed(in_out[1:])):
+            is_last = ind >= (num_resolutions - 1)
+            self.ups.append(
+                nn.ModuleList(
+                    [
+                        block_klass(dim_out * 2, dim_in, time_emb_dim=time_dim),
+                        block_klass(dim_in, dim_in, time_emb_dim=time_dim),
+                        Residual(PreNorm(dim_in, LinearAttention(dim_in))),
+                        Upsample(dim_in) if not is_last else nn.Identity(),
+                    ]
+                )
+            )
+
+        out_dim = default(out_dim, channels)
+        self.final_conv = nn.Sequential(
+            block_klass(dim, dim), nn.Conv2d(dim, out_dim, 1)
+        )
+
+    def forward(self, x, time, implicit_conditioning, explicit_conditioning):
+        x = torch.cat((x, explicit_conditioning), dim=1)
+        conditioning = torch.cat((implicit_conditioning, explicit_conditioning), dim=1)
+        x = self.init_conv(x)
+
+        conditioning = self.conditioning_init(conditioning)
+
+        t = self.time_mlp(time) if exists(self.time_mlp) else None
+
+        h = []
+
+        # conditioning encoder
+
+        for block1, attn, downsample in self.conditioning_encoder:
+            conditioning = block1(conditioning)
+            conditioning = attn(conditioning)
+            conditioning = downsample(conditioning)
+
+        for block1, block2, attn, downsample in self.downs:
+            x = block1(x, t)
+            x = block2(x, t)
+            x = attn(x)
+            h.append(x)
+            x = downsample(x)
+
+        # bottleneck
+        x = self.mid_block1(x, t)
+        x = self.cross_attention(x, conditioning)
+        x = self.mid_block2(x, t)
+
+        for block1, block2, attn, upsample in self.ups:
+            x = torch.cat((x, h.pop()), dim=1)
+            x = block1(x, t)
+            x = block2(x, t)
+            x = attn(x)
+            x = upsample(x)
+
+        return self.final_conv(x)
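
The forward pass concatenates the noisy image with the explicit conditioning (hence the channels * 2 input convolutions) and injects the encoded conditioning at the bottleneck via cross-attention. A minimal forward-pass sketch with random tensors (sizes mirror the app's 128x128 setting):

    import torch
    from models.structure.Unet import Unet

    model = Unet(dim=128, channels=3, dim_mults=(1, 2, 4, 8), use_convnext=False)

    x = torch.randn(1, 3, 128, 128)          # noisy image
    sketch = torch.randn(1, 3, 128, 128)     # explicit conditioning
    scribbles = torch.randn(1, 3, 128, 128)  # implicit conditioning
    t = torch.tensor([10])                   # one timestep per batch element

    noise_pred = model(x, t, implicit_conditioning=scribbles, explicit_conditioning=sketch)
    print(noise_pred.shape)  # torch.Size([1, 3, 128, 128])
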
models/structure/Unet_2.py ADDED
@@ -0,0 +1,152 @@
+import math
+from inspect import isfunction
+from functools import partial
+import matplotlib.pyplot as plt
+from tqdm.auto import tqdm
+from einops import rearrange
+import torch
+from torch import nn, einsum
+import torch.nn.functional as F
+from .Advanced_Network_Helpers_2 import *
+
+
+class Unet(nn.Module):
+    def __init__(
+        self,
+        dim,
+        init_dim=None,
+        out_dim=None,
+        dim_mults=(1, 2, 4, 8),
+        channels=3,
+        with_time_emb=True,
+        resnet_block_groups=8,
+        use_convnext=True,
+        convnext_mult=2,
+    ):
+        super().__init__()
+
+        # determine dimensions
+        self.channels = channels  # since we are concatenating the images and the conditionings along the channel dimension
+
+        init_dim = default(init_dim, dim // 3 * 2)
+        self.init_conv = nn.Conv2d(self.channels * 2, init_dim, 7, padding=3)
+        self.conditioning_init = nn.Conv2d(self.channels * 2, init_dim, 7, padding=3)
+        dims = [init_dim, *map(lambda m: dim * m, dim_mults)]
+        in_out = list(zip(dims[:-1], dims[1:]))
+        self.in_out = in_out
+
+        if use_convnext:
+            block_klass = partial(ConvNextBlock, mult=convnext_mult)
+        else:
+            block_klass = partial(ResnetBlock, groups=resnet_block_groups)
+
+        # time embeddings
+        if with_time_emb:
+            time_dim = dim * 4
+            self.time_mlp = nn.Sequential(
+                SinusoidalPositionEmbeddings(dim),
+                nn.Linear(dim, time_dim),
+                nn.GELU(),
+                nn.Linear(time_dim, time_dim),
+            )
+        else:
+            time_dim = None
+            self.time_mlp = None
+
+        # layers
+        self.downs = nn.ModuleList([])
+        self.ups = nn.ModuleList([])
+        self.conditioning_encoder = nn.ModuleList([])
+        num_resolutions = len(in_out)
+        self.num_resolutions = num_resolutions
+
+        # conditioning encoder
+        for ind, (dim_in, dim_out) in enumerate(in_out):
+            is_last = ind >= (num_resolutions - 1)
+
+            self.conditioning_encoder.append(
+                nn.ModuleList(
+                    [
+                        block_klass(dim_in, dim_out),
+                        Residual(PreNorm(dim_out, LinearAttention(dim_out))),
+                        Downsample(dim_out) if not is_last else nn.Identity(),
+                    ]
+                )
+            )
+
+        for ind, (dim_in, dim_out) in enumerate(in_out):
+            is_last = ind >= (num_resolutions - 1)
+
+            self.downs.append(
+                nn.ModuleList(
+                    [
+                        block_klass(dim_in, dim_out, time_emb_dim=time_dim),
+                        block_klass(dim_out, dim_out, time_emb_dim=time_dim),
+                        Residual(PreNorm(dim_out, LinearAttention(dim_out))),
+                        Downsample(dim_out) if not is_last else nn.Identity(),
+                    ]
+                )
+            )
+
+        mid_dim = dims[-1]
+
+        self.mid_block1 = block_klass(mid_dim, mid_dim, time_emb_dim=time_dim)
+        self.cross_attention = Residual(PreNorm(mid_dim, LinearCrossAttention(mid_dim)))
+        self.mid_block2 = block_klass(mid_dim, mid_dim, time_emb_dim=time_dim)
+
+        for ind, (dim_in, dim_out) in enumerate(reversed(in_out[1:])):
+            is_last = ind >= (num_resolutions - 1)
+            self.ups.append(
+                nn.ModuleList(
+                    [
+                        block_klass(dim_out * 2, dim_in, time_emb_dim=time_dim),
+                        block_klass(dim_in, dim_in, time_emb_dim=time_dim),
+                        Residual(PreNorm(dim_in, LinearAttention(dim_in))),
+                        Upsample(dim_in) if not is_last else nn.Identity(),
+                    ]
+                )
+            )
+
+        out_dim = default(out_dim, channels)
+        self.final_conv = nn.Sequential(
+            block_klass(dim, dim), nn.Conv2d(dim, out_dim, 1)
+        )
+
+    def forward(self, x, time, implicit_conditioning, explicit_conditioning):
+        x = torch.cat((x, explicit_conditioning), dim=1)
+        conditioning = torch.cat((implicit_conditioning, explicit_conditioning), dim=1)
+        x = self.init_conv(x)
+
+        conditioning = self.conditioning_init(conditioning)
+
+        t = self.time_mlp(time) if exists(self.time_mlp) else None
+
+        h = []
+
+        # conditioning encoder
+
+        for block1, attn, downsample in self.conditioning_encoder:
+            conditioning = block1(conditioning)
+            conditioning = attn(conditioning)
+            conditioning = downsample(conditioning)
+
+        for block1, block2, attn, downsample in self.downs:
+            x = block1(x, t)
+            x = block2(x, t)
+            x = attn(x)
+            h.append(x)
+            x = downsample(x)
+
+        # bottleneck
+        x = self.mid_block1(x, t)
+        x = self.cross_attention(x, conditioning)
+        x = self.mid_block2(x, t)
+
+        for block1, block2, attn, upsample in self.ups:
+            x = torch.cat((x, h.pop()), dim=1)
+            x = block1(x, t)
+            x = block2(x, t)
+            x = attn(x)
+            x = upsample(x)
+
+        return self.final_conv(x)
models/structure/Unet_3.py ADDED
@@ -0,0 +1,166 @@
+import math
+from inspect import isfunction
+from functools import partial
+import matplotlib.pyplot as plt
+from tqdm.auto import tqdm
+from einops import rearrange
+import torch
+from torch import nn, einsum
+import torch.nn.functional as F
+from .Advanced_Network_Helpers_3 import *
+from transformers import PreTrainedModel
+
+
+class Unet(nn.Module):
+    def __init__(
+        self,
+        dim,
+        init_dim=None,
+        out_dim=None,
+        dim_mults=(1, 2, 4, 8),
+        channels=3,
+        with_time_emb=True,
+        resnet_block_groups=8,
+        use_convnext=True,
+        convnext_mult=2,
+    ):
+        super().__init__()
+
+        # determine dimensions
+        self.channels = channels  # since we are concatenating the images and the conditionings along the channel dimension
+
+        init_dim = default(init_dim, dim // 3 * 2)
+        self.init_conv = nn.Conv2d(self.channels * 2, init_dim, 7, padding=3)
+        self.conditioning_init = nn.Conv2d(self.channels, init_dim, 7, padding=3)
+        dims = [init_dim, *map(lambda m: dim * m, dim_mults)]
+        in_out = list(zip(dims[:-1], dims[1:]))
+        self.in_out = in_out
+
+        if use_convnext:
+            block_klass = partial(ConvNextBlock, mult=convnext_mult)
+        else:
+            block_klass = partial(ResnetBlock, groups=resnet_block_groups)
+
+        # time embeddings
+        if with_time_emb:
+            time_dim = dim * 4
+            self.time_mlp = nn.Sequential(
+                SinusoidalPositionEmbeddings(dim),
+                nn.Linear(dim, time_dim),
+                nn.GELU(),
+                nn.Linear(time_dim, time_dim),
+            )
+        else:
+            time_dim = None
+            self.time_mlp = None
+
+        # layers
+        self.downs = nn.ModuleList([])
+        self.ups = nn.ModuleList([])
+        self.conditioning_encoder = nn.ModuleList([])
+        num_resolutions = len(in_out)
+        self.num_resolutions = num_resolutions
+
+        # conditioning encoder
+        for ind, (dim_in, dim_out) in enumerate(in_out):
+            is_last = ind >= (num_resolutions - 1)
+
+            self.conditioning_encoder.append(
+                nn.ModuleList(
+                    [
+                        block_klass(dim_in, dim_out),
+                        Residual(PreNorm(dim_out, LinearAttention(dim_out))),
+                        Downsample(dim_out) if not is_last else nn.Identity(),
+                    ]
+                )
+            )
+
+        for ind, (dim_in, dim_out) in enumerate(in_out):
+            is_last = ind >= (num_resolutions - 1)
+
+            self.downs.append(
+                nn.ModuleList(
+                    [
+                        block_klass(dim_in, dim_out, time_emb_dim=time_dim),
+                        block_klass(dim_out, dim_out, time_emb_dim=time_dim),
+                        Residual(PreNorm(dim_out, LinearAttention(dim_out))),
+                        Downsample(dim_out) if not is_last else nn.Identity(),
+                    ]
+                )
+            )
+
+        mid_dim = dims[-1]
+
+        self.mid_block1 = block_klass(mid_dim, mid_dim, time_emb_dim=time_dim)
+        self.cross_attention_1 = Residual(
+            PreNorm(mid_dim, LinearCrossAttention(mid_dim))
+        )
+        self.cross_attention_2 = Residual(
+            PreNorm(mid_dim, LinearCrossAttention(mid_dim))
+        )
+        self.cross_attention_3 = Residual(
+            PreNorm(mid_dim, LinearCrossAttention(mid_dim))
+        )
+        self.mid_block2 = block_klass(mid_dim, mid_dim, time_emb_dim=time_dim)
+
+        for ind, (dim_in, dim_out) in enumerate(reversed(in_out[1:])):
+            is_last = ind >= (num_resolutions - 1)
+            self.ups.append(
+                nn.ModuleList(
+                    [
+                        block_klass(dim_out * 2, dim_in, time_emb_dim=time_dim),
+                        block_klass(dim_in, dim_in, time_emb_dim=time_dim),
+                        Residual(PreNorm(dim_in, LinearAttention(dim_in))),
+                        Upsample(dim_in) if not is_last else nn.Identity(),
+                    ]
+                )
+            )
+
+        out_dim = default(out_dim, channels)
+        self.final_conv = nn.Sequential(
+            block_klass(dim, dim), nn.Conv2d(dim, out_dim, 1)
+        )
+
+    def forward(self, x, time, implicit_conditioning, explicit_conditioning):
+        x = torch.cat((x, explicit_conditioning), dim=1)
+
+        x = self.init_conv(x)
+
+        conditioning = self.conditioning_init(implicit_conditioning)
+
+        t = self.time_mlp(time) if exists(self.time_mlp) else None
+
+        h = []
+
+        # conditioning encoder
+
+        for block1, attn, downsample in self.conditioning_encoder:
+            conditioning = block1(conditioning)
+            conditioning = attn(conditioning)
+            conditioning = downsample(conditioning)
+
+        for block1, block2, attn, downsample in self.downs:
+            x = block1(x, t)
+            x = block2(x, t)
+            x = attn(x)
+            h.append(x)
+            x = downsample(x)
+
+        # bottleneck
+
+        x = self.cross_attention_1(x, conditioning)
+        x = self.mid_block1(x, t)
+        x = self.cross_attention_2(x, conditioning)
+        x = self.mid_block2(x, t)
+        x = self.cross_attention_3(x, conditioning)
+
+        for block1, block2, attn, upsample in self.ups:
+            x = torch.cat((x, h.pop()), dim=1)
+            x = block1(x, t)
+            x = block2(x, t)
+            x = attn(x)
+            x = upsample(x)
+
+        return self.final_conv(x)
models/structure/hf_compatible_model.py ADDED
@@ -0,0 +1,192 @@
+from transformers import PretrainedConfig, PreTrainedModel
+import math
+from inspect import isfunction
+from functools import partial
+import matplotlib.pyplot as plt
+from tqdm.auto import tqdm
+from einops import rearrange
+import torch
+from torch import nn, einsum
+import torch.nn.functional as F
+from .Advanced_Network_Helpers_3 import *
+import os
+
+
+class UnetConfig(PretrainedConfig):
+    model_type = "unet"
+
+    def __init__(
+        self,
+        dim=64,
+        init_dim=None,
+        out_dim=None,
+        dim_mults=(1, 2, 4, 8),
+        channels=3,
+        with_time_emb=True,
+        resnet_block_groups=8,
+        use_convnext=True,
+        convnext_mult=2,
+        **kwargs
+    ):
+        super().__init__(**kwargs)
+        self.dim = dim
+        self.init_dim = init_dim
+        self.out_dim = out_dim
+        self.dim_mults = dim_mults
+        self.channels = channels
+        self.with_time_emb = with_time_emb
+        self.resnet_block_groups = resnet_block_groups
+        self.use_convnext = use_convnext
+        self.convnext_mult = convnext_mult
+
+
+class Unet(PreTrainedModel):
+    config_class = UnetConfig
+
+    def __init__(
+        self,
+        config,
+    ):
+        super().__init__(config)
+
+        # determine dimensions
+        self.channels = (
+            config.channels
+        )  # since we are concatenating the images and the conditionings along the channel dimension
+
+        init_dim = default(config.init_dim, config.dim // 3 * 2)
+        self.init_conv = nn.Conv2d(self.channels * 2, init_dim, 7, padding=3)
+        self.conditioning_init = nn.Conv2d(self.channels, init_dim, 7, padding=3)
+        dims = [init_dim, *map(lambda m: config.dim * m, config.dim_mults)]
+        in_out = list(zip(dims[:-1], dims[1:]))
+        self.in_out = in_out
+
+        if config.use_convnext:
+            block_klass = partial(ConvNextBlock, mult=config.convnext_mult)
+        else:
+            block_klass = partial(ResnetBlock, groups=config.resnet_block_groups)
+
+        # time embeddings
+        if config.with_time_emb:
+            time_dim = config.dim * 4
+            self.time_mlp = nn.Sequential(
+                SinusoidalPositionEmbeddings(config.dim),
+                nn.Linear(config.dim, time_dim),
+                nn.GELU(),
+                nn.Linear(time_dim, time_dim),
+            )
+        else:
+            time_dim = None
+            self.time_mlp = None
+
+        # layers
+        self.downs = nn.ModuleList([])
+        self.ups = nn.ModuleList([])
+        self.conditioning_encoder = nn.ModuleList([])
+        num_resolutions = len(in_out)
+        self.num_resolutions = num_resolutions
+
+        # conditioning encoder
+        for ind, (dim_in, dim_out) in enumerate(in_out):
+            is_last = ind >= (num_resolutions - 1)
+
+            self.conditioning_encoder.append(
+                nn.ModuleList(
+                    [
+                        block_klass(dim_in, dim_out),
+                        Residual(PreNorm(dim_out, LinearAttention(dim_out))),
+                        Downsample(dim_out) if not is_last else nn.Identity(),
+                    ]
+                )
+            )
+
+        for ind, (dim_in, dim_out) in enumerate(in_out):
+            is_last = ind >= (num_resolutions - 1)
+
+            self.downs.append(
+                nn.ModuleList(
+                    [
+                        block_klass(dim_in, dim_out, time_emb_dim=time_dim),
+                        block_klass(dim_out, dim_out, time_emb_dim=time_dim),
+                        Residual(PreNorm(dim_out, LinearAttention(dim_out))),
+                        Downsample(dim_out) if not is_last else nn.Identity(),
+                    ]
+                )
+            )
+
+        mid_dim = dims[-1]
+
+        self.mid_block1 = block_klass(mid_dim, mid_dim, time_emb_dim=time_dim)
+        self.cross_attention_1 = Residual(
+            PreNorm(mid_dim, LinearCrossAttention(mid_dim))
+        )
+        self.cross_attention_2 = Residual(
+            PreNorm(mid_dim, LinearCrossAttention(mid_dim))
+        )
+        self.cross_attention_3 = Residual(
+            PreNorm(mid_dim, LinearCrossAttention(mid_dim))
+        )
+        self.mid_block2 = block_klass(mid_dim, mid_dim, time_emb_dim=time_dim)
+
+        for ind, (dim_in, dim_out) in enumerate(reversed(in_out[1:])):
+            is_last = ind >= (num_resolutions - 1)
+            self.ups.append(
+                nn.ModuleList(
+                    [
+                        block_klass(dim_out * 2, dim_in, time_emb_dim=time_dim),
+                        block_klass(dim_in, dim_in, time_emb_dim=time_dim),
+                        Residual(PreNorm(dim_in, LinearAttention(dim_in))),
+                        Upsample(dim_in) if not is_last else nn.Identity(),
+                    ]
+                )
+            )
+
+        out_dim = default(config.out_dim, config.channels)
+        self.final_conv = nn.Sequential(
+            block_klass(config.dim, config.dim), nn.Conv2d(config.dim, out_dim, 1)
+        )
+
+    def forward(self, x, time, implicit_conditioning, explicit_conditioning):
+        x = torch.cat((x, explicit_conditioning), dim=1)
+
+        x = self.init_conv(x)
+
+        conditioning = self.conditioning_init(implicit_conditioning)
+
+        t = self.time_mlp(time) if exists(self.time_mlp) else None
+
+        h = []
+
+        # conditioning encoder
+
+        for block1, attn, downsample in self.conditioning_encoder:
+            conditioning = block1(conditioning)
+            conditioning = attn(conditioning)
+            conditioning = downsample(conditioning)
+
+        for block1, block2, attn, downsample in self.downs:
+            x = block1(x, t)
+            x = block2(x, t)
+            x = attn(x)
+            h.append(x)
+            x = downsample(x)
+
+        # bottleneck
+
+        x = self.cross_attention_1(x, conditioning)
+        x = self.mid_block1(x, t)
+        x = self.cross_attention_2(x, conditioning)
+        x = self.mid_block2(x, t)
+        x = self.cross_attention_3(x, conditioning)
+
+        for block1, block2, attn, upsample in self.ups:
+            x = torch.cat((x, h.pop()), dim=1)
+            x = block1(x, t)
+            x = block2(x, t)
+            x = attn(x)
+            x = upsample(x)
+
+        return self.final_conv(x)
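
Wrapping the same architecture in PretrainedConfig/PreTrainedModel buys the standard Hugging Face serialization plumbing. A minimal sketch (directory name hypothetical):

    from models.structure.hf_compatible_model import Unet, UnetConfig

    config = UnetConfig(dim=128, channels=3, use_convnext=False)
    model = Unet(config)

    model.save_pretrained("unet-checkpoint")  # writes config.json + weights
    reloaded = Unet.from_pretrained("unet-checkpoint")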