cc
- scripts/build_cyclegan_dataset.py +47 -19
- swim/__init__.py +0 -0
- swim/attention_blocks.py +315 -0
- swim/autoencoder.py +0 -0
- swim/blocks.py +185 -0
- swim/codeblock.py +74 -0
- swim/discriminator.py +45 -0
- swim/encoder.py +90 -0
- swim/unet.py +169 -0
- train.py +8 -0
scripts/build_cyclegan_dataset.py
CHANGED
@@ -8,7 +8,8 @@ from tqdm import tqdm
 @click.option("--swim_dir", type=str, default="datasets/swim_data")
 @click.option("--output_dir", type=str, default="datasets/swim_data_cyclegan")
 @click.option("--type", type=str, help="fog|rain|snow|night", required=True)
-def build_cyclegan_dataset(swim_dir: str, output_dir: str, type: str):
+@click.option("--no_night", is_flag=True)
+def build_cyclegan_dataset(swim_dir: str, output_dir: str, type: str, no_night: bool):
     # build the dataset with format
     # swim_data_cyclegan
     # ├── trainA

@@ -42,25 +43,52 @@ def build_cyclegan_dataset(swim_dir: str, output_dir: str, type: str):
     with open(os.path.join(swim_dir, "val", "labels.json"), "r") as f:
         val_labels = json.load(f)

-            )
-        elif label["weather"] == "clear":
-            os.system(
-                f"cp {os.path.join(swim_dir, 'train', 'images', label['name'])} {os.path.join(output_dir, 'trainA', label['name'])}"
-            )
+    if type != "night":
+        for label in tqdm(train_labels, desc="train"):
+            if no_night and label["timeofdata"] == "night":
+                continue
+
+            if label["weather"] == type:
+                os.system(
+                    f"cp {os.path.join(swim_dir, 'train', 'images', label['name'])} {os.path.join(output_dir, 'trainB', label['name'])}"
+                )
+            elif label["weather"] == "clear":
+                os.system(
+                    f"cp {os.path.join(swim_dir, 'train', 'images', label['name'])} {os.path.join(output_dir, 'trainA', label['name'])}"
+                )
+
+        for label in tqdm(val_labels, desc="val"):
+            if no_night and label["timeofdata"] == "night":
+                continue
+
+            if label["weather"] == type:
+                os.system(
+                    f"cp {os.path.join(swim_dir, 'val', 'images', label['name'])} {os.path.join(output_dir, 'testB', label['name'])}"
+                )
+            elif label["weather"] == "clear":
+                os.system(
+                    f"cp {os.path.join(swim_dir, 'val', 'images', label['name'])} {os.path.join(output_dir, 'testA', label['name'])}"
+                )
+    else:
+        for label in tqdm(train_labels, desc="train"):
+            if label["timeofdata"] == "night":
+                os.system(
+                    f"cp {os.path.join(swim_dir, 'train', 'images', label['name'])} {os.path.join(output_dir, 'trainB', label['name'])}"
+                )
+            elif label["timeofdata"] == "daytime":
+                os.system(
+                    f"cp {os.path.join(swim_dir, 'train', 'images', label['name'])} {os.path.join(output_dir, 'trainA', label['name'])}"
+                )
+
+        for label in tqdm(val_labels, desc="val"):
+            if label["timeofdata"] == "night":
+                os.system(
+                    f"cp {os.path.join(swim_dir, 'val', 'images', label['name'])} {os.path.join(output_dir, 'testB', label['name'])}"
+                )
+            elif label["timeofdata"] == "daytime":
+                os.system(
+                    f"cp {os.path.join(swim_dir, 'val', 'images', label['name'])} {os.path.join(output_dir, 'testA', label['name'])}"
+                )


 if __name__ == "__main__":
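Not part of the commit: the branching added above can be read as a single selection rule per image. The sketch below is an illustrative restatement of that rule; the assign_split helper and its arguments are hypothetical, and it assumes each label dict carries "weather" and "timeofdata" keys as in the diff.

# Illustrative only -- a hypothetical helper mirroring the split logic added above.
def assign_split(label: dict, type: str, split: str, no_night: bool):
    prefix = "train" if split == "train" else "test"
    if type != "night":
        # weather-based pairing: clear -> domain A, the requested weather -> domain B
        if no_night and label["timeofdata"] == "night":
            return None  # --no_night drops night images entirely
        if label["weather"] == type:
            return prefix + "B"
        if label["weather"] == "clear":
            return prefix + "A"
    else:
        # day/night pairing: daytime -> domain A, night -> domain B
        if label["timeofdata"] == "night":
            return prefix + "B"
        if label["timeofdata"] == "daytime":
            return prefix + "A"
    return None  # anything else is skipped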
swim/__init__.py
ADDED
File without changes
swim/attention_blocks.py
ADDED
@@ -0,0 +1,315 @@
from typing import Optional

import torch
import torch.nn.functional as F
from torch import nn


class SpatialTransformer(nn.Module):
    """
    ## Spatial Transformer
    """

    def __init__(self, channels: int, n_heads: int, n_layers: int, d_cond: int):
        """
        :param channels: is the number of channels in the feature map
        :param n_heads: is the number of attention heads
        :param n_layers: is the number of transformer layers
        :param d_cond: is the size of the conditional embedding
        """
        super().__init__()
        # Initial group normalization
        self.norm = torch.nn.GroupNorm(
            num_groups=32, num_channels=channels, eps=1e-6, affine=True
        )
        # Initial $1 \times 1$ convolution
        self.proj_in = nn.Conv2d(channels, channels, kernel_size=1, stride=1, padding=0)

        # Transformer layers
        self.transformer_blocks = nn.ModuleList(
            [
                BasicTransformerBlock(
                    channels, n_heads, channels // n_heads, d_cond=d_cond
                )
                for _ in range(n_layers)
            ]
        )

        # Final $1 \times 1$ convolution
        self.proj_out = nn.Conv2d(
            channels, channels, kernel_size=1, stride=1, padding=0
        )

    def forward(self, x: torch.Tensor, cond: torch.Tensor):
        """
        :param x: is the feature map of shape `[batch_size, channels, height, width]`
        :param cond: is the conditional embeddings of shape `[batch_size, n_cond, d_cond]`
        """
        # Get shape `[batch_size, channels, height, width]`
        b, c, h, w = x.shape
        # For residual connection
        x_in = x
        # Normalize
        x = self.norm(x)
        # Initial $1 \times 1$ convolution
        x = self.proj_in(x)
        # Transpose and reshape from `[batch_size, channels, height, width]`
        # to `[batch_size, height * width, channels]`
        x = x.permute(0, 2, 3, 1).view(b, h * w, c)
        # Apply the transformer layers
        for block in self.transformer_blocks:
            x = block(x, cond)
        # Reshape and transpose from `[batch_size, height * width, channels]`
        # to `[batch_size, channels, height, width]`
        x = x.view(b, h, w, c).permute(0, 3, 1, 2)
        # Final $1 \times 1$ convolution
        x = self.proj_out(x)
        # Add residual
        return x + x_in


class BasicTransformerBlock(nn.Module):
    """
    ### Transformer Layer
    """

    def __init__(self, d_model: int, n_heads: int, d_head: int, d_cond: int):
        """
        :param d_model: is the input embedding size
        :param n_heads: is the number of attention heads
        :param d_head: is the size of an attention head
        :param d_cond: is the size of the conditional embeddings
        """
        super().__init__()
        # Self-attention layer and pre-norm layer
        self.attn1 = CrossAttention(d_model, d_model, n_heads, d_head)
        self.norm1 = nn.LayerNorm(d_model)
        # Cross-attention layer and pre-norm layer
        self.attn2 = CrossAttention(d_model, d_cond, n_heads, d_head)
        self.norm2 = nn.LayerNorm(d_model)
        # Feed-forward network and pre-norm layer
        self.ff = FeedForward(d_model)
        self.norm3 = nn.LayerNorm(d_model)

    def forward(self, x: torch.Tensor, cond: torch.Tensor):
        """
        :param x: are the input embeddings of shape `[batch_size, height * width, d_model]`
        :param cond: is the conditional embeddings of shape `[batch_size, n_cond, d_cond]`
        """
        # Self-attention
        x = self.attn1(self.norm1(x)) + x
        # Cross-attention with conditioning
        x = self.attn2(self.norm2(x), cond=cond) + x
        # Feed-forward network
        x = self.ff(self.norm3(x)) + x
        #
        return x


class CrossAttention(nn.Module):
    """
    ### Cross Attention Layer

    This falls back to self-attention when conditional embeddings are not specified.
    """

    use_flash_attention: bool = False

    def __init__(
        self,
        d_model: int,
        d_cond: int,
        n_heads: int,
        d_head: int,
        is_inplace: bool = True,
    ):
        """
        :param d_model: is the input embedding size
        :param n_heads: is the number of attention heads
        :param d_head: is the size of an attention head
        :param d_cond: is the size of the conditional embeddings
        :param is_inplace: specifies whether to perform the attention softmax computation inplace
            to save memory
        """
        super().__init__()

        self.is_inplace = is_inplace
        self.n_heads = n_heads
        self.d_head = d_head

        # Attention scaling factor
        self.scale = d_head**-0.5

        # Query, key and value mappings
        d_attn = d_head * n_heads
        self.to_q = nn.Linear(d_model, d_attn, bias=False)
        self.to_k = nn.Linear(d_cond, d_attn, bias=False)
        self.to_v = nn.Linear(d_cond, d_attn, bias=False)

        # Final linear layer
        self.to_out = nn.Sequential(nn.Linear(d_attn, d_model))

        # Setup [flash attention](https://github.com/HazyResearch/flash-attention).
        # Flash attention is only used if it's installed
        # and `CrossAttention.use_flash_attention` is set to `True`.
        # try:
        #     # You can install flash attention by cloning their Github repo,
        #     # [https://github.com/HazyResearch/flash-attention](https://github.com/HazyResearch/flash-attention)
        #     # and then running `python setup.py install`
        #     from flash_attn.flash_attention import FlashAttention
        #
        #     self.flash = FlashAttention()
        #     # Set the scale for scaled dot-product attention.
        #     self.flash.softmax_scale = self.scale
        # # Set to `None` if it's not installed
        # except ImportError:
        #     self.flash = None

    def forward(self, x: torch.Tensor, cond: Optional[torch.Tensor] = None):
        """
        :param x: are the input embeddings of shape `[batch_size, height * width, d_model]`
        :param cond: is the conditional embeddings of shape `[batch_size, n_cond, d_cond]`
        """

        # If `cond` is `None` we perform self-attention
        has_cond = cond is not None
        if not has_cond:
            cond = x

        # Get query, key and value vectors
        q = self.to_q(x)
        k = self.to_k(cond)
        v = self.to_v(cond)

        # Use flash attention if it's available and the head size is less than or equal to `128`
        if (
            CrossAttention.use_flash_attention
            and self.flash is not None
            and not has_cond
            and self.d_head <= 128
        ):
            return self.flash_attention(q, k, v)
        # Otherwise, fall back to normal attention
        else:
            return self.normal_attention(q, k, v)

    def flash_attention(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
        """
        #### Flash Attention

        :param q: are the query vectors before splitting heads, of shape `[batch_size, seq, d_attn]`
        :param k: are the key vectors before splitting heads, of shape `[batch_size, seq, d_attn]`
        :param v: are the value vectors before splitting heads, of shape `[batch_size, seq, d_attn]`
        """

        # Get batch size and number of elements along sequence axis (`width * height`)
        batch_size, seq_len, _ = q.shape

        # Stack `q`, `k`, `v` vectors for flash attention, to get a single tensor of
        # shape `[batch_size, seq_len, 3, n_heads * d_head]`
        qkv = torch.stack((q, k, v), dim=2)
        # Split the heads
        qkv = qkv.view(batch_size, seq_len, 3, self.n_heads, self.d_head)

        # Flash attention works for head sizes `32`, `64` and `128`, so we have to pad the heads to
        # fit this size.
        if self.d_head <= 32:
            pad = 32 - self.d_head
        elif self.d_head <= 64:
            pad = 64 - self.d_head
        elif self.d_head <= 128:
            pad = 128 - self.d_head
        else:
            raise ValueError(f"Head size ${self.d_head} too large for Flash Attention")

        # Pad the heads
        if pad:
            qkv = torch.cat(
                (qkv, qkv.new_zeros(batch_size, seq_len, 3, self.n_heads, pad)), dim=-1
            )

        # Compute attention
        # $$\underset{seq}{softmax}\Bigg(\frac{Q K^\top}{\sqrt{d_{key}}}\Bigg)V$$
        # This gives a tensor of shape `[batch_size, seq_len, n_heads, d_padded]`
        out, _ = self.flash(qkv)
        # Truncate the extra head size
        out = out[:, :, :, : self.d_head]
        # Reshape to `[batch_size, seq_len, n_heads * d_head]`
        out = out.reshape(batch_size, seq_len, self.n_heads * self.d_head)

        # Map to `[batch_size, height * width, d_model]` with a linear layer
        return self.to_out(out)

    def normal_attention(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
        """
        #### Normal Attention

        :param q: are the query vectors before splitting heads, of shape `[batch_size, seq, d_attn]`
        :param k: are the key vectors before splitting heads, of shape `[batch_size, seq, d_attn]`
        :param v: are the value vectors before splitting heads, of shape `[batch_size, seq, d_attn]`
        """

        # Split them to heads of shape `[batch_size, seq_len, n_heads, d_head]`
        q = q.view(*q.shape[:2], self.n_heads, -1)
        k = k.view(*k.shape[:2], self.n_heads, -1)
        v = v.view(*v.shape[:2], self.n_heads, -1)

        # Calculate attention $\frac{Q K^\top}{\sqrt{d_{key}}}$
        attn = torch.einsum("bihd,bjhd->bhij", q, k) * self.scale

        # Compute softmax
        # $$\underset{seq}{softmax}\Bigg(\frac{Q K^\top}{\sqrt{d_{key}}}\Bigg)$$
        if self.is_inplace:
            half = attn.shape[0] // 2
            attn[half:] = attn[half:].softmax(dim=-1)
            attn[:half] = attn[:half].softmax(dim=-1)
        else:
            attn = attn.softmax(dim=-1)

        # Compute attention output
        # $$\underset{seq}{softmax}\Bigg(\frac{Q K^\top}{\sqrt{d_{key}}}\Bigg)V$$
        out = torch.einsum("bhij,bjhd->bihd", attn, v)
        # Reshape to `[batch_size, height * width, n_heads * d_head]`
        out = out.reshape(*out.shape[:2], -1)
        # Map to `[batch_size, height * width, d_model]` with a linear layer
        return self.to_out(out)


class FeedForward(nn.Module):
    """
    ### Feed-Forward Network
    """

    def __init__(self, d_model: int, d_mult: int = 4):
        """
        :param d_model: is the input embedding size
        :param d_mult: is the multiplicative factor for the hidden layer size
        """
        super().__init__()
        self.net = nn.Sequential(
            GeGLU(d_model, d_model * d_mult),
            nn.Dropout(0.0),
            nn.Linear(d_model * d_mult, d_model),
        )

    def forward(self, x: torch.Tensor):
        return self.net(x)


class GeGLU(nn.Module):
    """
    ### GeGLU Activation

    $$\text{GeGLU}(x) = (xW + b) * \text{GELU}(xV + c)$$
    """

    def __init__(self, d_in: int, d_out: int):
        super().__init__()
        # Combined linear projections $xW + b$ and $xV + c$
        self.proj = nn.Linear(d_in, d_out * 2)

    def forward(self, x: torch.Tensor):
        # Get $xW + b$ and $xV + c$
        x, gate = self.proj(x).chunk(2, dim=-1)
        # $\text{GeGLU}(x) = (xW + b) * \text{GELU}(xV + c)$
        return x * F.gelu(gate)
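Not part of the commit: a minimal shape check for the module above, assuming the package is importable as swim.attention_blocks. The sizes (64 channels, 4 heads, a 77x768 dummy conditioning sequence) are illustrative choices, not values fixed by the code.

# Illustrative only: run SpatialTransformer on a dummy feature map.
import torch
from swim.attention_blocks import SpatialTransformer

st = SpatialTransformer(channels=64, n_heads=4, n_layers=1, d_cond=768)
x = torch.randn(2, 64, 16, 16)   # [batch_size, channels, height, width]
cond = torch.randn(2, 77, 768)   # [batch_size, n_cond, d_cond]
out = st(x, cond)
print(out.shape)                 # residual output keeps the input shape: [2, 64, 16, 16]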
swim/autoencoder.py
ADDED
File without changes
swim/blocks.py
ADDED
@@ -0,0 +1,185 @@
from abc import abstractmethod

import math
import torch
from torch import nn
from torch.nn import functional as F


def get_timestep_embedding(
    timesteps: torch.Tensor, emb_dim: int, max_period: int = 10000
) -> torch.Tensor:
    half_dim = emb_dim // 2

    emb = math.log(max_period) / (half_dim - 1)
    emb = torch.exp(torch.arange(half_dim, dtype=torch.float32) * -emb)
    emb = emb.to(device=timesteps.device)

    emb = timesteps.float()[:, None] * emb[None, :]
    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)

    if emb_dim % 2 == 1:
        emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))

    return emb


class GroupNorm(nn.Module):
    def __init__(self, in_channels: int) -> None:
        super().__init__()

        self.group_norm = nn.GroupNorm(
            num_groups=32, num_channels=in_channels, eps=1e-06, affine=True
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.group_norm(x)


class UpsampleBlock(nn.Module):
    def __init__(self, channels: int):
        super().__init__()
        self.conv = nn.Conv2d(channels, channels, 3, padding=1)

    def forward(self, x: torch.Tensor):
        x = F.interpolate(x, scale_factor=2, mode="nearest")
        return self.conv(x)


class DownsampleBlock(nn.Module):
    def __init__(self, channels: int):
        super().__init__()
        self.op = nn.Conv2d(channels, channels, 3, stride=2, padding=1)

    def forward(self, x: torch.Tensor):
        return self.op(x)


class TimestepBlock(nn.Module):
    @abstractmethod
    def forward(self, x: torch.Tensor, t_emb: torch.Tensor) -> torch.Tensor:
        pass


class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
    def forward(self, x: torch.Tensor, t_emb: torch.Tensor) -> torch.Tensor:
        for layer in self:
            if isinstance(layer, TimestepBlock):
                x = layer(x, t_emb)
            else:
                x = layer(x)
        return x


class ResnetBlock(nn.Module):

    def __init__(
        self,
        in_channels: int,
        out_channels: int = None,
        t_emb_dim: int = None,
        dropout: float = 0.0,
    ):
        super().__init__()

        if out_channels is None:
            out_channels = in_channels

        self.input_layers = nn.Sequential(
            GroupNorm(in_channels),
            nn.SiLU(),
            nn.Conv2d(in_channels, out_channels, 3, padding=1),
        )

        if t_emb_dim is not None:
            self.t_emb_layers = nn.Sequential(
                nn.SiLU(),
                nn.Linear(t_emb_dim, out_channels),
            )
        else:
            self.t_emb_layers = None

        self.output_layers = nn.Sequential(
            GroupNorm(out_channels),
            nn.SiLU(),
            nn.Dropout(dropout),
            nn.Conv2d(out_channels, out_channels, 3, padding=1),
        )

        if in_channels != out_channels:
            self.skip = nn.Conv2d(in_channels, out_channels, 1)
        else:
            self.skip = nn.Identity()

    def forward(self, x: torch.Tensor, t: torch.Tensor = None) -> torch.Tensor:
        assert t is not None or self.t_emb_layers is None

        h = self.input_layers(x)

        if self.t_emb_layers is not None:
            t_emb = self.t_emb_layers(t)
            h = h + t_emb[:, :, None, None]

        h = self.output_layers(h)

        h = h + self.skip(x)

        return h


class AttentionBlock(nn.Module):
    """Attention mechanism similar to transformers but for CNNs, paper https://arxiv.org/abs/1805.08318

    Args:
        in_channels (int): Number of channels in the input tensor.
    """

    def __init__(self, in_channels: int) -> None:
        super().__init__()

        self.in_channels = in_channels

        # normalization layer
        self.norm = GroupNorm(in_channels)

        # query, key and value layers
        self.q = nn.Conv2d(in_channels, in_channels, 1, 1, 0)
        self.k = nn.Conv2d(in_channels, in_channels, 1, 1, 0)
        self.v = nn.Conv2d(in_channels, in_channels, 1, 1, 0)

        self.project_out = nn.Conv2d(in_channels, in_channels, 1, 1, 0)

        self.softmax = nn.Softmax(dim=2)

    def forward(self, x):

        batch, _, height, width = x.size()

        x = self.norm(x)

        # query, key and value layers
        q = self.q(x)
        k = self.k(x)
        v = self.v(x)

        # resizing the output from 4D to 3D to generate attention map
        q = q.reshape(batch, self.in_channels, height * width)
        k = k.reshape(batch, self.in_channels, height * width)
        v = v.reshape(batch, self.in_channels, height * width)

        # transpose the query tensor for dot product
        q = q.permute(0, 2, 1)

        # main attention formula
        scores = torch.bmm(q, k) * (self.in_channels**-0.5)
        weights = self.softmax(scores)
        weights = weights.permute(0, 2, 1)

        attention = torch.bmm(v, weights)

        # resizing the output from 3D to 4D to match the input
        attention = attention.reshape(batch, self.in_channels, height, width)
        attention = self.project_out(attention)

        # adding the identity to the output
        return x + attention
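Not part of the commit: an illustrative sketch of how the timestep embedding and the time-conditioned ResnetBlock fit together; the shapes and channel counts below are arbitrary examples.

# Illustrative only: sinusoidal timestep embeddings fed into a ResnetBlock.
import torch
from swim.blocks import get_timestep_embedding, ResnetBlock

t = torch.tensor([0, 10, 500])                  # three diffusion timesteps
t_emb = get_timestep_embedding(t, emb_dim=128)  # -> [3, 128]

block = ResnetBlock(in_channels=64, out_channels=128, t_emb_dim=128)
x = torch.randn(3, 64, 32, 32)
h = block(x, t_emb)                             # -> [3, 128, 32, 32]
print(t_emb.shape, h.shape)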
swim/codeblock.py
ADDED
@@ -0,0 +1,74 @@
import torch
import torch.nn as nn


class CodeBook(nn.Module):
    def __init__(
        self, num_codebook_vectors: int = 1024, latent_dim: int = 256, beta: float = 0.25
    ):
        super().__init__()

        self.num_codebook_vectors = num_codebook_vectors
        self.latent_dim = latent_dim
        self.beta = beta

        # creating the codebook; nn.Embedding here is simply a 2D array mainly for storing our embeddings, and it's also learnable
        self.codebook = nn.Embedding(num_codebook_vectors, latent_dim)

        # Initializing the codebook weights with a uniform distribution
        self.codebook.weight.data.uniform_(
            -1 / num_codebook_vectors, 1 / num_codebook_vectors
        )

    def forward(self, z: torch.Tensor) -> torch.Tensor:
        # Move channels to the last dimension and copy the tensor so it is stored contiguously (in sequence)
        z = z.permute(0, 2, 3, 1).contiguous()

        z_flattened = z.view(
            -1, self.latent_dim
        )  # b*h*w x latent_dim, will look similar to the codebook in fig. 2 of the paper

        # calculating the distance between z and the vectors in the flattened codebook, from eq. 2
        # (a - b)^2 = a^2 + b^2 - 2ab
        distance = (
            torch.sum(
                z_flattened**2, dim=1, keepdim=True
            )  # keepdim=True to keep the same original shape after the sum
            + torch.sum(self.codebook.weight**2, dim=1)
            - 2
            * torch.matmul(
                z_flattened, self.codebook.weight.t()
            )  # 2*dot(z, codebook.T)
        )

        # getting indices of vectors with minimum distance from the codebook
        min_distance_indices = torch.argmin(distance, dim=1)

        # getting the corresponding vectors from the codebook
        z_q = self.codebook(min_distance_indices).view(z.shape)

        """
        This represents equation 4 from the paper (except the reconstruction loss). This loss will then be added
        to the GAN loss to create the final loss function for VQGAN, eq. 6 in the paper.

        Note: in the first paragraph of the A. Changelog section of the paper,
        they report a bug which resulted in beta being equal to 1; see
        https://github.com/CompVis/taming-transformers/issues/57 - just a note :)
        """
        loss = torch.mean(
            (z_q.detach() - z) ** 2
            # detach() to avoid calculating gradients while backpropagating
            + self.beta
            * torch.mean(
                (z_q - z.detach()) ** 2
            )  # commitment loss, detach() to avoid calculating gradients while backpropagating
        )

        # Not sure why we need this, but it's in the original implementation and described as "preserving gradients"
        z_q = z + (z_q - z).detach()

        # reshaping to the original shape
        z_q = z_q.permute(0, 3, 1, 2)

        return z_q, min_distance_indices, loss
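Not part of the commit: an illustrative call of the CodeBook on a dummy latent, assuming the default 1024-entry codebook with 256-dimensional vectors.

# Illustrative only: quantize an encoder output with the CodeBook.
import torch
from swim.codeblock import CodeBook

codebook = CodeBook(num_codebook_vectors=1024, latent_dim=256)
z = torch.randn(2, 256, 16, 16)        # [batch_size, latent_dim, height, width]
z_q, indices, vq_loss = codebook(z)
print(z_q.shape, indices.shape, float(vq_loss))  # [2, 256, 16, 16], [512], scalar loss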
swim/discriminator.py
ADDED
@@ -0,0 +1,45 @@
import torch.nn as nn


class Discriminator(nn.Module):
    """PatchGAN Discriminator

    Args:
        image_channels (int): Number of channels in the input image.
        num_filters_last (int): Number of filters in the last layer of the discriminator.
        n_layers (int): Number of layers in the discriminator.
    """

    def __init__(self, image_channels: int = 3, num_filters_last=64, n_layers=3):
        super(Discriminator, self).__init__()

        layers = [
            nn.Conv2d(image_channels, num_filters_last, 4, 2, 1),
            nn.LeakyReLU(0.2),
        ]
        num_filters_mult = 1

        for i in range(1, n_layers + 1):
            num_filters_mult_last = num_filters_mult
            num_filters_mult = min(2**i, 8)
            layers += [
                nn.Conv2d(
                    num_filters_last * num_filters_mult_last,
                    num_filters_last * num_filters_mult,
                    4,
                    2 if i < n_layers else 1,
                    1,
                    bias=False,
                ),
                nn.BatchNorm2d(num_filters_last * num_filters_mult),
                nn.LeakyReLU(0.2, True),
            ]

        layers.append(nn.Conv2d(num_filters_last * num_filters_mult, 1, 4, 1, 1))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)
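Not part of the commit: an illustrative check that the PatchGAN discriminator returns a grid of per-patch real/fake logits rather than a single score; the input size is an arbitrary example.

# Illustrative only: the discriminator output is a map of logits.
import torch
from swim.discriminator import Discriminator

disc = Discriminator(image_channels=3, num_filters_last=64, n_layers=3)
img = torch.randn(1, 3, 256, 256)
logits = disc(img)
print(logits.shape)  # torch.Size([1, 1, 30, 30]) for a 256x256 input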
swim/encoder.py
ADDED
@@ -0,0 +1,90 @@
import torch
import torch.nn as nn

from .blocks import DownsampleBlock, GroupNorm, AttentionBlock, ResnetBlock


class SwimEncoder(nn.Module):
    """
    The encoder part of the VQGAN.

    Args:
        img_channels (int): Number of channels in the input image.
        image_size (int): Size of the input image, only used in the encoder (height or width).
        latent_channels (int): Number of channels in the latent vector.
        intermediate_channels (list): List of channels in the intermediate layers.
        num_residual_blocks (int): Number of residual blocks between each downsample block.
        dropout (float): Dropout probability for residual blocks.
        attention_resolution (list): Tensor size (height or width) at which to add attention blocks.
    """

    def __init__(
        self,
        img_channels: int = 3,
        image_size: int = 256,
        latent_channels: int = 256,
        intermediate_channels: list = [128, 128, 256, 256, 512],
        num_residual_blocks: int = 2,
        dropout: float = 0.0,
        attention_resolution: list = [16],
    ):
        super().__init__()

        # Inserting the first intermediate channel again at index 0
        intermediate_channels.insert(0, intermediate_channels[0])

        # All the layers are appended to this list
        layers = []

        # Adding the first conv layer to increase the input channels to the first intermediate channels
        layers.append(
            nn.Conv2d(
                img_channels,
                intermediate_channels[0],
                kernel_size=3,
                stride=1,
                padding=1,
            )
        )

        # Loop over the intermediate channels except the last one
        for n in range(len(intermediate_channels) - 1):
            in_channels = intermediate_channels[n]
            out_channels = intermediate_channels[n + 1]

            # Adding the residual blocks for each channel
            for _ in range(num_residual_blocks):
                layers.append(ResnetBlock(in_channels, out_channels, dropout=dropout))
                in_channels = out_channels

            # Once we have downsampled the image to a size in attention_resolution, we add attention blocks
            if image_size in attention_resolution:
                layers.append(AttentionBlock(in_channels))

            # only downsample for the first n-2 layers, and decrease the input size by a factor of 2
            if n != len(intermediate_channels) - 2:
                layers.append(DownsampleBlock(intermediate_channels[n + 1]))
                image_size = image_size // 2  # Downsample by a factor of 2

        in_channels = intermediate_channels[-1]
        layers.extend(
            [
                ResnetBlock(
                    in_channels=in_channels, out_channels=in_channels, dropout=dropout
                ),
                AttentionBlock(in_channels=in_channels),
                ResnetBlock(
                    in_channels=in_channels, out_channels=in_channels, dropout=dropout
                ),
                GroupNorm(in_channels=in_channels),
                nn.SiLU(),
                # change the channels up to the latent vector channels
                nn.Conv2d(
                    in_channels, latent_channels, kernel_size=3, stride=1, padding=1
                ),
            ]
        )
        self.model = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.model(x)
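Not part of the commit: with the default intermediate channels the encoder downsamples four times (a factor of 16), so a 256x256 image becomes a 16x16, 256-channel latent. The sketch below illustrates that, assuming the package is importable as swim.encoder.

# Illustrative only: the default SwimEncoder reduces the spatial size by a factor of 16.
import torch
from swim.encoder import SwimEncoder

encoder = SwimEncoder(image_size=256)
x = torch.randn(1, 3, 256, 256)
z = encoder(x)
print(z.shape)  # torch.Size([1, 256, 16, 16])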
swim/unet.py
ADDED
@@ -0,0 +1,169 @@
import math
from typing import List

import torch
import torch.nn as nn
import torch.nn.functional as F

from .attention_blocks import SpatialTransformer
from .blocks import (
    DownSample,
    ResnetBlock,
    TimestepEmbedSequential,
    UpSample,
    Normalization,
    get_timestep_embedding,
)


class UNet(nn.Module):
    """
    ## U-Net model
    """

    def __init__(
        self,
        *,
        in_channels: int,
        out_channels: int,
        channels: int,
        n_res_blocks: int,
        attention_levels: List[int],
        channel_multipliers: List[int],
        n_heads: int,
        tf_layers: int = 1,
        d_cond: int = 768
    ):
        """
        :param in_channels: is the number of channels in the input feature map
        :param out_channels: is the number of channels in the output feature map
        :param channels: is the base channel count for the model
        :param n_res_blocks: number of residual blocks at each level
        :param attention_levels: are the levels at which attention should be performed
        :param channel_multipliers: are the multiplicative factors for number of channels for each level
        :param n_heads: is the number of attention heads in the transformers
        :param tf_layers: is the number of transformer layers in the transformers
        :param d_cond: is the size of the conditional embedding in the transformers
        """
        super().__init__()
        self.channels = channels

        # Number of levels
        levels = len(channel_multipliers)
        # Size of time embeddings
        d_time_emb = channels * 4
        self.time_embed = nn.Sequential(
            nn.Linear(channels, d_time_emb),
            nn.SiLU(),
            nn.Linear(d_time_emb, d_time_emb),
        )

        # Input half of the U-Net
        self.input_blocks = nn.ModuleList()
        # Initial $3 \times 3$ convolution that maps the input to `channels`.
        # The blocks are wrapped in `TimestepEmbedSequential` module because
        # different modules have different forward function signatures;
        # for example, convolution only accepts the feature map and
        # residual blocks accept the feature map and time embedding.
        # `TimestepEmbedSequential` calls them accordingly.
        self.input_blocks.append(
            TimestepEmbedSequential(nn.Conv2d(in_channels, channels, 3, padding=1))
        )
        # Number of channels at each block in the input half of U-Net
        input_block_channels = [channels]
        # Number of channels at each level
        channels_list = [channels * m for m in channel_multipliers]
        # Prepare levels
        for i in range(levels):
            # Add the residual blocks and attentions
            for _ in range(n_res_blocks):
                # Residual block maps from previous number of channels to the number of
                # channels in the current level
                layers = [
                    ResnetBlock(channels, d_time_emb, out_channels=channels_list[i])
                ]
                channels = channels_list[i]
                # Add transformer
                if i in attention_levels:
                    layers.append(
                        SpatialTransformer(channels, n_heads, tf_layers, d_cond)
                    )
                # Add them to the input half of the U-Net and keep track of the number of channels of
                # its output
                self.input_blocks.append(TimestepEmbedSequential(*layers))
                input_block_channels.append(channels)
            # Down-sample at all levels except last
            if i != levels - 1:
                self.input_blocks.append(TimestepEmbedSequential(DownSample(channels)))
                input_block_channels.append(channels)

        # The middle of the U-Net
        self.middle_block = TimestepEmbedSequential(
            ResnetBlock(channels, d_time_emb),
            SpatialTransformer(channels, n_heads, tf_layers, d_cond),
            ResnetBlock(channels, d_time_emb),
        )

        # Second half of the U-Net
        self.output_blocks = nn.ModuleList([])
        # Prepare levels in reverse order
        for i in reversed(range(levels)):
            # Add the residual blocks and attentions
            for j in range(n_res_blocks + 1):
                # Residual block maps from previous number of channels plus the
                # skip connections from the input half of U-Net to the number of
                # channels in the current level.
                layers = [
                    ResnetBlock(
                        channels + input_block_channels.pop(),
                        d_time_emb,
                        out_channels=channels_list[i],
                    )
                ]
                channels = channels_list[i]
                # Add transformer
                if i in attention_levels:
                    layers.append(
                        SpatialTransformer(channels, n_heads, tf_layers, d_cond)
                    )
                # Up-sample at every level after the last residual block,
                # except the last level.
                # Note that we are iterating in reverse; i.e. `i == 0` is the last.
                if i != 0 and j == n_res_blocks:
                    layers.append(UpSample(channels))
                # Add to the output half of the U-Net
                self.output_blocks.append(TimestepEmbedSequential(*layers))

        # Final normalization and $3 \times 3$ convolution
        self.out = nn.Sequential(
            Normalization(channels),
            nn.SiLU(),
            nn.Conv2d(channels, out_channels, 3, padding=1),
        )

    def forward(self, x: torch.Tensor, timesteps: torch.Tensor, cond: torch.Tensor):
        """
        :param x: is the input feature map of shape `[batch_size, channels, width, height]`
        :param timesteps: are the time steps of shape `[batch_size]`
        :param cond: conditioning of shape `[batch_size, n_cond, d_cond]`
        """
        # To store the input half outputs for skip connections
        x_input_block = []

        # Get time step embeddings
        t_emb = get_timestep_embedding(timesteps, self.channels * 2)
        t_emb = self.time_embed(t_emb)

        # Input half of the U-Net
        for module in self.input_blocks:
            x = module(x, t_emb, cond)
            x_input_block.append(x)
        # Middle of the U-Net
        x = self.middle_block(x, t_emb, cond)
        # Output half of the U-Net
        for module in self.output_blocks:
            x = torch.cat([x, x_input_block.pop()], dim=1)
            x = module(x, t_emb, cond)

        # Final normalization and $3 \times 3$ convolution
        return self.out(x)
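Not part of the commit: a rough idea of how such a U-Net is typically configured for latent diffusion, and the per-level channel counts the arguments imply. Every value below is an illustrative assumption (e.g. 4 latent channels, a 768-wide conditioning sequence), not something fixed by the code above.

# Illustrative only: hypothetical constructor arguments and the channel counts they imply.
unet_config = dict(
    in_channels=4,                # latent channels coming from an autoencoder
    out_channels=4,               # predicted noise has the same shape as the input latent
    channels=320,                 # base channel count
    n_res_blocks=2,               # residual blocks per level
    attention_levels=[0, 1, 2],   # levels that get a SpatialTransformer
    channel_multipliers=[1, 2, 4, 4],
    n_heads=8,
    tf_layers=1,
    d_cond=768,                   # width of the conditioning embeddings
)
channels_list = [unet_config["channels"] * m for m in unet_config["channel_multipliers"]]
print(channels_list)              # [320, 640, 1280, 1280]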
train.py
ADDED
@@ -0,0 +1,8 @@
import torch
from torchinfo import summary
from swim.encoder import SwimEncoder

encoder = SwimEncoder().to("meta")
sample = torch.randn(1, 3, 512, 512).to("meta")

summary(encoder, input_data=(sample,))