# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import math
import torch
from torch import nn
from .fvq import FactorizedVectorQuantize


class ResidualVQ(nn.Module):
    """Follows Algorithm 1 in https://arxiv.org/pdf/2107.03312.pdf"""

    def __init__(self, *, num_quantizers, codebook_size, **kwargs):
        super().__init__()
        VQ = FactorizedVectorQuantize
        # A single int is broadcast to one codebook size per quantizer stage.
        if isinstance(codebook_size, int):
            codebook_size = [codebook_size] * num_quantizers
        # Each entry is used as a bit width: stage i gets 2**codebook_size[i] codes.
        self.layers = nn.ModuleList(
            [VQ(codebook_size=2**size, **kwargs) for size in codebook_size]
        )
        self.num_quantizers = num_quantizers
        self.quantizer_dropout = kwargs.get("quantizer_dropout", 0.0)
        self.dropout_type = kwargs.get("dropout_type", None)
    def forward(self, x, n_quantizers=None):
        quantized_out = 0.0
        residual = x

        all_losses = []
        all_indices = []
        all_quantized = []

        if n_quantizers is None:
            n_quantizers = self.num_quantizers

        if self.training:
            # Quantizer dropout: train some batch items with a randomly reduced
            # number of stages so the codec can be run at variable bitrates.
            n_quantizers = torch.ones((x.shape[0],)) * self.num_quantizers + 1
            if self.dropout_type == "linear":
                dropout = torch.randint(1, self.num_quantizers + 1, (x.shape[0],))
            elif self.dropout_type == "exp":
                dropout = torch.randint(
                    1, int(math.log2(self.num_quantizers)), (x.shape[0],)
                )
                dropout = torch.pow(2, dropout)
            else:
                # Guard: without a dropout schedule, keep every stage for every item.
                dropout = torch.full((x.shape[0],), self.num_quantizers)
            n_dropout = int(x.shape[0] * self.quantizer_dropout)
            n_quantizers[:n_dropout] = dropout[:n_dropout]
            n_quantizers = n_quantizers.to(x.device)
        for idx, layer in enumerate(self.layers):
            # At inference time, stop once the requested number of stages is reached.
            if not self.training and idx >= n_quantizers:
                break

            quantized, indices, loss = layer(residual)

            # Per-item mask: True while this stage index is within the item's budget.
            mask = (
                torch.full((x.shape[0],), fill_value=idx, device=x.device)
                < n_quantizers
            )

            residual = residual - quantized
            quantized_out = quantized_out + quantized * mask[:, None, None]

            # Zero the loss of items beyond their quantizer budget before averaging.
            loss = (loss * mask).mean()

            all_indices.append(indices)
            all_losses.append(loss)
            all_quantized.append(quantized)

        all_losses, all_indices, all_quantized = map(
            torch.stack, (all_losses, all_indices, all_quantized)
        )

        return quantized_out, all_indices, all_losses, all_quantized
    def vq2emb(self, vq):
        # vq: [n_quantizers, B, T]
        quantized_out = 0.0
        for idx, layer in enumerate(self.layers):
            quantized = layer.vq2emb(vq[idx])
            quantized_out += quantized
        return quantized_out
    def get_emb(self):
        embs = []
        for idx, layer in enumerate(self.layers):
            embs.append(layer.get_emb())
        return embs