Spaces:

jadechoghari
/

OpenMusic

Running on Zero

App Files Files Community

jadechoghari committed on Sep 21, 2024

Commit

9b9e0ee

verified ·

1 Parent(s): c1af806

add model

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

qa_mdt/.gitattributes +35 -0
qa_mdt/README.md +37 -0
qa_mdt/audioldm_train/.DS_Store +0 -0
qa_mdt/audioldm_train/__init__.py +1 -0
qa_mdt/audioldm_train/__pycache__/__init__.cpython-310.pyc +0 -0
qa_mdt/audioldm_train/__pycache__/conditional_models.cpython-310.pyc +0 -0
qa_mdt/audioldm_train/__pycache__/dataset_plugin.cpython-310.pyc +0 -0
qa_mdt/audioldm_train/conditional_models.py +1354 -0
qa_mdt/audioldm_train/config/mos_as_token/qa_mdt.yaml +169 -0
qa_mdt/audioldm_train/dataset_plugin.py +508 -0
qa_mdt/audioldm_train/losses/__init__.py +1 -0
qa_mdt/audioldm_train/losses/__pycache__/__init__.cpython-310.pyc +0 -0
qa_mdt/audioldm_train/losses/__pycache__/contperceptual.cpython-310.pyc +0 -0
qa_mdt/audioldm_train/losses/contperceptual.py +160 -0
qa_mdt/audioldm_train/modules/.DS_Store +0 -0
qa_mdt/audioldm_train/modules/__init__.py +0 -0
qa_mdt/audioldm_train/modules/__pycache__/__init__.cpython-310.pyc +0 -0
qa_mdt/audioldm_train/modules/audiomae/AudioMAE.py +151 -0
qa_mdt/audioldm_train/modules/audiomae/README.md +24 -0
qa_mdt/audioldm_train/modules/audiomae/__init__.py +0 -0
qa_mdt/audioldm_train/modules/audiomae/__pycache__/AudioMAE.cpython-310.pyc +0 -0
qa_mdt/audioldm_train/modules/audiomae/__pycache__/__init__.cpython-310.pyc +0 -0
qa_mdt/audioldm_train/modules/audiomae/__pycache__/models_mae.cpython-310.pyc +0 -0
qa_mdt/audioldm_train/modules/audiomae/__pycache__/models_vit.cpython-310.pyc +0 -0
qa_mdt/audioldm_train/modules/audiomae/audiovisual_dataset.py +256 -0
qa_mdt/audioldm_train/modules/audiomae/example.py +52 -0
qa_mdt/audioldm_train/modules/audiomae/models_mae.py +615 -0
qa_mdt/audioldm_train/modules/audiomae/models_vit.py +252 -0
qa_mdt/audioldm_train/modules/audiomae/sequence_gen/__init__.py +2 -0
qa_mdt/audioldm_train/modules/audiomae/sequence_gen/__pycache__/__init__.cpython-310.pyc +0 -0
qa_mdt/audioldm_train/modules/audiomae/sequence_gen/__pycache__/model.cpython-310.pyc +0 -0
qa_mdt/audioldm_train/modules/audiomae/sequence_gen/__pycache__/sequence_input.cpython-310.pyc +0 -0
qa_mdt/audioldm_train/modules/audiomae/sequence_gen/model.py +329 -0
qa_mdt/audioldm_train/modules/audiomae/sequence_gen/sequence_input.py +737 -0
qa_mdt/audioldm_train/modules/audiomae/util/__pycache__/patch_embed.cpython-310.pyc +0 -0
qa_mdt/audioldm_train/modules/audiomae/util/__pycache__/pos_embed.cpython-310.pyc +0 -0
qa_mdt/audioldm_train/modules/audiomae/util/crop.py +43 -0
qa_mdt/audioldm_train/modules/audiomae/util/datasets.py +67 -0
qa_mdt/audioldm_train/modules/audiomae/util/lars.py +60 -0
qa_mdt/audioldm_train/modules/audiomae/util/lr_decay.py +78 -0
qa_mdt/audioldm_train/modules/audiomae/util/lr_sched.py +28 -0
qa_mdt/audioldm_train/modules/audiomae/util/misc.py +454 -0
qa_mdt/audioldm_train/modules/audiomae/util/patch_embed.py +127 -0
qa_mdt/audioldm_train/modules/audiomae/util/pos_embed.py +205 -0
qa_mdt/audioldm_train/modules/audiomae/util/stat.py +77 -0
qa_mdt/audioldm_train/modules/clap/__init__.py +0 -0
qa_mdt/audioldm_train/modules/clap/__pycache__/__init__.cpython-310.pyc +0 -0
qa_mdt/audioldm_train/modules/clap/open_clip/__init__.py +25 -0
qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/__init__.cpython-310.pyc +0 -0
qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/__init__.cpython-38.pyc +0 -0

qa_mdt/.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

qa_mdt/README.md ADDED Viewed

	@@ -0,0 +1,37 @@

+---
+library_name: diffusers
+tags:
+- music
+---
+# Hugging Face Diffusers Implementation of QA-MDT
+**QA-MDT: Quality-Aware Diffusion for Text-to-Music 🎶**
+QA-MDT brings a new approach to text-to-music generation by using quality-aware training to tackle issues like low-fidelity audio and weak labeling in datasets.
+With a masked diffusion transformer (MDT), QA-MDT delivers SOTA results on MusicCaps and Song-Describer, enhancing both quality and musicality.
+## Usage:
+```bash
+!git lfs install
+!git clone https://huggingface.co/jadechoghari/qa-mdt
+```
+```bash
+pip install -r qa_mdt/requirements.txt
+pip install xformers==0.0.26.post1
+pip install torchlibrosa==0.0.9 librosa==0.9.2
+pip install -q pytorch_lightning==2.1.3 torchlibrosa==0.0.9 librosa==0.9.2 ftfy==6.1.1 braceexpand
+pip install torch==2.3.0+cu121 torchvision==0.18.0+cu121 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cu121
+```
+```python
+from qa_mdt.pipeline import MOSDiffusionPipeline
+pipe = MOSDiffusionPipeline()
+pipe("A modern synthesizer creating futuristic soundscapes.")
+```
+# Enjoy the music!! 🎶

qa_mdt/audioldm_train/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

qa_mdt/audioldm_train/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from . import utilities

qa_mdt/audioldm_train/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (169 Bytes). View file

qa_mdt/audioldm_train/__pycache__/conditional_models.cpython-310.pyc ADDED Viewed

Binary file (29.2 kB). View file

qa_mdt/audioldm_train/__pycache__/dataset_plugin.cpython-310.pyc ADDED Viewed

Binary file (10.9 kB). View file

qa_mdt/audioldm_train/conditional_models.py ADDED Viewed

	@@ -0,0 +1,1354 @@

+import sys
+sys.path.append("src")
+import torch
+import logging
+import torch.nn as nn
+from qa_mdt.audioldm_train.modules.clap.open_clip import create_model
+from qa_mdt.audioldm_train.modules.clap.training.data import get_audio_features
+import torchaudio
+from transformers import (
+    RobertaTokenizer,
+    AutoTokenizer,
+    T5EncoderModel,
+    MT5EncoderModel,
+)
+import torch.nn.functional as F
+from qa_mdt.audioldm_train.modules.audiomae.AudioMAE import Vanilla_AudioMAE
+from qa_mdt.audioldm_train.modules.phoneme_encoder.encoder import TextEncoder
+from transformers import SpeechT5Processor, AutoTokenizer, GPT2Model, GPT2Tokenizer
+from transformers.models.speecht5.modeling_speecht5 import SpeechT5EncoderWithTextPrenet
+from qa_mdt.audioldm_train.modules.audiomae.sequence_gen.model import CLAP2AudioMAE
+from qa_mdt.audioldm_train.modules.audiomae.sequence_gen.sequence_input import (
+    Sequence2AudioMAE,
+)
+import numpy as np
+from qa_mdt.audioldm_train.modules.audiomae.sequence_gen.model import Prenet
+import json
+# Read the JSON config once at import time. The path is relative, so it is
+# resolved against the current working directory of the process.
+# NOTE(review): presumably maps encoder names (e.g. 'flan_t5') to checkpoint
+# locations -- confirm against offset_pretrained_checkpoints.json.
+with open('./qa_mdt/offset_pretrained_checkpoints.json', 'r') as config_file:
+    config_data = json.load(config_file)
+"""
+The model forward function can return three types of data:
+1. tensor: used directly as the conditioning signal.
+2. dict: one main key holds the condition; other keys can be used to pass loss values, intermediate results, etc.
+3. list: of length 2, in which the first element is a tensor and the second element is its attention mask.
+The output shape for the cross attention condition should be:
+x, x_mask = [bs, seq_len, emb_dim], [bs, seq_len]
+All returned data that will be used as diffusion input must be of float type.
+"""
+class GPT2WordEmbedding(nn.Module):
+    def __init__(self):
+        super().__init__()
+        # self.tokenizer = AutoTokenizer.from_pretrained("gpt2")
+        self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+        self.tokenizer.pad_token = self.tokenizer.eos_token
+        self.model = GPT2Model.from_pretrained("gpt2").wte
+        self.device = None
+    def get_unconditional_condition(self, batchsize):
+        unconditional_condition = ["random"] * batchsize
+        return self(unconditional_condition)
+    def forward(self, text):
+        assert isinstance(text, list)
+        if self.device is None:
+            self.device = next(self.model.parameters()).device
+        tokenization_result = self.tokenizer(text, return_tensors="pt", padding=True)
+        input_ids, attn_mask = tokenization_result["input_ids"].to(
+            self.device
+        ), tokenization_result["attention_mask"].to(self.device)
+        input_embed = self.model(input_ids.long())
+        return [input_embed, attn_mask]
+class ConcateBandWidthCond(nn.Module):
+    def __init__(self, latent_t_size, latent_f_size):
+        super().__init__()
+        self.placeholder = nn.Linear(1, 1)
+        self.latent_t_size = latent_t_size
+        self.latent_f_size = latent_f_size
+        self.device = None
+    def get_unconditional_condition(self, batchsize):
+        return torch.zeros((batchsize, self.latent_t_size, self.latent_f_size)).to(
+            self.device
+        )
+    def forward(self, mel_spec_bandwidth_cond_extra_channel):
+        if self.device is None:
+            self.device = mel_spec_bandwidth_cond_extra_channel.device
+        return mel_spec_bandwidth_cond_extra_channel
+class BandwidthEncoder(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.emb = nn.Embedding(1000, 128)
+        nn.init.normal_(self.emb.weight, 0.0, 128**-0.5)
+        self.linear_bandwidth = nn.Linear(128, 128)
+        self.unconditional_condition = torch.zeros((1, 256))
+        self.device = None
+    def get_unconditional_condition(self, batchsize):
+        return self.unconditional_condition.expand(batchsize, 256)
+    def forward(self, bandwidth):
+        if self.device is None:
+            self.device = next(self.linear_bandwidth.parameters()).device
+            self.unconditional_condition = self.unconditional_condition.to(self.device)
+        # freq_energy_percentile
+        lower_cutoff, higher_cutoff = bandwidth[..., 0], bandwidth[..., 1]
+        # lower_cutoff, higher_cutoff = lower_cutoff*0+5, higher_cutoff*0+300
+        lower_cutoff_emb = self.linear_bandwidth(self.emb(lower_cutoff.long()))
+        higher_cutoff_emb = self.linear_bandwidth(self.emb(higher_cutoff.long()))
+        cutoff_emb = torch.cat([lower_cutoff_emb, higher_cutoff_emb], dim=-1)
+        # [bs, 256]
+        return cutoff_emb
+class SpeechT5TextEncoder(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+        self.model = SpeechT5EncoderWithTextPrenet.from_pretrained(
+            "microsoft/speecht5_tts"
+        )
+        for p in self.model.parameters():
+            p.requires_grad = False
+        self.model.eval()
+    # Required
+    def get_unconditional_condition(self, batchsize):
+        device = self.model.device
+        hidden_state = torch.zeros((batchsize, 1, 768)).to(device)
+        attention_mask = torch.ones((batchsize, 1)).to(device)
+        return [hidden_state.float(), attention_mask.float()]
+    def forward(self, text):
+        with torch.no_grad():
+            device = self.model.device
+            inputs = self.processor(text=text, return_tensors="pt", padding=True)
+            input_ids, attention_mask = inputs["input_ids"].to(device), inputs[
+                "attention_mask"
+            ].to(device)
+            emb = self.model(input_ids, attention_mask)
+            emb = emb.last_hidden_state.detach()
+        return [emb.float(), attention_mask.float()]
+class PhonemeEncoder(nn.Module):
+    def __init__(self, vocabs_size=41, pad_length=250, pad_token_id=None):
+        super().__init__()
+        """
+            encoder = PhonemeEncoder(40)
+            data = torch.randint(0, 39, (2, 250))
+            output = encoder(data)
+            import ipdb;ipdb.set_trace()
+        """
+        assert pad_token_id is not None
+        self.device = None
+        self.PAD_LENGTH = int(pad_length)
+        self.pad_token_id = pad_token_id
+        self.pad_token_sequence = torch.tensor([self.pad_token_id] * self.PAD_LENGTH)
+        self.text_encoder = TextEncoder(
+            n_vocab=vocabs_size,
+            out_channels=192,
+            hidden_channels=192,
+            filter_channels=768,
+            n_heads=2,
+            n_layers=6,
+            kernel_size=3,
+            p_dropout=0.1,
+        )
+        self.learnable_positional_embedding = torch.nn.Parameter(
+            torch.zeros((1, 192, self.PAD_LENGTH))
+        )  # [batchsize, seqlen, padlen]
+        self.learnable_positional_embedding.requires_grad = True
+    # Required
+    def get_unconditional_condition(self, batchsize):
+        unconditional_tokens = self.pad_token_sequence.expand(
+            batchsize, self.PAD_LENGTH
+        )
+        return self(unconditional_tokens)  # Need to return float type
+    # def get_unconditional_condition(self, batchsize):
+    #     hidden_state = torch.zeros((batchsize, self.PAD_LENGTH, 192)).to(self.device)
+    #     attention_mask = torch.ones((batchsize, self.PAD_LENGTH)).to(self.device)
+    #     return [hidden_state, attention_mask] # Need to return float type
+    def _get_src_mask(self, phoneme):
+        src_mask = phoneme != self.pad_token_id
+        return src_mask
+    def _get_src_length(self, phoneme):
+        src_mask = self._get_src_mask(phoneme)
+        length = torch.sum(src_mask, dim=-1)
+        return length
+    # def make_empty_condition_unconditional(self, src_length, text_emb, attention_mask):
+    #     # src_length: [bs]
+    #     # text_emb: [bs, 192, pad_length]
+    #     # attention_mask: [bs, pad_length]
+    #     mask = src_length[..., None, None] > 1
+    #     text_emb = text_emb * mask
+    #     attention_mask[src_length < 1] = attention_mask[src_length < 1] * 0.0 + 1.0
+    #     return text_emb, attention_mask
+    def forward(self, phoneme_idx):
+        if self.device is None:
+            self.device = self.learnable_positional_embedding.device
+            self.pad_token_sequence = self.pad_token_sequence.to(self.device)
+        src_length = self._get_src_length(phoneme_idx)
+        text_emb, m, logs, text_emb_mask = self.text_encoder(phoneme_idx, src_length)
+        text_emb = text_emb + self.learnable_positional_embedding
+        # text_emb, text_emb_mask = self.make_empty_condition_unconditional(src_length, text_emb, text_emb_mask)
+        return [
+            text_emb.permute(0, 2, 1),
+            text_emb_mask.squeeze(1),
+        ]  # [2, 250, 192], [2, 250]
+class FlanT5HiddenState(nn.Module):
+    """
+    llama = FlanT5HiddenState()
+    data = ["","this is not an empty sentence"]
+    encoder_hidden_states = llama(data)
+    import ipdb;ipdb.set_trace()
+    """
+    def __init__(
+        self, text_encoder_name=config_data['flan_t5'], freeze_text_encoder=True
+    ):
+        super().__init__()
+        self.freeze_text_encoder = freeze_text_encoder
+        ## MODIFIED
+        self.tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
+        self.model = T5EncoderModel.from_pretrained("google/flan-t5-large")
+        if freeze_text_encoder:
+            self.model.eval()
+            for p in self.model.parameters():
+                p.requires_grad = False
+        else:
+            print("=> The text encoder is learnable")
+        self.empty_hidden_state_cfg = None
+        self.device = None
+    # Required
+    def get_unconditional_condition(self, batchsize):
+        param = next(self.model.parameters())
+        if self.freeze_text_encoder:
+            assert param.requires_grad == False
+        # device = param.device
+        if self.empty_hidden_state_cfg is None:
+            self.empty_hidden_state_cfg, _ = self([""])
+        hidden_state = torch.cat([self.empty_hidden_state_cfg] * batchsize).float()
+        attention_mask = (
+            torch.ones((batchsize, hidden_state.size(1)))
+            .to(hidden_state.device)
+            .float()
+        )
+        return [hidden_state, attention_mask]  # Need to return float type
+    def forward(self, batch):
+        param = next(self.model.parameters())
+        if self.freeze_text_encoder:
+            assert param.requires_grad == False
+        if self.device is None:
+            self.device = param.device
+        # print("Manually change text")
+        # for i in range(len(batch)):
+        #     batch[i] = "dog barking"
+        try:
+            return self.encode_text(batch)
+        except Exception as e:
+            print(e, batch)
+            logging.exception("An error occurred: %s", str(e))
+    def encode_text(self, prompt):
+        device = self.model.device
+        batch = self.tokenizer(
+            prompt,
+            max_length=128,  # self.tokenizer.model_max_length
+            padding=True,
+            truncation=True,
+            return_tensors="pt",
+        )
+        input_ids, attention_mask = batch.input_ids.to(device), batch.attention_mask.to(
+            device
+        )
+        # Get text encoding
+        if self.freeze_text_encoder:
+            with torch.no_grad():
+                encoder_hidden_states = self.model(
+                    input_ids=input_ids, attention_mask=attention_mask
+                )[0]
+        else:
+            encoder_hidden_states = self.model(
+                input_ids=input_ids, attention_mask=attention_mask
+            )[0]
+        return [
+            encoder_hidden_states.detach(),
+            attention_mask.float(),
+        ]  # Attention mask == 1 means usable token
+class FlanT5HiddenStatePaddedSameLength(nn.Module):
+    """
+    llama = FlanT5HiddenState()
+    data = ["","this is not an empty sentence"]
+    encoder_hidden_states = llama(data)
+    import ipdb;ipdb.set_trace()
+    """
+    def __init__(
+        self, text_encoder_name="google/flan-t5-large", freeze_text_encoder=True
+    ):
+        super().__init__()
+        self.freeze_text_encoder = freeze_text_encoder
+        self.tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
+        self.model = T5EncoderModel.from_pretrained("google/flan-t5-large")
+        if freeze_text_encoder:
+            self.model.eval()
+            for p in self.model.parameters():
+                p.requires_grad = False
+        else:
+            print("=> The text encoder is learnable")
+        self.empty_hidden_state_cfg = None
+        self.device = None
+    # Required
+    def get_unconditional_condition(self, batchsize):
+        param = next(self.model.parameters())
+        if self.freeze_text_encoder:
+            assert param.requires_grad == False
+        # device = param.device
+        if self.empty_hidden_state_cfg is None:
+            self.empty_hidden_state_cfg, _ = self([""])
+        hidden_state = torch.cat([self.empty_hidden_state_cfg] * batchsize).float()
+        attention_mask = (
+            torch.ones((batchsize, hidden_state.size(1)))
+            .to(hidden_state.device)
+            .float()
+        )
+        return [hidden_state, attention_mask]  # Need to return float type
+    def forward(self, batch):
+        param = next(self.model.parameters())
+        if self.freeze_text_encoder:
+            assert param.requires_grad == False
+        if self.device is None:
+            self.device = param.device
+        # print("Manually change text")
+        # for i in range(len(batch)):
+        #     batch[i] = "dog barking"
+        try:
+            text_embed = self.encode_text(batch)
+            return text_embed
+        except Exception as e:
+            print(e, batch)
+            logging.exception("An error occurred: %s", str(e))
+    def encode_text(self, prompt):
+        device = self.model.device
+        batch = self.tokenizer(
+            prompt,
+            max_length=128,
+            padding="max_length",
+            truncation=True,
+            return_tensors="pt",
+        )
+        input_ids, attention_mask = batch.input_ids.to(device), batch.attention_mask.to(
+            device
+        )
+        # Get text encoding
+        if self.freeze_text_encoder:
+            with torch.no_grad():
+                encoder_hidden_states = self.model(
+                    input_ids=input_ids, attention_mask=attention_mask
+                )[0]
+        else:
+            encoder_hidden_states = self.model(
+                input_ids=input_ids, attention_mask=attention_mask
+            )[0]
+        return [
+            encoder_hidden_states.detach(),
+            attention_mask.float(),
+        ]  # Attention mask == 1 means usable token
+class CLAPGenAudioMAECond(CLAP2AudioMAE):
+    def __init__(
+        self,
+        cond_stage_config,
+        learnable=True,
+        pretrained_path=None,
+        use_gt_mae_output=None,  # False: does not use AudioMAE GT, True: Use AudioMAE GT
+        use_gt_mae_prob=None,
+    ):  # The prob of using AudioMAE GT
+        super().__init__(base_learning_rate=1e-5, cond_stage_config=cond_stage_config)
+        assert use_gt_mae_output is not None and use_gt_mae_prob is not None
+        if pretrained_path is not None:
+            print("Reload CLAPGenAudioMAECond from %s" % pretrained_path)
+            state_dict = torch.load(pretrained_path)["state_dict"]
+            self.load_state_dict(state_dict)
+        self.use_gt_mae_output = use_gt_mae_output
+        self.use_gt_mae_prob = use_gt_mae_prob
+        self.learnable = learnable
+        if not learnable:
+            # Only optimize the GPT2 model
+            for p in self.model.parameters():
+                p.requires_grad = False
+            self.eval()
+    # Required
+    def get_unconditional_condition(self, batchsize):
+        return_dict = self.cfg_uncond(batchsize)
+        return return_dict
+    def forward(self, batch):
+        # The conditional module can return both tensor or dictionaries
+        # The returned tensor will be corresponding to the cond_stage_key
+        # The returned dict will have keys that correspond to the cond_stage_key
+        ret_dict = {}
+        if self.use_gt_mae_output and torch.rand(1).item() < self.use_gt_mae_prob:
+            cond_dict = self.get_input(batch)
+            # Used as condition
+            ret_dict["crossattn_clap_to_audiomae_feature"] = [
+                cond_dict["crossattn_audiomae_pooled"][0],
+                torch.ones_like(cond_dict["crossattn_audiomae_pooled"][1]).float(),
+            ]  # Input sequence and mask
+        else:
+            # Used as condition
+            input_embeds, cond_dict = self.generate(batch)
+            input_embeds_mask = (
+                torch.ones((input_embeds.size(0), input_embeds.size(1)))
+                .to(input_embeds.device)
+                .float()
+            )
+            ret_dict["crossattn_clap_to_audiomae_feature"] = [
+                input_embeds,
+                input_embeds_mask,
+            ]  # Input sequence and mask
+        # If the following two keys are not in cond_stage_key, then they will not be used as condition
+        ret_dict["film_clap_cond1"] = cond_dict[
+            "film_clap_cond1"
+        ]  # the clap target latent
+        ret_dict["crossattn_audiomae_pooled"] = cond_dict[
+            "crossattn_audiomae_pooled"
+        ]  # audiomae target latent
+        if self.learnable and self.training:
+            loss = self.training_step(batch, cond_dict=cond_dict)
+            ret_dict["noncond_loss_clap2audiomae"] = loss
+        return ret_dict
+class SequenceGenAudioMAECond(Sequence2AudioMAE):
+    def __init__(
+        self,
+        cond_stage_config,
+        base_learning_rate,
+        sequence_gen_length,
+        sequence_input_key,
+        sequence_input_embed_dim,
+        batchsize,
+        always_output_audiomae_gt=False,
+        pretrained_path=None,
+        force_reload_pretrain_avoid_overwrite=False,
+        learnable=True,
+        use_warmup=True,
+        use_gt_mae_output=None,  # False: does not use AudioMAE GT, True: Use AudioMAE GT
+        use_gt_mae_prob=None,
+    ):  # The prob of using AudioMAE GT
+        if use_warmup:
+            print(
+                "Warning: You didn't initialize sequence prediction module with trainer. Set warmup to False. You can still use the warmup scheme from the latent diffusion model."
+            )
+            use_warmup = False
+        super().__init__(
+            base_learning_rate=base_learning_rate,
+            cond_stage_config=cond_stage_config,
+            sequence_gen_length=sequence_gen_length,
+            sequence_input_key=sequence_input_key,
+            use_warmup=use_warmup,
+            sequence_input_embed_dim=sequence_input_embed_dim,
+            batchsize=batchsize,
+        )
+        assert use_gt_mae_output is not None and use_gt_mae_prob is not None
+        self.always_output_audiomae_gt = always_output_audiomae_gt
+        self.force_reload_pretrain_avoid_overwrite = (
+            force_reload_pretrain_avoid_overwrite
+        )
+        self.pretrained_path = pretrained_path
+        if self.force_reload_pretrain_avoid_overwrite:
+            self.is_reload = False
+        else:
+            self.is_reload = True
+        self.load_pretrain_model()
+        self.use_gt_mae_output = use_gt_mae_output
+        self.use_gt_mae_prob = use_gt_mae_prob
+        self.learnable = learnable
+        if not learnable:
+            # Only optimize the GPT2 model
+            for p in self.model.parameters():
+                p.requires_grad = False
+            self.eval()
+    def load_pretrain_model(self):
+        if self.pretrained_path is not None:
+            print("Reload SequenceGenAudioMAECond from %s" % self.pretrained_path)
+            state_dict = torch.load(self.pretrained_path)["state_dict"]
+            self.load_state_dict(state_dict)
+    # Required
+    def get_unconditional_condition(self, batchsize):
+        return_dict = self.cfg_uncond(batchsize)
+        return_dict["crossattn_audiomae_generated"] = [
+            return_dict["crossattn_audiomae_pooled"][0],
+            torch.ones_like(return_dict["crossattn_audiomae_pooled"][1]).float(),
+        ]
+        return return_dict
+    def forward(self, batch):
+        # The conditional module can return both tensor or dictionaries
+        # The returned tensor will be corresponding to the cond_stage_key
+        # The returned dict will have keys that correspond to the cond_stage_key
+        ret_dict = {}
+        if self.force_reload_pretrain_avoid_overwrite and not self.is_reload:
+            self.load_pretrain_model()
+            self.is_reload = True
+        self.check_module_param_update()
+        if self.always_output_audiomae_gt or (
+            self.use_gt_mae_output and torch.rand(1).item() < self.use_gt_mae_prob
+        ):
+            cond_dict = self.get_input(batch)
+            ret_dict["crossattn_audiomae_generated"] = [
+                cond_dict["crossattn_audiomae_pooled"][0],
+                torch.ones_like(cond_dict["crossattn_audiomae_pooled"][1]).float(),
+            ]  # Input sequence and mask
+            # _, output = self.training_step(batch, cond_dict=cond_dict, return_output=True)
+            # ret_dict["crossattn_audiomae_generated"] = [output, torch.ones_like(cond_dict["crossattn_audiomae_pooled"][1]).float()] # Input sequence and mask
+        else:
+            if not self.training:
+                print("--------------> Generate !!!!!!!!!!!!")
+            input_embeds, cond_dict = self.generate(batch)
+            # print("Generate Partial!!!!"); input_embeds, cond_dict = self.generate_partial(batch)
+            input_embeds_mask = (
+                torch.ones((input_embeds.size(0), input_embeds.size(1)))
+                .to(input_embeds.device)
+                .float()
+            )
+            ret_dict["crossattn_audiomae_generated"] = [
+                input_embeds,
+                input_embeds_mask,
+            ]  # Input sequence and mask
+        # If the following two keys are not in cond_stage_key, then they will not be used as condition
+        for key in cond_dict.keys():
+            ret_dict[key] = cond_dict[key]
+        if self.learnable and self.training:
+            loss = self.training_step(batch, cond_dict=cond_dict)
+            ret_dict["noncond_loss_clap2audiomae"] = loss
+        return ret_dict
+class SequenceGenAudioMAECond_AudioMAE_PostNet(Sequence2AudioMAE):
+    def __init__(
+        self,
+        cond_stage_config,
+        base_learning_rate,
+        sequence_gen_length,
+        sequence_input_key,
+        sequence_input_embed_dim,
+        batchsize,
+        always_output_audiomae_gt=False,
+        pretrained_path=None,
+        use_ar_gen_loss=False,
+        force_reload_pretrain_avoid_overwrite=False,
+        learnable=True,
+        use_warmup=True,
+        use_gt_mae_output=None,  # False: does not use AudioMAE GT, True: Use AudioMAE GT
+        use_gt_mae_prob=None,
+    ):  # The prob of using AudioMAE GT
+        if use_warmup:
+            print(
+                "Warning: You didn't initialize sequence prediction module with trainer. Set warmup to False. You can still use the warmup scheme from the latent diffusion model."
+            )
+            use_warmup = False
+        super().__init__(
+            base_learning_rate=base_learning_rate,
+            cond_stage_config=cond_stage_config,
+            sequence_gen_length=sequence_gen_length,
+            sequence_input_key=sequence_input_key,
+            use_ar_gen_loss=use_ar_gen_loss,
+            use_warmup=use_warmup,
+            sequence_input_embed_dim=sequence_input_embed_dim,
+            batchsize=batchsize,
+        )
+        assert use_gt_mae_output is not None and use_gt_mae_prob is not None
+        self.always_output_audiomae_gt = always_output_audiomae_gt
+        self.force_reload_pretrain_avoid_overwrite = (
+            force_reload_pretrain_avoid_overwrite
+        )
+        self.pretrained_path = pretrained_path
+        if self.force_reload_pretrain_avoid_overwrite:
+            self.is_reload = False
+        else:
+            self.is_reload = True
+        self.load_pretrain_model()
+        self.prenet = Prenet(in_dim=768, sizes=[768, 768, 768], dropout_rate=0.5)
+        self.use_gt_mae_output = use_gt_mae_output
+        self.use_gt_mae_prob = use_gt_mae_prob
+        self.learnable = learnable
+        if not learnable:
+            # Only optimize the GPT2 model
+            for p in self.model.parameters():
+                p.requires_grad = False
+            self.eval()
+    def load_pretrain_model(self):
+        if self.pretrained_path is not None:
+            print("Reload SequenceGenAudioMAECond from %s" % self.pretrained_path)
+            state_dict = torch.load(self.pretrained_path)["state_dict"]
+            self.load_state_dict(state_dict)
+    # Required
+    def get_unconditional_condition(self, batchsize):
+        return_dict = self.cfg_uncond(batchsize)
+        return_dict["crossattn_audiomae_generated"] = [
+            return_dict["crossattn_audiomae_pooled"][0],
+            torch.ones_like(return_dict["crossattn_audiomae_pooled"][1]).float(),
+        ]
+        return return_dict
+    def forward(self, batch):
+        # The conditional module can return both tensor or dictionaries
+        # The returned tensor will be corresponding to the cond_stage_key
+        # The returned dict will have keys that correspond to the cond_stage_key
+        ret_dict = {}
+        if self.force_reload_pretrain_avoid_overwrite and not self.is_reload:
+            self.load_pretrain_model()
+            self.is_reload = True
+        self.check_module_param_update()
+        if self.always_output_audiomae_gt or (
+            self.use_gt_mae_output and torch.rand(1).item() < self.use_gt_mae_prob
+        ):
+            cond_dict = self.get_input(batch)
+            gt_audiomae = self.prenet(cond_dict["crossattn_audiomae_pooled"][0])
+            ret_dict["crossattn_audiomae_generated"] = [
+                gt_audiomae,
+                torch.ones_like(cond_dict["crossattn_audiomae_pooled"][1]).float(),
+            ]  # Input sequence and mask
+        else:
+            print("--------------> Generate!!!!!!!!!!!!")
+            input_embeds, cond_dict = self.generate(batch)
+            # input_embeds, cond_dict = self.generate_partial(batch)
+            input_embeds = self.prenet(input_embeds)
+            input_embeds_mask = (
+                torch.ones((input_embeds.size(0), input_embeds.size(1)))
+                .to(input_embeds.device)
+                .float()
+            )
+            ret_dict["crossattn_audiomae_generated"] = [
+                input_embeds,
+                input_embeds_mask,
+            ]  # Input sequence and mask
+        # If the following two keys are not in cond_stage_key, then they will not be used as condition
+        for key in cond_dict.keys():
+            ret_dict[key] = cond_dict[key]
+        if self.learnable and self.training:
+            loss = self.training_step(batch, cond_dict=cond_dict)
+            ret_dict["noncond_loss_clap2audiomae"] = loss
+        return ret_dict
+class AudioMAEConditionCTPoolRandTFSeparated(nn.Module):
+    """
+    audiomae = AudioMAEConditionCTPool2x2()
+    data = torch.randn((4, 1024, 128))
+    output = audiomae(data)
+    import ipdb;ipdb.set_trace()
+    exit(0)
+    """
+    def __init__(
+        self,
+        time_pooling_factors=[1, 2, 4, 8],
+        freq_pooling_factors=[1, 2, 4, 8],
+        eval_time_pooling=None,
+        eval_freq_pooling=None,
+        mask_ratio=0.0,
+        regularization=False,
+        no_audiomae_mask=True,
+        no_audiomae_average=False,
+    ):
+        super().__init__()
+        self.device = None
+        self.time_pooling_factors = time_pooling_factors
+        self.freq_pooling_factors = freq_pooling_factors
+        self.no_audiomae_mask = no_audiomae_mask
+        self.no_audiomae_average = no_audiomae_average
+        self.eval_freq_pooling = eval_freq_pooling
+        self.eval_time_pooling = eval_time_pooling
+        self.mask_ratio = mask_ratio
+        self.use_reg = regularization
+        self.audiomae = Vanilla_AudioMAE()
+        self.audiomae.eval()
+        for p in self.audiomae.parameters():
+            p.requires_grad = False
+    # Required
+    def get_unconditional_condition(self, batchsize):
+        param = next(self.audiomae.parameters())
+        assert param.requires_grad == False
+        device = param.device
+        # time_pool, freq_pool = max(self.time_pooling_factors), max(self.freq_pooling_factors)
+        time_pool, freq_pool = min(self.eval_time_pooling, 64), min(
+            self.eval_freq_pooling, 8
+        )
+        # time_pool = self.time_pooling_factors[np.random.choice(list(range(len(self.time_pooling_factors))))]
+        # freq_pool = self.freq_pooling_factors[np.random.choice(list(range(len(self.freq_pooling_factors))))]
+        token_num = int(512 / (time_pool * freq_pool))
+        return [
+            torch.zeros((batchsize, token_num, 768)).to(device).float(),
+            torch.ones((batchsize, token_num)).to(device).float(),
+        ]
+    def pool(self, representation, time_pool=None, freq_pool=None):
+        assert representation.size(-1) == 768
+        representation = representation[:, 1:, :].transpose(1, 2)
+        bs, embedding_dim, token_num = representation.size()
+        representation = representation.reshape(bs, embedding_dim, 64, 8)
+        if self.training:
+            if time_pool is None and freq_pool is None:
+                time_pool = min(
+                    64,
+                    self.time_pooling_factors[
+                        np.random.choice(list(range(len(self.time_pooling_factors))))
+                    ],
+                )
+                freq_pool = min(
+                    8,
+                    self.freq_pooling_factors[
+                        np.random.choice(list(range(len(self.freq_pooling_factors))))
+                    ],
+                )
+                # freq_pool = min(8, time_pool) # TODO here I make some modification.
+        else:
+            time_pool, freq_pool = min(self.eval_time_pooling, 64), min(
+                self.eval_freq_pooling, 8
+            )
+        self.avgpooling = nn.AvgPool2d(
+            kernel_size=(time_pool, freq_pool), stride=(time_pool, freq_pool)
+        )
+        self.maxpooling = nn.MaxPool2d(
+            kernel_size=(time_pool, freq_pool), stride=(time_pool, freq_pool)
+        )
+        pooled = (
+            self.avgpooling(representation) + self.maxpooling(representation)
+        ) / 2  # [bs, embedding_dim, time_token_num, freq_token_num]
+        pooled = pooled.flatten(2).transpose(1, 2)
+        return pooled  # [bs, token_num, embedding_dim]
+    def regularization(self, x):
+        assert x.size(-1) == 768
+        x = F.normalize(x, p=2, dim=-1)
+        return x
+    # Required
+    def forward(self, batch, time_pool=None, freq_pool=None):
+        assert batch.size(-2) == 1024 and batch.size(-1) == 128
+        if self.device is None:
+            self.device = batch.device
+        batch = batch.unsqueeze(1)
+        with torch.no_grad():
+            representation = self.audiomae(
+                batch,
+                mask_ratio=self.mask_ratio,
+                no_mask=self.no_audiomae_mask,
+                no_average=self.no_audiomae_average,
+            )
+            representation = self.pool(representation, time_pool, freq_pool)
+            if self.use_reg:
+                representation = self.regularization(representation)
+            return [
+                representation,
+                torch.ones((representation.size(0), representation.size(1)))
+                .to(representation.device)
+                .float(),
+            ]
+class AudioMAEConditionCTPoolRand(nn.Module):
+    """
+    audiomae = AudioMAEConditionCTPool2x2()
+    data = torch.randn((4, 1024, 128))
+    output = audiomae(data)
+    import ipdb;ipdb.set_trace()
+    exit(0)
+    """
+    def __init__(
+        self,
+        time_pooling_factors=[1, 2, 4, 8],
+        freq_pooling_factors=[1, 2, 4, 8],
+        eval_time_pooling=None,
+        eval_freq_pooling=None,
+        mask_ratio=0.0,
+        regularization=False,
+        no_audiomae_mask=True,
+        no_audiomae_average=False,
+    ):
+        super().__init__()
+        self.device = None
+        self.time_pooling_factors = time_pooling_factors
+        self.freq_pooling_factors = freq_pooling_factors
+        self.no_audiomae_mask = no_audiomae_mask
+        self.no_audiomae_average = no_audiomae_average
+        self.eval_freq_pooling = eval_freq_pooling
+        self.eval_time_pooling = eval_time_pooling
+        self.mask_ratio = mask_ratio
+        self.use_reg = regularization
+        self.audiomae = Vanilla_AudioMAE()
+        self.audiomae.eval()
+        for p in self.audiomae.parameters():
+            p.requires_grad = False
+    # Required
+    def get_unconditional_condition(self, batchsize):
+        param = next(self.audiomae.parameters())
+        assert param.requires_grad == False
+        device = param.device
+        # time_pool, freq_pool = max(self.time_pooling_factors), max(self.freq_pooling_factors)
+        time_pool, freq_pool = min(self.eval_time_pooling, 64), min(
+            self.eval_freq_pooling, 8
+        )
+        # time_pool = self.time_pooling_factors[np.random.choice(list(range(len(self.time_pooling_factors))))]
+        # freq_pool = self.freq_pooling_factors[np.random.choice(list(range(len(self.freq_pooling_factors))))]
+        token_num = int(512 / (time_pool * freq_pool))
+        return [
+            torch.zeros((batchsize, token_num, 768)).to(device).float(),
+            torch.ones((batchsize, token_num)).to(device).float(),
+        ]
+    def pool(self, representation, time_pool=None, freq_pool=None):
+        assert representation.size(-1) == 768
+        representation = representation[:, 1:, :].transpose(1, 2)
+        bs, embedding_dim, token_num = representation.size()
+        representation = representation.reshape(bs, embedding_dim, 64, 8)
+        if self.training:
+            if time_pool is None and freq_pool is None:
+                time_pool = min(
+                    64,
+                    self.time_pooling_factors[
+                        np.random.choice(list(range(len(self.time_pooling_factors))))
+                    ],
+                )
+                # freq_pool = self.freq_pooling_factors[np.random.choice(list(range(len(self.freq_pooling_factors))))]
+                freq_pool = min(8, time_pool)  # TODO here I make some modification.
+        else:
+            time_pool, freq_pool = min(self.eval_time_pooling, 64), min(
+                self.eval_freq_pooling, 8
+            )
+        self.avgpooling = nn.AvgPool2d(
+            kernel_size=(time_pool, freq_pool), stride=(time_pool, freq_pool)
+        )
+        self.maxpooling = nn.MaxPool2d(
+            kernel_size=(time_pool, freq_pool), stride=(time_pool, freq_pool)
+        )
+        pooled = (
+            self.avgpooling(representation) + self.maxpooling(representation)
+        ) / 2  # [bs, embedding_dim, time_token_num, freq_token_num]
+        pooled = pooled.flatten(2).transpose(1, 2)
+        return pooled  # [bs, token_num, embedding_dim]
+    def regularization(self, x):
+        assert x.size(-1) == 768
+        x = F.normalize(x, p=2, dim=-1)
+        return x
+    # Required
+    def forward(self, batch, time_pool=None, freq_pool=None):
+        assert batch.size(-2) == 1024 and batch.size(-1) == 128
+        if self.device is None:
+            self.device = batch.device
+        batch = batch.unsqueeze(1)
+        with torch.no_grad():
+            representation = self.audiomae(
+                batch,
+                mask_ratio=self.mask_ratio,
+                no_mask=self.no_audiomae_mask,
+                no_average=self.no_audiomae_average,
+            )
+            representation = self.pool(representation, time_pool, freq_pool)
+            if self.use_reg:
+                representation = self.regularization(representation)
+            return [
+                representation,
+                torch.ones((representation.size(0), representation.size(1)))
+                .to(representation.device)
+                .float(),
+            ]
+class ConditionalToken(nn.Module):
+    def __init__(self, embedding_dim):
+        super(ConditionalToken, self).__init__()
+        self.embedding_dim = embedding_dim
+        # Define the conditional tokens as fixed values
+        self.pooling_factor_tokens = {
+            1: torch.Tensor([1.0, 0.0] * (embedding_dim // 2)),
+            2: torch.Tensor([0.0, 1.0] * (embedding_dim // 2)),
+            4: torch.Tensor([1.0, 1.0] * (embedding_dim // 2)),
+            8: torch.Tensor([-1.0, 0.0] * (embedding_dim // 2)),
+            16: torch.Tensor([0.0, -1.0] * (embedding_dim // 2)),
+            32: torch.Tensor([-1.0, -1.0] * (embedding_dim // 2)),
+            64: torch.Tensor([0.0, 0.0] * (embedding_dim // 2)),
+        }
+        for p in self.parameters():
+            p.requires_grad = False
+    def forward(self, condition, batchsize):
+        """
+        Returns the conditional token for the given condition.
+        """
+        if condition not in self.pooling_factor_tokens.keys():
+            raise ValueError(f"Unsupported condition: {condition}")
+        batched_token = self.pooling_factor_tokens[condition][None, None].expand(
+            batchsize, 1, self.embedding_dim
+        )
+        return batched_token
+class AudioMAEConditionCTPoolRandV2(nn.Module):
+    """
+    audiomae = AudioMAEConditionCTPool2x2()
+    data = torch.randn((4, 1024, 128))
+    output = audiomae(data)
+    import ipdb;ipdb.set_trace()
+    exit(0)
+    """
+    def __init__(
+        self,
+        time_pooling_factors=[1, 2, 4, 8],
+        freq_pooling_factors=[1, 2, 4, 8],
+        eval_time_pooling=None,
+        eval_freq_pooling=None,
+        mask_ratio=0.0,
+        regularization=False,
+        no_audiomae_mask=True,
+        no_audiomae_average=False,
+    ):
+        super().__init__()
+        self.device = None
+        self.time_pooling_factors = time_pooling_factors
+        self.freq_pooling_factors = freq_pooling_factors
+        self.no_audiomae_mask = no_audiomae_mask
+        self.no_audiomae_average = no_audiomae_average
+        self.eval_freq_pooling = eval_freq_pooling
+        self.eval_time_pooling = eval_time_pooling
+        self.mask_ratio = mask_ratio
+        self.use_reg = regularization
+        self.pooling_tokens = ConditionalToken(768)
+        self.audiomae = Vanilla_AudioMAE()
+        self.audiomae.eval()
+        for p in self.audiomae.parameters():
+            p.requires_grad = False
+    # Required
+    def get_unconditional_condition(self, batchsize):
+        param = next(self.audiomae.parameters())
+        assert param.requires_grad == False
+        device = param.device
+        # time_pool, freq_pool = max(self.time_pooling_factors), max(self.freq_pooling_factors)
+        time_pool, freq_pool = min(self.eval_time_pooling, 64), min(
+            self.eval_freq_pooling, 8
+        )
+        # time_pool = self.time_pooling_factors[np.random.choice(list(range(len(self.time_pooling_factors))))]
+        # freq_pool = self.freq_pooling_factors[np.random.choice(list(range(len(self.freq_pooling_factors))))]
+        pool_condition_token = self.pooling_tokens(time_pool, batchsize).to(device)
+        token_num = int(512 / (time_pool * freq_pool))
+        rep = torch.zeros((batchsize, token_num, 768)).to(device).float()
+        rep = torch.cat([rep, pool_condition_token], dim=1)
+        return [rep, torch.ones((batchsize, token_num + 1)).to(device).float()]
+    def pool(self, representation, time_pool=None, freq_pool=None):
+        assert representation.size(-1) == 768
+        representation = representation[:, 1:, :].transpose(1, 2)
+        bs, embedding_dim, token_num = representation.size()
+        representation = representation.reshape(bs, embedding_dim, 64, 8)
+        if self.training:
+            if time_pool is None and freq_pool is None:
+                time_pool = min(
+                    64,
+                    self.time_pooling_factors[
+                        np.random.choice(list(range(len(self.time_pooling_factors))))
+                    ],
+                )
+                # freq_pool = self.freq_pooling_factors[np.random.choice(list(range(len(self.freq_pooling_factors))))]
+                freq_pool = min(8, time_pool)  # TODO here I make some modification.
+        else:
+            time_pool, freq_pool = min(self.eval_time_pooling, 64), min(
+                self.eval_freq_pooling, 8
+            )
+        self.avgpooling = nn.AvgPool2d(
+            kernel_size=(time_pool, freq_pool), stride=(time_pool, freq_pool)
+        )
+        self.maxpooling = nn.MaxPool2d(
+            kernel_size=(time_pool, freq_pool), stride=(time_pool, freq_pool)
+        )
+        pooled = (
+            self.avgpooling(representation) + self.maxpooling(representation)
+        ) / 2  # [bs, embedding_dim, time_token_num, freq_token_num]
+        pooled = pooled.flatten(2).transpose(1, 2)
+        return pooled, time_pool, freq_pool  # [bs, token_num, embedding_dim]
+    def regularization(self, x):
+        assert x.size(-1) == 768
+        x = F.normalize(x, p=2, dim=-1)
+        return x
+    # Required
+    def forward(self, batch):
+        assert batch.size(-2) == 1024 and batch.size(-1) == 128
+        if self.device is None:
+            self.device = batch.device
+        batch = batch.unsqueeze(1)
+        with torch.no_grad():
+            representation = self.audiomae(
+                batch,
+                mask_ratio=self.mask_ratio,
+                no_mask=self.no_audiomae_mask,
+                no_average=self.no_audiomae_average,
+            )
+            representation, time_pool, freq_pool = self.pool(representation)
+            if self.use_reg:
+                representation = self.regularization(representation)
+            pool_condition_token = self.pooling_tokens(
+                time_pool, representation.size(0)
+            ).to(representation.device)
+            representation = torch.cat([representation, pool_condition_token], dim=1)
+            return [
+                representation,
+                torch.ones((representation.size(0), representation.size(1)))
+                .to(representation.device)
+                .float(),
+            ]
+class BeatDownbeatConditionConcat(nn.Module):
+    def __init__(self, latent_t_size, latent_f_size):
+        super().__init__()
+        self.latent_t_size = latent_t_size
+        self.latent_f_size = latent_f_size
+        self.device = None
+    # Required
+    def get_unconditional_condition(self, batchsize):
+        return torch.zeros((batchsize, self.latent_t_size, self.latent_f_size)).to(
+            self.device
+        )
+    # Required
+    def forward(self, batch):
+        if self.device is None:
+            self.device = batch.device
+        return batch
+class CLAPAudioEmbeddingClassifierFreev2(nn.Module):
+    def __init__(
+        self,
+        pretrained_path,
+        sampling_rate=16000,
+        embed_mode="audio",
+        amodel="HTSAT-base",
+        unconditional_prob=0.1,
+        random_mute=False,
+        max_random_mute_portion=0.5,
+        training_mode=True,
+    ):
+        super().__init__()
+        self.device = "cpu"
+        self.precision = "fp32"
+        self.amodel = amodel  # or 'PANN-14'
+        self.tmodel = "roberta"  # the best text encoder in our training
+        self.enable_fusion = False  # False if you do not want to use the fusion model
+        self.fusion_type = "aff_2d"
+        self.pretrained = pretrained_path
+        self.embed_mode = embed_mode
+        self.embed_mode_orig = embed_mode
+        self.sampling_rate = sampling_rate
+        self.unconditional_prob = unconditional_prob
+        self.random_mute = random_mute
+        self.tokenize = RobertaTokenizer.from_pretrained(config_data["roberta-base"])
+        self.max_random_mute_portion = max_random_mute_portion
+        self.training_mode = training_mode
+        self.model, self.model_cfg = create_model(
+            self.amodel,
+            self.tmodel,
+            self.pretrained,
+            precision=self.precision,
+            device=self.device,
+            enable_fusion=self.enable_fusion,
+            fusion_type=self.fusion_type,
+        )
+        audio_cfg = self.model_cfg["audio_cfg"]
+        self.mel_transform = torchaudio.transforms.MelSpectrogram(
+            sample_rate=audio_cfg["sample_rate"],
+            n_fft=audio_cfg["window_size"],
+            win_length=audio_cfg["window_size"],
+            hop_length=audio_cfg["hop_size"],
+            center=True,
+            pad_mode="reflect",
+            power=2.0,
+            norm=None,
+            onesided=True,
+            n_mels=64,
+            f_min=audio_cfg["fmin"],
+            f_max=audio_cfg["fmax"],
+        )
+        for p in self.model.parameters():
+            p.requires_grad = False
+        self.unconditional_token = None
+        self.model.eval()
+    def get_unconditional_condition(self, batchsize):
+        self.unconditional_token = self.model.get_text_embedding(
+            self.tokenizer(["", ""])
+        )[0:1]
+        return torch.cat([self.unconditional_token.unsqueeze(0)] * batchsize, dim=0)
+    def batch_to_list(self, batch):
+        ret = []
+        for i in range(batch.size(0)):
+            ret.append(batch[i])
+        return ret
+    def make_decision(self, probability):
+        if float(torch.rand(1)) < probability:
+            return True
+        else:
+            return False
+    def random_uniform(self, start, end):
+        val = torch.rand(1).item()
+        return start + (end - start) * val
+    def _random_mute(self, waveform):
+        # waveform: [bs, t-steps]
+        t_steps = waveform.size(-1)
+        for i in range(waveform.size(0)):
+            mute_size = int(
+                self.random_uniform(0, end=int(t_steps * self.max_random_mute_portion))
+            )
+            mute_start = int(self.random_uniform(0, t_steps - mute_size))
+            waveform[i, mute_start : mute_start + mute_size] = 0
+        return waveform
+    def cos_similarity(self, waveform, text):
+        # waveform: [bs, t_steps]
+        original_embed_mode = self.embed_mode
+        with torch.no_grad():
+            self.embed_mode = "audio"
+            audio_emb = self(waveform.cuda())
+            self.embed_mode = "text"
+            text_emb = self(text)
+            similarity = F.cosine_similarity(audio_emb, text_emb, dim=2)
+        self.embed_mode = original_embed_mode
+        return similarity.squeeze()
+    def build_unconditional_emb(self):
+        self.unconditional_token = self.model.get_text_embedding(
+            self.tokenizer(["", ""])
+        )[0:1]
+    def forward(self, batch):
+        # If you want this conditioner to be unconditional, set self.unconditional_prob = 1.0
+        # If you want this conditioner to be fully conditional, set self.unconditional_prob = 0.0
+        if self.model.training == True and not self.training_mode:
+            print(
+                "The pretrained CLAP model should always be in eval mode. Reloading model just in case you change the parameters."
+            )
+            self.model, self.model_cfg = create_model(
+                self.amodel,
+                self.tmodel,
+                self.pretrained,
+                precision=self.precision,
+                device="cuda",
+                enable_fusion=self.enable_fusion,
+                fusion_type=self.fusion_type,
+            )
+            for p in self.model.parameters():
+                p.requires_grad = False
+            self.model.eval()
+        if self.unconditional_token is None:
+            self.build_unconditional_emb()
+        # if(self.training_mode):
+        #     assert self.model.training == True
+        # else:
+        #     assert self.model.training == False
+        # the 'fusion' truncate mode can be changed to 'rand_trunc' if run in unfusion mode
+        if self.embed_mode == "audio":
+            if not self.training:
+                print("INFO: clap model calculate the audio embedding as condition")
+            with torch.no_grad():
+                # assert (
+                #     self.sampling_rate == 16000
+                # ), "We only support 16000 sampling rate"
+                # if self.random_mute:
+                #     batch = self._random_mute(batch)
+                # batch: [bs, 1, t-samples]
+                if self.sampling_rate != 48000:
+                    batch = torchaudio.functional.resample(
+                        batch, orig_freq=self.sampling_rate, new_freq=48000
+                    )
+                audio_data = batch.squeeze(1)
+                mel = self.mel_transform(audio_data)
+                audio_dict = get_audio_features(
+                    audio_data,
+                    mel,
+                    480000,
+                    data_truncating="fusion",
+                    data_filling="repeatpad",
+                    audio_cfg=self.model_cfg["audio_cfg"],
+                )
+                # [bs, 512]
+                embed = self.model.get_audio_embedding(audio_dict)
+        elif self.embed_mode == "text":
+            with torch.no_grad():
+                # the 'fusion' truncate mode can be changed to 'rand_trunc' if run in unfusion mode
+                text_data = self.tokenizer(batch)
+                if isinstance(batch, str) or (
+                    isinstance(batch, list) and len(batch) == 1
+                ):
+                    for key in text_data.keys():
+                        text_data[key] = text_data[key].unsqueeze(0)
+                embed = self.model.get_text_embedding(text_data)
+        embed = embed.unsqueeze(1)
+        for i in range(embed.size(0)):
+            if self.make_decision(self.unconditional_prob):
+                embed[i] = self.unconditional_token
+        # embed = torch.randn((batch.size(0), 1, 512)).type_as(batch)
+        return embed.detach()
+    def tokenizer(self, text):
+        result = self.tokenize(
+            text,
+            padding="max_length",
+            truncation=True,
+            max_length=512,
+            return_tensors="pt",
+        )
+        return {k: v.squeeze(0) for k, v in result.items()}
+# Manual smoke test: builds the text-mode conditioner from a hard-coded local
+# checkpoint path, embeds two dummy captions, then drops into the ipdb debugger.
+if __name__ == "__main__":
+    model = CLAPAudioEmbeddingClassifierFreev2(
+        pretrained_path="/mnt/bn/lqhaoheliu/exps/checkpoints/audioldm/ckpt/CLAP.pt",
+        embed_mode="text",
+        amodel="HTSAT-tiny",
+    )
+    # data = torch.randn((6, 1, int(16000*10.24)))
+    data = ["text", "text"]
+    res = model(data)
+    import ipdb
+    ipdb.set_trace()

qa_mdt/audioldm_train/config/mos_as_token/qa_mdt.yaml ADDED Viewed

	@@ -0,0 +1,169 @@

+log_directory: "./log/latent_diffusion"
+project: "audioldm"
+precision: "high"
+# TODO: change this with your project path
+base_root: "./qa_mdt"
+# TODO: change this with your pretrained path
+# TODO: pretrained path is also needed in "base_root/offset_pretrained_checkpoints.json"
+pretrained:
+ clap_music: "./qa_mdt/checkpoints/clap_music"
+ flan_t5: "./qa_mdt/checkpoints/flant5"
+ hifi-gan: "./qa_mdt/checkpoints/hifi-gan/checkpoints"
+ roberta-base: "./qa_mdt/checkpoints/robertabase"
+# TODO: set this to the LMDB dataset that stores the pMOS of the training dataset.
+# It is only needed for training; it is not needed for inference.
+mos_path: ""
+train_path:
+  train_lmdb_path: [] # path list of training lmdb folders
+val_path:
+  val_lmdb_path: [] # path list of validation lmdb folders
+  val_key_path: [] # path list of validation lmdb key files
+variables:
+  sampling_rate: &sampling_rate 16000
+  mel_bins: &mel_bins 64
+  latent_embed_dim: &latent_embed_dim 8
+  latent_t_size: &latent_t_size 256 # TODO might need to change
+  latent_f_size: &latent_f_size 16 # TODO might need to change
+  in_channels: &unet_in_channels 8 # TODO might need to change
+  optimize_ddpm_parameter: &optimize_ddpm_parameter true
+  optimize_gpt: &optimize_gpt true
+  warmup_steps: &warmup_steps 2000
+# we rewrite the dataset so it may not be needed
+data:
+  train: ["audiocaps"]
+  val: "audiocaps"
+  test: "audiocaps"
+  class_label_indices: "audioset_eval_subset"
+  dataloader_add_ons: ["waveform_rs_48k"]
+step:
+  validation_every_n_epochs: 10000
+  save_checkpoint_every_n_steps: 1000
+  # limit_val_batches: 2
+  max_steps: 8000000
+  save_top_k: 1000
+preprocessing:
+  audio:
+    sampling_rate: *sampling_rate
+    max_wav_value: 32768.0
+    duration: 10.24
+  stft:
+    filter_length: 1024
+    hop_length: 160
+    win_length: 1024
+  mel:
+    n_mel_channels: *mel_bins
+    mel_fmin: 0
+    mel_fmax: 8000
+augmentation:
+  mixup: 0.0
+model:
+  target: qa_mdt.audioldm_train.modules.latent_diffusion.ddpm.LatentDiffusion
+  params:
+    # Autoencoder
+    first_stage_config:
+      base_learning_rate: 8.0e-06
+      target: qa_mdt.audioldm_train.modules.latent_encoder.autoencoder.AutoencoderKL
+      params:
+        # TODO: change it with your VAE checkpoint
+        reload_from_ckpt: "./qa_mdt/checkpoints/hifi-gan/checkpoints/vae_mel_16k_64bins.ckpt"
+        sampling_rate: *sampling_rate
+        batchsize: 1
+        monitor: val/rec_loss
+        image_key: fbank
+        subband: 1
+        embed_dim: *latent_embed_dim
+        time_shuffle: 1
+        lossconfig:
+          target: qa_mdt.audioldm_train.losses.LPIPSWithDiscriminator
+          params:
+            disc_start: 50001
+            kl_weight: 1000.0
+            disc_weight: 0.5
+            disc_in_channels: 1
+        ddconfig:
+          double_z: true
+          mel_bins: *mel_bins
+          z_channels: 8
+          resolution: 256
+          downsample_time: false
+          in_channels: 1
+          out_ch: 1
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+    # Other parameters
+    base_learning_rate: 8.0e-5
+    warmup_steps: *warmup_steps
+    optimize_ddpm_parameter: *optimize_ddpm_parameter
+    sampling_rate: *sampling_rate
+    batchsize: 16
+    linear_start: 0.0015
+    linear_end: 0.0195
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    unconditional_prob_cfg: 0.1
+    parameterization: eps # [eps, x0, v]
+    first_stage_key: fbank
+    latent_t_size: *latent_t_size
+    latent_f_size: *latent_f_size
+    channels: *latent_embed_dim
+    monitor: val/loss_simple_ema
+    scale_by_std: true
+    unet_config:
+      # TODO: choose your class, Default: MDT_MOS_AS_TOKEN
+      # (Note: 2D-RoPE, SwiGLU and MDT are implemented in two separate classes; to train with all of them, the classes need to be modified and merged.)
+      target: qa_mdt.audioldm_train.modules.diffusionmodules.PixArt.PixArt_MDT_MOS_AS_TOKEN
+      params:
+        input_size : [256, 16]
+      # patch_size: [16,4]
+        patch_size : [4, 1]
+        overlap_size: [0, 0]
+        in_channels : 8
+        hidden_size : 1152
+        depth : 28
+        num_heads : 16
+        mlp_ratio : 4.0
+        class_dropout_prob : 0.1
+        pred_sigma : True
+        drop_path : 0.
+        window_size : 0
+        window_block_indexes : None
+        use_rel_pos : False
+        cond_dim : 1024
+        lewei_scale : 1.0
+        overlap: [0, 0]
+        use_cfg: true
+        mask_ratio: 0.30
+        decode_layer: 8
+    cond_stage_config:
+      crossattn_flan_t5:
+        cond_stage_key: text
+        conditioning_key: crossattn
+        target: qa_mdt.audioldm_train.conditional_models.FlanT5HiddenState
+    evaluation_params:
+      unconditional_guidance_scale: 3.5
+      ddim_sampling_steps: 200
+      n_candidates_per_samples: 3

qa_mdt/audioldm_train/dataset_plugin.py ADDED Viewed

	@@ -0,0 +1,508 @@

+import os
+import torch
+import numpy as np
+import torchaudio
+import matplotlib.pyplot as plt
+# Module-level constants shared by the VITS phoneme add-ons below: the padded
+# sequence length and the character groups that make up the symbol vocabulary.
+CACHE = {
+    "get_vits_phoneme_ids": {
+        "PAD_LENGTH": 310,
+        "_pad": "_",
+        "_punctuation": ';:,.!?¡¿—…"«»“” ',
+        "_letters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
+        "_letters_ipa": "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ",
+        "_special": "♪☎☒☝⚠",
+    }
+}
+CACHE["get_vits_phoneme_ids"]["symbols"] = (
+    [CACHE["get_vits_phoneme_ids"]["_pad"]]
+    + list(CACHE["get_vits_phoneme_ids"]["_punctuation"])
+    + list(CACHE["get_vits_phoneme_ids"]["_letters"])
+    + list(CACHE["get_vits_phoneme_ids"]["_letters_ipa"])
+    + list(CACHE["get_vits_phoneme_ids"]["_special"])
+)
+CACHE["get_vits_phoneme_ids"]["_symbol_to_id"] = {
+    s: i for i, s in enumerate(CACHE["get_vits_phoneme_ids"]["symbols"])
+}
+def get_vits_phoneme_ids(config, dl_output, metadata):
+    """Encode ``metadata["phonemes"]`` as a fixed-length tensor of VITS symbol ids.
+
+    Every symbol is looked up in ``CACHE["get_vits_phoneme_ids"]["_symbol_to_id"]``
+    and a blank id (0) is placed before each symbol and after the last one. The
+    result is then cut or right-padded with 0 so it always holds exactly
+    ``PAD_LENGTH`` entries.
+
+    Args:
+        config: unused by this add-on.
+        dl_output: unused by this add-on.
+        metadata: mapping that must contain the key ``"phonemes"`` (an iterable
+            of symbols).
+
+    Returns:
+        ``{"phoneme_idx": LongTensor}`` with ``PAD_LENGTH`` elements.
+
+    Raises:
+        AssertionError: if ``"phonemes"`` is missing from ``metadata``.
+        KeyError: if a symbol is not part of the vocabulary.
+    """
+    pad_token_id = 0
+    vits_cache = CACHE["get_vits_phoneme_ids"]
+    pad_length = vits_cache["PAD_LENGTH"]
+    _symbol_to_id = vits_cache["_symbol_to_id"]
+    assert (
+        "phonemes" in metadata.keys()
+    ), "You must provide vits phonemes on using addon get_vits_phoneme_ids"
+    sequence = [_symbol_to_id[symbol] for symbol in metadata["phonemes"]]
+    # Interleave blanks: 0, s1, 0, s2, ..., 0, sN, 0  (2 * N + 1 entries).
+    inserted_zero_sequence = [0] * (len(sequence) * 2 + 1)
+    inserted_zero_sequence[1::2] = sequence
+    # Truncate before padding. Without this an over-long input produced a tensor
+    # longer than PAD_LENGTH (the pad count went negative), which disagrees with
+    # get_vits_phoneme_ids_no_padding and yields samples of differing length.
+    inserted_zero_sequence = inserted_zero_sequence[:pad_length]
+    inserted_zero_sequence += [pad_token_id] * (
+        pad_length - len(inserted_zero_sequence)
+    )
+    return {"phoneme_idx": torch.LongTensor(inserted_zero_sequence)}
+def get_vits_phoneme_ids_no_padding(config, dl_output, metadata):
+    """Encode ``metadata["phonemes"]`` (plus a trailing "⚠") as VITS symbol ids.
+
+    Unlike ``get_vits_phoneme_ids`` no blank ids are interleaved. Symbols missing
+    from the vocabulary are reported on stdout and replaced by "_". The id list
+    is cut to ``PAD_LENGTH`` and right-padded with 0 up to that length.
+
+    ``config`` and ``dl_output`` are not used.
+    """
+    vits_cache = CACHE["get_vits_phoneme_ids"]
+    max_len = vits_cache["PAD_LENGTH"]
+    symbol_table = vits_cache["_symbol_to_id"]
+    assert (
+        "phonemes" in metadata.keys()
+    ), "You must provide vits phonemes on using addon get_vits_phoneme_ids"
+    text = metadata["phonemes"] + "⚠"
+    ids = []
+    for char in text:
+        if char not in symbol_table:
+            print("%s is not in the vocabulary. %s" % (char, text))
+            char = "_"
+        ids.append(symbol_table[char])
+    # Cut first, then fill the remainder with the pad id (0).
+    ids = ids[:max_len]
+    ids.extend([0] * (max_len - len(ids)))
+    return {"phoneme_idx": torch.LongTensor(ids)}
+def calculate_relative_bandwidth(config, dl_output, metadata):
+    """Locate the 5% and 95% cumulative-energy points of ``dl_output["stft"]``.
+
+    The feature is summed over dim 0 and accumulated along the remaining axis;
+    the bins whose cumulative energy is closest to 5% and 95% of the total are
+    expressed as integers on a 0-1000 scale relative to the size of the last
+    dimension.
+
+    ``config`` and ``metadata`` are not used.
+
+    Returns:
+        ``{"freq_energy_percentile": LongTensor([lower, higher])}``
+    """
+    assert "stft" in dl_output.keys()
+    stft = dl_output["stft"]
+    # The last dimension of the stft feature is the frequency dimension
+    n_freq = stft.size(-1)
+    cumulative_energy = torch.cumsum(torch.sum(stft, dim=0), dim=0)
+    total_energy = cumulative_energy[-1]
+
+    def _closest_bin(fraction):
+        # index whose cumulative energy is nearest to `fraction` of the total
+        return torch.argmin(torch.abs(total_energy * fraction - cumulative_energy))
+
+    bounds = [int((_closest_bin(frac) / n_freq) * 1000) for frac in (0.05, 0.95)]
+    return {"freq_energy_percentile": torch.LongTensor(bounds)}
+def calculate_mel_spec_relative_bandwidth_as_extra_channel(config, dl_output, metadata):
+    """Build a latent-sized mask marking the 5%-95% energy band of the mel spectrogram.
+
+    ``dl_output["log_mel_spec"]`` is clipped at 10, exponentiated, summed over
+    dim 0 and accumulated along the remaining axis. The bins closest to 5% and
+    95% of the total energy are rescaled to ``latent_f_size`` and the columns
+    between them are set to 1.0 in a ``(latent_t_size, latent_f_size)`` tensor.
+
+    Args:
+        config: must provide ``config["model"]["params"]["latent_t_size"]`` and
+            ``["latent_f_size"]``.
+        dl_output: must contain ``"log_mel_spec"``.
+        metadata: unused.
+
+    Returns:
+        dict with ``"mel_spec_bandwidth_cond_extra_channel"`` (the mask) and
+        ``"freq_energy_percentile"`` (LongTensor ``[lower, higher]`` in latent
+        frequency units).
+
+    Raises:
+        AssertionError: if ``"log_mel_spec"`` is missing from ``dl_output``.
+    """
+    # The original asserted on "stft", which this function never reads; check
+    # the key that is actually consumed.
+    assert "log_mel_spec" in dl_output.keys()
+    linear_mel_spec = torch.exp(torch.clip(dl_output["log_mel_spec"], max=10))
+    # The last dimension of the mel feature is the frequency dimension
+    freq_dimensions = linear_mel_spec.size(-1)
+    freq_energy_dist = torch.sum(linear_mel_spec, dim=0)
+    freq_energy_dist = torch.cumsum(freq_energy_dist, dim=0)
+    total_energy = freq_energy_dist[-1]
+    percentile_5th = total_energy * 0.05
+    percentile_95th = total_energy * 0.95
+    lower_idx = torch.argmin(torch.abs(percentile_5th - freq_energy_dist))
+    higher_idx = torch.argmin(torch.abs(percentile_95th - freq_energy_dist))
+    model_params = config["model"]["params"]
+    latent_t_size = model_params["latent_t_size"]
+    latent_f_size = model_params["latent_f_size"]
+    # Rescale the mel-bin indices to the latent frequency resolution.
+    lower_idx = int(latent_f_size * float((lower_idx / freq_dimensions)))
+    higher_idx = int(latent_f_size * float((higher_idx / freq_dimensions)))
+    bandwidth_condition = torch.zeros((latent_t_size, latent_f_size))
+    bandwidth_condition[:, lower_idx:higher_idx] += 1.0
+    return {
+        "mel_spec_bandwidth_cond_extra_channel": bandwidth_condition,
+        "freq_energy_percentile": torch.LongTensor([lower_idx, higher_idx]),
+    }
+def waveform_rs_48k(config, dl_output, metadata):
+    """Return ``dl_output["waveform"]`` resampled to 48 kHz.
+
+    When ``dl_output["sampling_rate"]`` already equals 48000 the waveform is
+    passed through unchanged; otherwise ``torchaudio.functional.resample`` is
+    applied. ``config`` and ``metadata`` are not used.
+    """
+    target_rate = 48000
+    waveform = dl_output["waveform"]  # [1, samples]
+    source_rate = dl_output["sampling_rate"]
+    if source_rate == target_rate:
+        return {"waveform_48k": waveform}
+    resampled = torchaudio.functional.resample(
+        waveform, orig_freq=source_rate, new_freq=target_rate
+    )
+    return {"waveform_48k": resampled}
+def extract_vits_phoneme_and_flant5_text(config, dl_output, metadata):
+    """Produce VITS phoneme ids for both speech and non-speech samples.
+
+    If ``metadata`` carries ``"phonemes"`` the ids are computed from it and an
+    empty ``"text"`` entry is added to the result; otherwise an empty phoneme
+    string is encoded and no ``"text"`` entry is added. Metadata containing the
+    key ``"phoneme"`` is rejected.
+    """
+    assert (
+        "phoneme" not in metadata.keys()
+    ), "The metadata of speech you use seems belong to fastspeech. Please check dataset_root.json"
+    if "phonemes" not in metadata.keys():
+        # Add empty phoneme sequence
+        return get_vits_phoneme_ids_no_padding(config, dl_output, {"phonemes": ""})
+    item = get_vits_phoneme_ids_no_padding(config, dl_output, metadata)
+    # We assume TTS data does not have text description
+    item["text"] = ""
+    return item
+def extract_fs2_phoneme_and_flant5_text(config, dl_output, metadata):
+    """Produce FastSpeech2-style phoneme ids for both speech and non-speech samples.
+
+    With a ``"phoneme"`` entry in ``metadata`` the ids are computed from it and
+    an empty ``"text"`` entry is added; without one, an empty phoneme list is
+    encoded and no ``"text"`` entry is added.
+    """
+    if "phoneme" not in metadata.keys():
+        return extract_fs2_phoneme_g2p_en_feature(config, dl_output, {"phoneme": []})
+    item = extract_fs2_phoneme_g2p_en_feature(config, dl_output, metadata)
+    item["text"] = ""
+    return item
+def extract_fs2_phoneme_g2p_en_feature(config, dl_output, metadata):
+    """Map ``metadata["phoneme"]`` to a fixed-length (135) tensor of phoneme ids.
+
+    Entries that are not in the vocabulary below are dropped silently. The id
+    list is cut to 135 items and right-padded with the pad id, which equals the
+    vocabulary size (71). A warning is printed when the kept phonemes exceed
+    five times the pad length. ``config`` and ``dl_output`` are not used.
+    """
+    PAD_LENGTH = 135
+    # Vocabulary in id order: a phoneme's position in this tuple is its id.
+    vocabulary = (
+        "K", "IH2", "NG", "OW2", "AH2", "F", "AE0", "IY0", "SH", "G",
+        "W", "UW1", "AO2", "AW2", "UW0", "EY2", "UW2", "AE2", "IH0", "P",
+        "D", "ER1", "AA1", "EH0", "UH1", "N", "V", "AY1", "EY1", "UH2",
+        "EH1", "L", "AA2", "R", "OY1", "Y", "ER2", "S", "AE1", "AH1",
+        "JH", "ER0", "EH2", "IY2", "OY2", "AW1", "IH1", "IY1", "OW0", "AO0",
+        "AY0", "EY0", "AY2", "UH0", "M", "TH", "T", "OY0", "AW0", "DH",
+        "Z", "spn", "AH0", "sp", "AO1", "OW1", "ZH", "B", "AA0", "CH",
+        "HH",
+    )
+    phonemes_lookup_dict = {name: idx for idx, name in enumerate(vocabulary)}
+    pad_token_id = len(phonemes_lookup_dict)
+    assert (
+        "phoneme" in metadata.keys()
+    ), "The dataloader add-on extract_phoneme_g2p_en_feature will output phoneme id, which is not specified in your dataset"
+    ids = []
+    for name in metadata["phoneme"]:
+        if name in phonemes_lookup_dict:
+            ids.append(phonemes_lookup_dict[name])
+    if (len(ids) / PAD_LENGTH) > 5:
+        print(
+            "Warning: Phonemes length is too long and is truncated too much! %s"
+            % metadata
+        )
+    ids = ids[:PAD_LENGTH]
+    padded = ids + [pad_token_id] * (PAD_LENGTH - len(ids))
+    return {"phoneme_idx": torch.LongTensor(padded)}
+def extract_phoneme_g2p_en_feature(config, dl_output, metadata):
+    """Map ``metadata["phoneme"]`` to a fixed-length (250) tensor of phoneme ids.
+
+    Uses a stress-free phoneme vocabulary whose first entry is the space
+    character. Unknown entries are dropped silently; the id list is cut to 250
+    items and right-padded with the pad id, which equals the vocabulary size
+    (40). A warning is printed when the kept phonemes exceed five times the pad
+    length. ``config`` and ``dl_output`` are not used.
+    """
+    PAD_LENGTH = 250
+    # Vocabulary in id order: a phoneme's position in this tuple is its id.
+    vocabulary = (
+        " ", "AA", "AE", "AH", "AO", "AW", "AY", "B", "CH", "D",
+        "DH", "EH", "ER", "EY", "F", "G", "HH", "IH", "IY", "JH",
+        "K", "L", "M", "N", "NG", "OW", "OY", "P", "R", "S",
+        "SH", "T", "TH", "UH", "UW", "V", "W", "Y", "Z", "ZH",
+    )
+    phonemes_lookup_dict = {name: idx for idx, name in enumerate(vocabulary)}
+    pad_token_id = len(phonemes_lookup_dict)
+    assert (
+        "phoneme" in metadata.keys()
+    ), "The dataloader add-on extract_phoneme_g2p_en_feature will output phoneme id, which is not specified in your dataset"
+    ids = []
+    for name in metadata["phoneme"]:
+        if name in phonemes_lookup_dict:
+            ids.append(phonemes_lookup_dict[name])
+    if (len(ids) / PAD_LENGTH) > 5:
+        print(
+            "Warning: Phonemes length is too long and is truncated too much! %s"
+            % metadata
+        )
+    ids = ids[:PAD_LENGTH]
+    padded = ids + [pad_token_id] * (PAD_LENGTH - len(ids))
+    return {"phoneme_idx": torch.LongTensor(padded)}
+def _extract_kaldi_fbank_at_rate(dl_output, target_rate):
+    """Compute a normalized 128-bin Kaldi fbank of the waveform at ``target_rate`` Hz.
+
+    The waveform is resampled to ``target_rate`` if needed, mean-centred and
+    passed to ``torchaudio.compliance.kaldi.fbank``. The frame axis is then
+    zero-padded or cut so it has as many rows as ``dl_output["log_mel_spec"]``,
+    and the result is normalized with fixed mean/std constants.
+    """
+    norm_mean = -4.2677393
+    norm_std = 4.5689974
+    waveform = dl_output["waveform"]  # [1, samples]
+    sampling_rate = dl_output["sampling_rate"]
+    # The fbank is aligned to the frame count of the existing log-mel feature.
+    target_len = dl_output["log_mel_spec"].size(0)
+    if sampling_rate != target_rate:
+        waveform = torchaudio.functional.resample(
+            waveform, orig_freq=sampling_rate, new_freq=target_rate
+        )
+    waveform = waveform - waveform.mean()
+    fbank = torchaudio.compliance.kaldi.fbank(
+        waveform,
+        htk_compat=True,
+        sample_frequency=target_rate,
+        use_energy=False,
+        window_type="hanning",
+        num_mel_bins=128,
+        dither=0.0,
+        frame_shift=10,
+    )
+    # cut and pad
+    missing_frames = target_len - fbank.shape[0]
+    if missing_frames > 0:
+        fbank = torch.nn.ZeroPad2d((0, 0, 0, missing_frames))(fbank)
+    elif missing_frames < 0:
+        fbank = fbank[:target_len, :]
+    return (fbank - norm_mean) / (norm_std * 2)
+def extract_kaldi_fbank_feature(config, dl_output, metadata):
+    """Return ``{"ta_kaldi_fbank": fbank}`` computed from the waveform at 16 kHz.
+
+    Reads ``"waveform"``, ``"sampling_rate"`` and ``"log_mel_spec"`` from
+    ``dl_output``; ``config`` and ``metadata`` are not used. The result has one
+    row per ``log_mel_spec`` frame and 128 mel bins.
+    """
+    return {"ta_kaldi_fbank": _extract_kaldi_fbank_at_rate(dl_output, 16000)}
+def extract_kaldi_fbank_feature_32k(config, dl_output, metadata):
+    """Return ``{"ta_kaldi_fbank": fbank}`` computed from the waveform at 32 kHz.
+
+    Identical to ``extract_kaldi_fbank_feature`` except for the target sampling
+    rate. ``config`` and ``metadata`` are not used.
+    """
+    return {"ta_kaldi_fbank": _extract_kaldi_fbank_at_rate(dl_output, 32000)}
+# Use the beat and downbeat information as music conditions
+def extract_drum_beat(config, dl_output, metadata):
+    """Rasterize beat/downbeat sample positions onto a latent-sized conditioning map.
+
+    ``metadata["beat"]`` and ``metadata["downbeat"]`` hold sample positions at
+    ``metadata["sample_rate"]``. Positions are shifted by the segment start
+    (``dl_output["random_start_sample_in_original_audio_file"]``), kept only if
+    they fall inside the segment, and mapped to a row of a
+    ``(latent_t_size, latent_f_size)`` tensor. Each beat subtracts 0.5 from its
+    row and each downbeat adds 1.0.
+
+    Returns:
+        ``{"cond_beat_downbeat": tensor}``
+    """
+
+    def visualization(conditional_signal, mel_spectrogram, filename):
+        # Debug helper; only referenced by the commented-out call at the bottom.
+        # Writes the segment audio and a two-panel figure to disk.
+        import soundfile as sf
+
+        sf.write(
+            os.path.basename(dl_output["fname"]),
+            np.array(dl_output["waveform"])[0],
+            dl_output["sampling_rate"],
+        )
+        plt.figure(figsize=(10, 10))
+        for panel, data, title in (
+            (211, conditional_signal, "Conditional Signal"),
+            (212, mel_spectrogram, "Mel Spectrogram"),
+        ):
+            plt.subplot(panel)
+            plt.imshow(np.array(data).T, aspect="auto")
+            plt.title(title)
+        plt.savefig(filename)
+        plt.close()
+
+    assert "sample_rate" in metadata and "beat" in metadata and "downbeat" in metadata
+    # The dataloader segment length before performing torch resampling
+    segment_len = int(metadata["sample_rate"] * dl_output["duration"])
+    segment_start = int(dl_output["random_start_sample_in_original_audio_file"])
+
+    def _inside_segment(sample_positions):
+        # positions relative to the segment start, restricted to the segment
+        return [
+            pos - segment_start
+            for pos in sample_positions
+            if 0 <= pos - segment_start <= segment_len
+        ]
+
+    beat = _inside_segment(metadata["beat"])
+    downbeat = _inside_segment(metadata["downbeat"])
+    model_params = config["model"]["params"]
+    latent_shape = (model_params["latent_t_size"], model_params["latent_f_size"])
+    conditional_signal = torch.zeros(latent_shape)
+    last_row = conditional_signal.size(0) - 1
+    # 0: none; -0.5: beat; 1.0: downbeat; 0.5: downbeat+beat
+    for positions, increment in ((beat, -0.5), (downbeat, 1.0)):
+        for pos in positions:
+            row = min(int((pos / segment_len) * latent_shape[0]), last_row)
+            conditional_signal[row, :] += increment
+    # visualization(conditional_signal, dl_output["log_mel_spec"], filename = os.path.basename(dl_output["fname"])+".png")
+    return {"cond_beat_downbeat": conditional_signal}

qa_mdt/audioldm_train/losses/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .contperceptual import LPIPSWithDiscriminator

qa_mdt/audioldm_train/losses/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (203 Bytes). View file

qa_mdt/audioldm_train/losses/__pycache__/contperceptual.cpython-310.pyc ADDED Viewed

Binary file (3.66 kB). View file

qa_mdt/audioldm_train/losses/contperceptual.py ADDED Viewed

	@@ -0,0 +1,160 @@

+import torch
+import torch.nn as nn
+import sys
+sys.path.append("/train20/intern/permanent/changli7/dataset_ptm")
+from taming.modules.losses.vqperceptual import *  # TODO: taming dependency yes/no?
+class LPIPSWithDiscriminator(nn.Module):
+    """Autoencoder loss combining reconstruction, LPIPS, KL and adversarial terms.
+
+    ``forward`` is called once per optimizer: ``optimizer_idx == 0`` returns the
+    generator (autoencoder) loss, ``optimizer_idx == 1`` the discriminator loss.
+    ``LPIPS``, ``NLayerDiscriminator``, ``weights_init``, ``hinge_d_loss``,
+    ``vanilla_d_loss`` and ``adopt_weight`` are not defined in this file; they
+    are expected to come from the star import of
+    ``taming.modules.losses.vqperceptual``.
+    """
+
+    def __init__(
+        self,
+        disc_start,
+        logvar_init=0.0,
+        kl_weight=1.0,
+        pixelloss_weight=1.0,
+        disc_num_layers=3,
+        disc_in_channels=3,
+        disc_factor=1.0,
+        disc_weight=1.0,
+        perceptual_weight=1.0,
+        use_actnorm=False,
+        disc_conditional=False,
+        disc_loss="hinge",
+    ):
+        """Store the loss weights and build the LPIPS network and discriminator.
+
+        ``disc_start`` is the global step passed to ``adopt_weight`` as the
+        threshold for the discriminator factor; ``disc_loss`` must be
+        ``"hinge"`` or ``"vanilla"``.
+        """
+        super().__init__()
+        assert disc_loss in ["hinge", "vanilla"]
+        self.kl_weight = kl_weight
+        self.pixel_weight = pixelloss_weight
+        self.perceptual_loss = LPIPS().eval()
+        self.perceptual_weight = perceptual_weight
+        # output log variance
+        self.logvar = nn.Parameter(torch.ones(size=()) * logvar_init)
+        discriminator = NLayerDiscriminator(
+            input_nc=disc_in_channels, n_layers=disc_num_layers, use_actnorm=use_actnorm
+        )
+        self.discriminator = discriminator.apply(weights_init)
+        self.discriminator_iter_start = disc_start
+        if disc_loss == "hinge":
+            self.disc_loss = hinge_d_loss
+        else:
+            self.disc_loss = vanilla_d_loss
+        self.disc_factor = disc_factor
+        self.discriminator_weight = disc_weight
+        self.disc_conditional = disc_conditional
+
+    def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None):
+        """Balance the GAN term against the NLL term via gradient norms.
+
+        Returns ``clamp(|grad nll| / (|grad g| + 1e-4), 0, 1e4)``, detached and
+        scaled by ``self.discriminator_weight``. Gradients are taken w.r.t.
+        ``last_layer`` or, when it is None, ``self.last_layer[0]`` (an attribute
+        this class does not set itself).
+        """
+        reference_layer = last_layer if last_layer is not None else self.last_layer[0]
+        nll_grads = torch.autograd.grad(nll_loss, reference_layer, retain_graph=True)[0]
+        g_grads = torch.autograd.grad(g_loss, reference_layer, retain_graph=True)[0]
+        ratio = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4)
+        ratio = torch.clamp(ratio, 0.0, 1e4).detach()
+        return ratio * self.discriminator_weight
+
+    def _discriminator_input(self, tensor, cond):
+        """Return ``tensor``, concatenated with ``cond`` along dim 1 when given."""
+        if cond is None:
+            return tensor
+        return torch.cat((tensor, cond), dim=1)
+
+    def forward(
+        self,
+        inputs,
+        reconstructions,
+        posteriors,
+        optimizer_idx,
+        global_step,
+        waveform=None,
+        rec_waveform=None,
+        last_layer=None,
+        cond=None,
+        split="train",
+        weights=None,
+    ):
+        """Compute the loss and a log dict for the selected optimizer.
+
+        ``posteriors`` must expose ``kl()``. ``waveform`` and ``rec_waveform``
+        are accepted but not used. Returns ``(loss, log)`` for
+        ``optimizer_idx`` 0 or 1 and falls through (returning None) otherwise.
+        """
+        rec_loss = torch.abs(inputs.contiguous() - reconstructions.contiguous())
+        if self.perceptual_weight > 0:
+            p_loss = self.perceptual_loss(
+                inputs.contiguous(), reconstructions.contiguous()
+            )
+            rec_loss = rec_loss + self.perceptual_weight * p_loss
+        nll_loss = rec_loss / torch.exp(self.logvar) + self.logvar
+        weighted_nll_loss = nll_loss if weights is None else weights * nll_loss
+        # Sum over every element, then average over the leading dimension.
+        weighted_nll_loss = torch.sum(weighted_nll_loss) / weighted_nll_loss.shape[0]
+        nll_loss = torch.sum(nll_loss) / nll_loss.shape[0]
+        kl_loss = posteriors.kl()
+        kl_loss = torch.sum(kl_loss) / kl_loss.shape[0]
+
+        # now the GAN part
+        if optimizer_idx == 0:
+            # generator update
+            if cond is None:
+                assert not self.disc_conditional
+            else:
+                assert self.disc_conditional
+            logits_fake = self.discriminator(
+                self._discriminator_input(reconstructions.contiguous(), cond)
+            )
+            g_loss = -torch.mean(logits_fake)
+
+            d_weight = torch.tensor(0.0)
+            if self.disc_factor > 0.0:
+                try:
+                    d_weight = self.calculate_adaptive_weight(
+                        nll_loss, g_loss, last_layer=last_layer
+                    )
+                except RuntimeError:
+                    # Only tolerated outside training; the weight stays 0.
+                    assert not self.training
+
+            disc_factor = adopt_weight(
+                self.disc_factor, global_step, threshold=self.discriminator_iter_start
+            )
+            loss = (
+                weighted_nll_loss
+                + self.kl_weight * kl_loss
+                + d_weight * disc_factor * g_loss
+            )
+            log = {
+                "{}/total_loss".format(split): loss.clone().detach().mean(),
+                "{}/logvar".format(split): self.logvar.detach(),
+                "{}/kl_loss".format(split): kl_loss.detach().mean(),
+                "{}/nll_loss".format(split): nll_loss.detach().mean(),
+                "{}/rec_loss".format(split): rec_loss.detach().mean(),
+                "{}/d_weight".format(split): d_weight.detach(),
+                "{}/disc_factor".format(split): torch.tensor(disc_factor),
+                "{}/g_loss".format(split): g_loss.detach().mean(),
+            }
+            return loss, log
+
+        if optimizer_idx == 1:
+            # second pass for discriminator update
+            logits_real = self.discriminator(
+                self._discriminator_input(inputs.contiguous().detach(), cond)
+            )
+            logits_fake = self.discriminator(
+                self._discriminator_input(reconstructions.contiguous().detach(), cond)
+            )
+            disc_factor = adopt_weight(
+                self.disc_factor, global_step, threshold=self.discriminator_iter_start
+            )
+            d_loss = disc_factor * self.disc_loss(logits_real, logits_fake)
+            log = {
+                "{}/disc_loss".format(split): d_loss.clone().detach().mean(),
+                "{}/logits_real".format(split): logits_real.detach().mean(),
+                "{}/logits_fake".format(split): logits_fake.detach().mean(),
+            }
+            return d_loss, log

qa_mdt/audioldm_train/modules/.DS_Store ADDED Viewed

Binary file (8.2 kB). View file

qa_mdt/audioldm_train/modules/__init__.py ADDED Viewed

File without changes

qa_mdt/audioldm_train/modules/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (143 Bytes). View file

qa_mdt/audioldm_train/modules/audiomae/AudioMAE.py ADDED Viewed

	@@ -0,0 +1,151 @@

+"""
+Reference Repo: https://github.com/facebookresearch/AudioMAE
+"""
+import torch
+import torch.nn as nn
+from timm.models.layers import to_2tuple
+import qa_mdt.audioldm_train.modules.audiomae.models_vit as models_vit
+import qa_mdt.audioldm_train.modules.audiomae.models_mae as models_mae
+# model = mae_vit_base_patch16(in_chans=1, audio_exp=True, img_size=(1024, 128))
+class PatchEmbed_new(nn.Module):
+    """Flexible Image to Patch Embedding
+
+    Projects a ``[B, C, H, W]`` input to a sequence of patch embeddings with a
+    strided convolution; ``stride`` may be smaller than ``patch_size`` to get
+    overlapping patches. ``patch_hw`` and ``num_patches`` are derived by running
+    the projection once on a dummy input of size ``img_size``.
+    """
+    def __init__(
+        self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, stride=10
+    ):
+        super().__init__()
+        img_size = to_2tuple(img_size)
+        patch_size = to_2tuple(patch_size)
+        stride = to_2tuple(stride)
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.proj = nn.Conv2d(
+            in_chans, embed_dim, kernel_size=patch_size, stride=stride
+        )  # with overlapped patches
+        _, _, h, w = self.get_output_shape(img_size)  # n, emb_dim, h, w
+        self.patch_hw = (h, w)
+        self.num_patches = h * w
+    def get_output_shape(self, img_size):
+        """Return the shape of ``self.proj`` applied to one dummy image of ``img_size``."""
+        # The dummy input must have as many channels as the projection expects.
+        # The original always used a single channel, which made construction
+        # fail for any in_chans != 1 (including the default of 3).
+        with torch.no_grad():
+            dummy = torch.randn(1, self.proj.in_channels, img_size[0], img_size[1])
+            return self.proj(dummy).shape
+    def forward(self, x):
+        """Embed ``x`` of shape ``[B, C, H, W]`` as ``[B, num_patches, embed_dim]``."""
+        # The input size is deliberately not checked against self.img_size.
+        x = self.proj(x)
+        # [B, embed_dim, h, w] -> [B, h * w, embed_dim]
+        x = x.flatten(2).transpose(1, 2)
+        return x
+class AudioMAE(nn.Module):
+    """Audio Masked Autoencoder (MAE) pre-trained and finetuned on AudioSet (for SoundCLIP)"""
+    def __init__(
+        self,
+        checkpoint_path="/mnt/bn/data-xubo/project/Masked_AudioEncoder/checkpoint/finetuned.pth",
+    ):
+        """Build a ``vit_base_patch16`` classifier and load weights into it.
+
+        Args:
+            checkpoint_path: file passed to ``torch.load``; its ``"model"``
+                entry is loaded non-strictly. The default is the path that was
+                previously hard-coded, so existing callers are unaffected.
+        """
+        super().__init__()
+        model = models_vit.__dict__["vit_base_patch16"](
+            num_classes=527,
+            drop_path_rate=0.1,
+            global_pool=True,
+            mask_2d=True,
+            use_custom_patch=False,
+        )
+        img_size = (1024, 128)
+        emb_dim = 768
+        # Replace the patch embedding with a single-channel, non-overlapping
+        # 16x16 one sized for the (1024, 128) input.
+        model.patch_embed = PatchEmbed_new(
+            img_size=img_size,
+            patch_size=(16, 16),
+            in_chans=1,
+            embed_dim=emb_dim,
+            stride=16,
+        )
+        num_patches = model.patch_embed.num_patches
+        # num_patches = 512 # assume audioset, 1024//16=64, 128//16=8, 512=64x8
+        model.pos_embed = nn.Parameter(
+            torch.zeros(1, num_patches + 1, emb_dim), requires_grad=False
+        )  # fixed sin-cos embedding
+        # NOTE(review): torch.load unpickles the file; only use trusted checkpoints.
+        checkpoint = torch.load(checkpoint_path, map_location="cpu")
+        model.load_state_dict(checkpoint["model"], strict=False)
+        self.model = model
+    def forward(self, x, mask_t_prob=0.0, mask_f_prob=0.0):
+        """
+        x: mel fbank [Batch, 1, T, F]
+        mask_t_prob: 'T masking ratio (percentage of removed patches).'
+        mask_f_prob: 'F masking ratio (percentage of removed patches).'
+        """
+        return self.model(x=x, mask_t_prob=mask_t_prob, mask_f_prob=mask_f_prob)
+class Vanilla_AudioMAE(nn.Module):
+    """Audio Masked Autoencoder (MAE) pre-trained on AudioSet (for AudioLDM)"""
+    def __init__(
+        self,
+        checkpoint_path="data/checkpoints/audiomae_16k_128bins.ckpt",
+    ):
+        """Build ``mae_vit_base_patch16`` and load weights into it.
+
+        Args:
+            checkpoint_path: file passed to ``torch.load``; its ``"model"``
+                entry is loaded non-strictly. The default is the path that was
+                previously hard-coded, so existing callers are unaffected.
+        """
+        super().__init__()
+        model = models_mae.__dict__["mae_vit_base_patch16"](
+            in_chans=1, audio_exp=True, img_size=(1024, 128)
+        )
+        # NOTE(review): torch.load unpickles the file; only use trusted checkpoints.
+        checkpoint = torch.load(checkpoint_path, map_location="cpu")
+        # strict=False: skip the missing keys of decoder modules (not required)
+        model.load_state_dict(checkpoint["model"], strict=False)
+        self.model = model.eval()
+    def forward(self, x, mask_ratio=0.0, no_mask=False, no_average=False):
+        """
+        x: mel fbank [Batch, 1, 1024 (T), 128 (F)]
+        mask_ratio: unused; the masked encoder path is deprecated.
+
+        Only ``no_mask=True, no_average=False`` is supported and returns
+        ``self.model.forward_encoder_no_mask(x)`` computed without gradients.
+        Any other combination raises ``RuntimeError``.
+        """
+        # Both deprecated paths raised the same error, followed by unreachable
+        # encoder calls; those dead statements are dropped here.
+        if not no_mask or no_average:
+            raise RuntimeError("This function is deprecated")
+        with torch.no_grad():
+            embed = self.model.forward_encoder_no_mask(x)
+        return embed
+if __name__ == "__main__":
+    # Manual smoke test: requires CUDA and the default checkpoint file.
+    model = Vanilla_AudioMAE().cuda()
+    # Named `fbank` rather than `input` so the builtin is not shadowed.
+    fbank = torch.randn(4, 1, 1024, 128).cuda()
+    print("The first run")
+    embed = model(fbank, mask_ratio=0.0, no_mask=True)
+    print(embed)
+    print("The second run")
+    # Without no_mask=True, forward() raises RuntimeError (deprecated path);
+    # report that instead of ending the demo with a traceback.
+    try:
+        embed = model(fbank, mask_ratio=0.0)
+        print(embed)
+    except RuntimeError as err:
+        print("Masked forward is not available: %s" % err)

qa_mdt/audioldm_train/modules/audiomae/README.md ADDED Viewed

	@@ -0,0 +1,24 @@

+# A simple use of Audio Masked AutoEncoder (AudioMAE)
+Reference code: https://github.com/facebookresearch/AudioMAE
+Paper: https://arxiv.org/abs/2207.06405
+Install the required python packages:
+```
+pip install -r requirments.txt
+```
+See the usage in example.py
+  ```
+  python example.py
+  """
+  Load AudioMAE from /mnt/bn/data-xubo/project/Masked_AudioEncoder checkpoint/finetuned.pth / message: <All keys matched successfully>
+  Start evaluation on AudioSet ...
+  mAP: 0.463003
+  """
+  ```

qa_mdt/audioldm_train/modules/audiomae/__init__.py ADDED Viewed

File without changes

qa_mdt/audioldm_train/modules/audiomae/__pycache__/AudioMAE.cpython-310.pyc ADDED Viewed

Binary file (4.48 kB). View file

qa_mdt/audioldm_train/modules/audiomae/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (152 Bytes). View file

qa_mdt/audioldm_train/modules/audiomae/__pycache__/models_mae.cpython-310.pyc ADDED Viewed

Binary file (12.2 kB). View file

qa_mdt/audioldm_train/modules/audiomae/__pycache__/models_vit.cpython-310.pyc ADDED Viewed

Binary file (5.18 kB). View file

qa_mdt/audioldm_train/modules/audiomae/audiovisual_dataset.py ADDED Viewed

	@@ -0,0 +1,256 @@

+import json
+import random
+from tqdm import tqdm
+import torch
+import decord
+decord.bridge.set_bridge("torch")
+import torchaudio
+from math import ceil
+from torch.utils.data import Dataset, DataLoader
+import pandas as pd
+import numpy as np
+class AudioVisualDataset(Dataset):
+    """Can sample data from audio-visual databases
+    Params:
+    min_video_frames: used to drop short video clips
+    video_resize: resize for CLIP processing
+    sampling_rate: audio sampling rate
+    max_clip_len: max length (seconds) of audiovisual clip to be sampled
+    num_sample_frames: number of image frames to be uniformly sampled from video
+    """
+    def __init__(
+        self,
+        datafiles=[
+            "/mnt/bn/data-xubo/dataset/audioset_videos/datafiles/audioset_balanced_train.json",
+        ],
+        min_video_frames=30,
+        video_resize=[224, 224],
+        sampling_rate=16000,
+        sample_av_clip=True,
+        max_clip_len=10,
+        num_sample_frames=10,
+        # hyparameters used for SpecAug
+        freqm=48,
+        timem=192,
+        return_label=False,
+    ):
+        all_data_json = []
+        for datafile in datafiles:
+            with open(datafile, "r") as fp:
+                data_json = json.load(fp)["data"]
+                all_data_json.extend(data_json)
+        # drop short video clips
+        self.all_data_json = [
+            data
+            for data in all_data_json
+            if int(data["video_shape"][0]) >= min_video_frames
+        ]
+        self.max_clip_len = max_clip_len
+        self.video_resize = video_resize
+        self.sampling_rate = sampling_rate
+        self.sample_av_clip = sample_av_clip
+        self.num_sample_frames = num_sample_frames
+        self.corresponding_audio_len = self.sampling_rate * self.max_clip_len
+        # hyparameters used for AudioMAE
+        self.freqm = freqm
+        self.timem = timem
+        self.norm_mean = -4.2677393
+        self.norm_std = 4.5689974
+        self.melbins = 128
+        self.TARGET_LEN = 1024
+        self.return_label = return_label
+        if self.return_label:
+            self.audioset_label2idx = self._prepare_audioset()
+    def __len__(self):
+        return len(self.all_data_json)
+    def _read_audio_video(self, index):
+        try:
+            video_path = self.all_data_json[index]["mp4"]
+            # read audio
+            ar = decord.AudioReader(
+                video_path, sample_rate=self.sampling_rate, mono=True
+            )
+            # read video frames
+            vr = decord.VideoReader(
+                video_path,
+                height=self.video_resize[0],
+                width=self.video_resize[1],
+            )
+            labels = self.all_data_json[index]["labels"]
+            return vr, ar, labels
+        except Exception as e:
+            print(f"error: {e} occurs, when loading {video_path}")
+            random_index = random.randint(0, len(self.all_data_json) - 1)
+            return self._read_audio_video(index=random_index)
+    def _prepare_audioset(self):
+        df1 = pd.read_csv(
+            "/mnt/bn/lqhaoheliu/datasets/audioset/metadata/class_labels_indices.csv",
+            delimiter=",",
+            skiprows=0,
+        )
+        label_set = df1.to_numpy()
+        code2id = {}
+        for i in range(len(label_set)):
+            code2id[label_set[i][1]] = label_set[i][0]
+        return code2id
+    def __getitem__(self, index):
+        # read audio and video
+        vr, ar, labels = self._read_audio_video(index)
+        # create a audio tensor
+        audio_data = ar[:]  # [1, samples]
+        audio_len = audio_data.shape[1] / self.sampling_rate
+        audio_data = audio_data.squeeze(0)  # [samples]
+        # create a video tensor
+        full_vid_length = len(vr)
+        video_rate = ceil(vr.get_avg_fps())
+        samples_per_frame = float(self.sampling_rate) / video_rate
+        start_frame = 0
+        # sample video clip
+        if audio_len > self.max_clip_len and self.sample_av_clip:
+            start_frame = random.randint(
+                0, max(full_vid_length - video_rate * self.max_clip_len, 0)
+            )
+        end_frame = min(start_frame + video_rate * self.max_clip_len, full_vid_length)
+        video_data = vr.get_batch(range(start_frame, end_frame))
+        # sample audio clip
+        if audio_len > self.max_clip_len and self.sample_av_clip:
+            # corresponding_audio_len = int(video_data.size()[0] * samples_per_frame)
+            corresponding_audio_start = int(start_frame * samples_per_frame)
+            audio_data = audio_data[corresponding_audio_start:]
+        # cut or pad audio clip with respect to the sampled video clip
+        if audio_data.shape[0] < self.corresponding_audio_len:
+            zero_data = torch.zeros(self.corresponding_audio_len)
+            zero_data[: audio_data.shape[0]] = audio_data
+            audio_data = zero_data
+        elif audio_data.shape[0] > self.corresponding_audio_len:
+            audio_data = audio_data[: self.corresponding_audio_len]
+        # uniformly sample image frames from video [tentative solution]
+        interval = video_data.shape[0] // self.num_sample_frames
+        video_data = video_data[::interval][: self.num_sample_frames]
+        assert (
+            video_data.shape[0] == self.num_sample_frames
+        ), f"number of sampled image frames is {video_data.shape[0]}"
+        assert (
+            audio_data.shape[0] == self.corresponding_audio_len
+        ), f"number of audio samples is {audio_data.shape[0]}"
+        # video transformation
+        video_data = video_data / 255.0
+        video_data = video_data.permute(0, 3, 1, 2)  # [N, H, W, C] -> [N, C, H, W]
+        # calculate mel fbank of waveform for audio encoder
+        audio_data = audio_data.unsqueeze(0)  # [1, samples]
+        audio_data = audio_data - audio_data.mean()
+        fbank = torchaudio.compliance.kaldi.fbank(
+            audio_data,
+            htk_compat=True,
+            sample_frequency=self.sampling_rate,
+            use_energy=False,
+            window_type="hanning",
+            num_mel_bins=self.melbins,
+            dither=0.0,
+            frame_shift=10,
+        )
+        # cut and pad
+        n_frames = fbank.shape[0]
+        p = self.TARGET_LEN - n_frames
+        if p > 0:
+            m = torch.nn.ZeroPad2d((0, 0, 0, p))
+            fbank = m(fbank)
+        elif p < 0:
+            fbank = fbank[0 : self.TARGET_LEN, :]
+        # SpecAug for training (not for eval)
+        freqm = torchaudio.transforms.FrequencyMasking(self.freqm)
+        timem = torchaudio.transforms.TimeMasking(self.timem)
+        fbank = fbank.transpose(0, 1).unsqueeze(0)  # 1, 128, 1024 (...,freq,time)
+        if self.freqm != 0:
+            fbank = freqm(fbank)
+        if self.timem != 0:
+            fbank = timem(fbank)  # (..., freq, time)
+        fbank = torch.transpose(fbank.squeeze(), 0, 1)  # time, freq
+        fbank = (fbank - self.norm_mean) / (self.norm_std * 2)
+        fbank = fbank.unsqueeze(0)
+        if self.return_label:
+            # get audioset lebel indexes
+            label_indices = np.zeros(527)
+            for label_str in labels.split(","):
+                label_indices[int(self.audioset_label2idx[label_str])] = 1.0
+            label_indices = torch.FloatTensor(label_indices)
+            data_dict = {
+                "labels": label_indices,
+                "images": video_data,
+                "fbank": fbank,
+                # 'modality': 'audio_visual'
+            }
+        else:
+            data_dict = {
+                "images": video_data,
+                "fbank": fbank,
+                # 'modality': 'audio_visual'
+            }
+        return data_dict
+def collate_fn(list_data_dict):
+    r"""Collate mini-batch data to inputs and targets for training.
+    Args:
+        list_data_dict: e.g., [
+            {'vocals': (channels_num, segment_samples),
+             'accompaniment': (channels_num, segment_samples),
+             'mixture': (channels_num, segment_samples)
+            },
+            {'vocals': (channels_num, segment_samples),
+             'accompaniment': (channels_num, segment_samples),
+             'mixture': (channels_num, segment_samples)
+            },
+            ...]
+    Returns:
+        data_dict: e.g. {
+            'vocals': (batch_size, channels_num, segment_samples),
+            'accompaniment': (batch_size, channels_num, segment_samples),
+            'mixture': (batch_size, channels_num, segment_samples)
+            }
+    """
+    data_dict = {}
+    for key in list_data_dict[0].keys():
+        # for key in ['waveform']:
+        # try:
+        data_dict[key] = [data_dict[key] for data_dict in list_data_dict]
+        # except:
+        #     from IPython import embed; embed(using=False); os._exit(0)
+        data_dict[key] = torch.stack(data_dict[key])
+    return data_dict

qa_mdt/audioldm_train/modules/audiomae/example.py ADDED Viewed

	@@ -0,0 +1,52 @@

+import torch
+import torch.nn as nn
+import numpy as np
+from timm.models.layers import to_2tuple
+import models_vit
+from audiovisual_dataset import AudioVisualDataset, collate_fn
+from torch.utils.data import DataLoader
+from util.stat import calculate_stats
+from tqdm import tqdm
+from AudioMAE import AudioMAE
+if __name__ == "__main__":
+    device = "cuda"
+    dataset = AudioVisualDataset(
+        datafiles=[
+            "/mnt/bn/data-xubo/dataset/audioset_videos/datafiles/audioset_eval.json"
+        ],
+        # disable SpecAug during evaluation
+        freqm=0,
+        timem=0,
+        return_label=True,
+    )
+    model = AudioMAE().to(device)
+    model.eval()
+    outputs = []
+    targets = []
+    dataloader = DataLoader(
+        dataset, batch_size=64, num_workers=8, shuffle=False, collate_fn=collate_fn
+    )
+    print("Start evaluation on AudioSet ...")
+    with torch.no_grad():
+        for data in tqdm(dataloader):
+            fbank = data["fbank"]  # [B, 1, T, F]
+            fbank = fbank.to(device)
+            output = model(fbank, mask_t_prob=0.0, mask_f_prob=0.0)
+            target = data["labels"]
+            outputs.append(output)
+            targets.append(target)
+    outputs = torch.cat(outputs).cpu().numpy()
+    targets = torch.cat(targets).cpu().numpy()
+    stats = calculate_stats(outputs, targets)
+    AP = [stat["AP"] for stat in stats]
+    mAP = np.mean([stat["AP"] for stat in stats])
+    print("Done ... mAP: {:.6f}".format(mAP))
+    # mAP: 0.463003

qa_mdt/audioldm_train/modules/audiomae/models_mae.py ADDED Viewed

	@@ -0,0 +1,615 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# --------------------------------------------------------
+# References:
+# timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm
+# DeiT: https://github.com/facebookresearch/deit
+# --------------------------------------------------------
+from functools import partial
+from json import encoder
+import torch
+import torch.nn as nn
+from timm.models.vision_transformer import Block
+from qa_mdt.audioldm_train.modules.audiomae.util.pos_embed import (
+    get_2d_sincos_pos_embed,
+    get_2d_sincos_pos_embed_flexible,
+    get_1d_sincos_pos_embed_from_grid,
+)
+from qa_mdt.audioldm_train.modules.audiomae.util.patch_embed import (
+    PatchEmbed_new,
+    PatchEmbed_org,
+)
+class MaskedAutoencoderViT(nn.Module):
+    """Masked Autoencoder with VisionTransformer backbone"""
+    def __init__(
+        self,
+        img_size=224,
+        patch_size=16,
+        stride=10,
+        in_chans=3,
+        embed_dim=1024,
+        depth=24,
+        num_heads=16,
+        decoder_embed_dim=512,
+        decoder_depth=8,
+        decoder_num_heads=16,
+        mlp_ratio=4.0,
+        norm_layer=nn.LayerNorm,
+        norm_pix_loss=False,
+        audio_exp=False,
+        alpha=0.0,
+        temperature=0.2,
+        mode=0,
+        contextual_depth=8,
+        use_custom_patch=False,
+        split_pos=False,
+        pos_trainable=False,
+        use_nce=False,
+        beta=4.0,
+        decoder_mode=0,
+        mask_t_prob=0.6,
+        mask_f_prob=0.5,
+        mask_2d=False,
+        epoch=0,
+        no_shift=False,
+    ):
+        """Build the encoder, the decoder and the positional embeddings.
+        Args:
+            img_size, patch_size, in_chans, embed_dim: forwarded to the patch
+                embedding layer.
+            stride: patch stride; only forwarded to ``PatchEmbed_new`` when
+                ``use_custom_patch`` is set (also stored for ``patchify``).
+            depth, num_heads, mlp_ratio, norm_layer: configuration of the
+                encoder ``Block`` stack.
+            decoder_embed_dim: width of the decoder.
+            decoder_depth, decoder_num_heads: configuration of the decoder
+                ``Block`` stack; not used when ``decoder_mode == 1``, where a
+                fixed stack of 16 ``SwinTransformerBlock`` with 16 heads is
+                built instead.
+            norm_pix_loss: default normalization flag handed to
+                ``forward_loss`` by ``forward``.
+            audio_exp: selects the flexible (non-square) positional embedding
+                and the single-channel branch of ``patchify``.
+            contextual_depth: block index threshold read by
+                ``forward_encoder_no_mask``.
+            use_custom_patch: use ``PatchEmbed_new`` (overlapping patches)
+                instead of ``PatchEmbed_org``.
+            pos_trainable: ``requires_grad`` of both positional embeddings.
+            decoder_mode: 0 builds plain ``Block`` decoders, 1 builds the Swin
+                decoder.
+            mask_t_prob, mask_f_prob, mask_2d: masking configuration read by
+                ``forward`` / ``forward_encoder``.
+            no_shift: when set, every Swin decoder block uses shift (0, 0).
+            alpha, temperature, mode, use_nce, beta, epoch: stored on the
+                instance (``temperature`` as ``self.T``); no method of this
+                class reads them.
+            split_pos: accepted but never stored or read.
+        """
+        super().__init__()
+        self.audio_exp = audio_exp
+        self.embed_dim = embed_dim
+        self.decoder_embed_dim = decoder_embed_dim
+        # --------------------------------------------------------------------------
+        # MAE encoder specifics
+        if use_custom_patch:
+            print(
+                f"Use custom patch_emb with patch size: {patch_size}, stride: {stride}"
+            )
+            self.patch_embed = PatchEmbed_new(
+                img_size=img_size,
+                patch_size=patch_size,
+                in_chans=in_chans,
+                embed_dim=embed_dim,
+                stride=stride,
+            )
+        else:
+            self.patch_embed = PatchEmbed_org(img_size, patch_size, in_chans, embed_dim)
+        self.use_custom_patch = use_custom_patch
+        num_patches = self.patch_embed.num_patches
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+        # self.split_pos = split_pos # not useful
+        # One extra position (num_patches + 1) is reserved for the cls token.
+        self.pos_embed = nn.Parameter(
+            torch.zeros(1, num_patches + 1, embed_dim), requires_grad=pos_trainable
+        )  # sin-cos embedding, filled in by initialize_weights(); frozen unless pos_trainable
+        self.encoder_depth = depth
+        self.contextual_depth = contextual_depth
+        self.blocks = nn.ModuleList(
+            [
+                Block(
+                    embed_dim,
+                    num_heads,
+                    mlp_ratio,
+                    qkv_bias=True,
+                    norm_layer=norm_layer,
+                )  # qk_scale=None
+                for i in range(depth)
+            ]
+        )
+        self.norm = norm_layer(embed_dim)
+        # --------------------------------------------------------------------------
+        # MAE decoder specifics
+        # Linear projection from encoder width to decoder width.
+        self.decoder_embed = nn.Linear(embed_dim, decoder_embed_dim, bias=True)
+        # Learned placeholder inserted at every masked position before decoding.
+        self.mask_token = nn.Parameter(torch.zeros(1, 1, decoder_embed_dim))
+        self.decoder_pos_embed = nn.Parameter(
+            torch.zeros(1, num_patches + 1, decoder_embed_dim),
+            requires_grad=pos_trainable,
+        )  # sin-cos embedding, filled in by initialize_weights(); frozen unless pos_trainable
+        self.no_shift = no_shift
+        self.decoder_mode = decoder_mode
+        # Window and feature-map sizes for the Swin decoder are hard-coded for
+        # the two supported patch layouts.
+        if (
+            self.use_custom_patch
+        ):  # overlapped patches as in AST. Similar performance yet compute heavy
+            window_size = (6, 6)
+            feat_size = (102, 12)
+        else:
+            window_size = (4, 4)
+            feat_size = (64, 8)
+        if self.decoder_mode == 1:
+            # NOTE(review): SwinTransformerBlock is not among the imports visible
+            # at the top of this file -- confirm it is in scope before using
+            # decoder_mode=1.
+            decoder_modules = []
+            for index in range(16):
+                # Alternate unshifted / shifted windows unless no_shift is set;
+                # the shift is applied along the first axis only.
+                if self.no_shift:
+                    shift_size = (0, 0)
+                else:
+                    if (index % 2) == 0:
+                        shift_size = (0, 0)
+                    else:
+                        shift_size = (2, 0)
+                    # shift_size = tuple([0 if ((index % 2) == 0) else w // 2 for w in window_size])
+                decoder_modules.append(
+                    SwinTransformerBlock(
+                        dim=decoder_embed_dim,
+                        num_heads=16,
+                        feat_size=feat_size,
+                        window_size=window_size,
+                        shift_size=shift_size,
+                        mlp_ratio=mlp_ratio,
+                        drop=0.0,
+                        drop_attn=0.0,
+                        drop_path=0.0,
+                        extra_norm=False,
+                        sequential_attn=False,
+                        norm_layer=norm_layer,  # nn.LayerNorm,
+                    )
+                )
+            self.decoder_blocks = nn.ModuleList(decoder_modules)
+        else:
+            # Transformer
+            self.decoder_blocks = nn.ModuleList(
+                [
+                    Block(
+                        decoder_embed_dim,
+                        decoder_num_heads,
+                        mlp_ratio,
+                        qkv_bias=True,
+                        norm_layer=norm_layer,
+                    )  # qk_scale=None,
+                    for i in range(decoder_depth)
+                ]
+            )
+        self.decoder_norm = norm_layer(decoder_embed_dim)
+        # Predicts the raw values of one patch (patch_size**2 * in_chans) per token.
+        self.decoder_pred = nn.Linear(
+            decoder_embed_dim, patch_size**2 * in_chans, bias=True
+        )  # decoder to patch
+        # --------------------------------------------------------------------------
+        self.norm_pix_loss = norm_pix_loss
+        self.patch_size = patch_size
+        self.stride = stride
+        # audio exps
+        self.alpha = alpha
+        self.T = temperature
+        self.mode = mode
+        self.use_nce = use_nce
+        self.beta = beta
+        self.log_softmax = nn.LogSoftmax(dim=-1)
+        self.mask_t_prob = mask_t_prob
+        self.mask_f_prob = mask_f_prob
+        self.mask_2d = mask_2d
+        self.epoch = epoch
+        # Must run last: it fills the positional embeddings and re-initializes
+        # every Linear / LayerNorm created above.
+        self.initialize_weights()
+    def initialize_weights(self):
+        # initialization
+        # initialize (and freeze) pos_embed by sin-cos embedding
+        if self.audio_exp:
+            pos_embed = get_2d_sincos_pos_embed_flexible(
+                self.pos_embed.shape[-1], self.patch_embed.patch_hw, cls_token=True
+            )
+        else:
+            pos_embed = get_2d_sincos_pos_embed(
+                self.pos_embed.shape[-1],
+                int(self.patch_embed.num_patches**0.5),
+                cls_token=True,
+            )
+        self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))
+        if self.audio_exp:
+            decoder_pos_embed = get_2d_sincos_pos_embed_flexible(
+                self.decoder_pos_embed.shape[-1],
+                self.patch_embed.patch_hw,
+                cls_token=True,
+            )
+        else:
+            decoder_pos_embed = get_2d_sincos_pos_embed(
+                self.decoder_pos_embed.shape[-1],
+                int(self.patch_embed.num_patches**0.5),
+                cls_token=True,
+            )
+        self.decoder_pos_embed.data.copy_(
+            torch.from_numpy(decoder_pos_embed).float().unsqueeze(0)
+        )
+        # initialize patch_embed like nn.Linear (instead of nn.Conv2d)
+        w = self.patch_embed.proj.weight.data
+        torch.nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
+        # timm's trunc_normal_(std=.02) is effectively normal_(std=0.02) as cutoff is too big (2.)
+        torch.nn.init.normal_(self.cls_token, std=0.02)
+        torch.nn.init.normal_(self.mask_token, std=0.02)
+        # initialize nn.Linear and nn.LayerNorm
+        self.apply(self._init_weights)
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            # we use xavier_uniform following official JAX ViT:
+            torch.nn.init.xavier_uniform_(m.weight)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+    def patchify(self, imgs):
+        """
+        imgs: (N, 3, H, W)
+        x: (N, L, patch_size**2 *3)
+        L = (H/p)*(W/p)
+        """
+        p = self.patch_embed.patch_size[0]
+        # assert imgs.shape[2] == imgs.shape[3] and imgs.shape[2] % p == 0
+        if self.audio_exp:
+            if self.use_custom_patch:  # overlapped patch
+                h, w = self.patch_embed.patch_hw
+                # todo: fixed h/w patch size and stride size. Make hw custom in the future
+                x = imgs.unfold(2, self.patch_size, self.stride).unfold(
+                    3, self.patch_size, self.stride
+                )  # n,1,H,W -> n,1,h,w,p,p
+                x = x.reshape(shape=(imgs.shape[0], h * w, p**2 * 1))
+                # x = imgs.reshape(shape=(imgs.shape[0], 1, h, p, w, p))
+                # x = torch.einsum('nchpwq->nhwpqc', x)
+                # x = x.reshape(shape=(imgs.shape[0], h * w, p**2 * 1))
+            else:
+                h = imgs.shape[2] // p
+                w = imgs.shape[3] // p
+                # h,w = self.patch_embed.patch_hw
+                x = imgs.reshape(shape=(imgs.shape[0], 1, h, p, w, p))
+                x = torch.einsum("nchpwq->nhwpqc", x)
+                x = x.reshape(shape=(imgs.shape[0], h * w, p**2 * 1))
+        else:
+            h = w = imgs.shape[2] // p
+            x = imgs.reshape(shape=(imgs.shape[0], 3, h, p, w, p))
+            x = torch.einsum("nchpwq->nhwpqc", x)
+            x = x.reshape(shape=(imgs.shape[0], h * w, p**2 * 3))
+        return x
+    def unpatchify(self, x):
+        """
+        x: (N, L, patch_size**2 *3)
+        specs: (N, 1, H, W)
+        """
+        p = self.patch_embed.patch_size[0]
+        h = 1024 // p
+        w = 128 // p
+        x = x.reshape(shape=(x.shape[0], h, w, p, p, 1))
+        x = torch.einsum("nhwpqc->nchpwq", x)
+        specs = x.reshape(shape=(x.shape[0], 1, h * p, w * p))
+        return specs
+    def random_masking(self, x, mask_ratio):
+        """
+        Perform per-sample random masking by per-sample shuffling.
+        Per-sample shuffling is done by argsort random noise.
+        x: [N, L, D], sequence
+        """
+        N, L, D = x.shape  # batch, length, dim
+        len_keep = int(L * (1 - mask_ratio))
+        noise = torch.rand(N, L, device=x.device)  # noise in [0, 1]
+        # sort noise for each sample
+        ids_shuffle = torch.argsort(
+            noise, dim=1
+        )  # ascend: small is keep, large is remove
+        ids_restore = torch.argsort(ids_shuffle, dim=1)
+        # keep the first subset
+        ids_keep = ids_shuffle[:, :len_keep]
+        x_masked = torch.gather(x, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, D))
+        # generate the binary mask: 0 is keep, 1 is remove
+        mask = torch.ones([N, L], device=x.device)
+        mask[:, :len_keep] = 0
+        # unshuffle to get the binary mask
+        mask = torch.gather(mask, dim=1, index=ids_restore)
+        return x_masked, mask, ids_restore
+    def random_masking_2d(self, x, mask_t_prob, mask_f_prob):
+        """
+        2D: Spectrogram (msking t and f under mask_t_prob and mask_f_prob)
+        Perform per-sample random masking by per-sample shuffling.
+        Per-sample shuffling is done by argsort random noise.
+        x: [N, L, D], sequence
+        """
+        N, L, D = x.shape  # batch, length, dim
+        if self.use_custom_patch:  # overlapped patch
+            T = 101
+            F = 12
+        else:
+            T = 64
+            F = 8
+        # x = x.reshape(N, T, F, D)
+        len_keep_t = int(T * (1 - mask_t_prob))
+        len_keep_f = int(F * (1 - mask_f_prob))
+        # noise for mask in time
+        noise_t = torch.rand(N, T, device=x.device)  # noise in [0, 1]
+        # sort noise for each sample aling time
+        ids_shuffle_t = torch.argsort(
+            noise_t, dim=1
+        )  # ascend: small is keep, large is remove
+        ids_restore_t = torch.argsort(ids_shuffle_t, dim=1)
+        ids_keep_t = ids_shuffle_t[:, :len_keep_t]
+        # noise mask in freq
+        noise_f = torch.rand(N, F, device=x.device)  # noise in [0, 1]
+        ids_shuffle_f = torch.argsort(
+            noise_f, dim=1
+        )  # ascend: small is keep, large is remove
+        ids_restore_f = torch.argsort(ids_shuffle_f, dim=1)
+        ids_keep_f = ids_shuffle_f[:, :len_keep_f]  #
+        # generate the binary mask: 0 is keep, 1 is remove
+        # mask in freq
+        mask_f = torch.ones(N, F, device=x.device)
+        mask_f[:, :len_keep_f] = 0
+        mask_f = (
+            torch.gather(mask_f, dim=1, index=ids_restore_f)
+            .unsqueeze(1)
+            .repeat(1, T, 1)
+        )  # N,T,F
+        # mask in time
+        mask_t = torch.ones(N, T, device=x.device)
+        mask_t[:, :len_keep_t] = 0
+        mask_t = (
+            torch.gather(mask_t, dim=1, index=ids_restore_t)
+            .unsqueeze(1)
+            .repeat(1, F, 1)
+            .permute(0, 2, 1)
+        )  # N,T,F
+        mask = 1 - (1 - mask_t) * (1 - mask_f)  # N, T, F
+        # get masked x
+        id2res = torch.Tensor(list(range(N * T * F))).reshape(N, T, F).to(x.device)
+        id2res = id2res + 999 * mask  # add a large value for masked elements
+        id2res2 = torch.argsort(id2res.flatten(start_dim=1))
+        ids_keep = id2res2.flatten(start_dim=1)[:, : len_keep_f * len_keep_t]
+        x_masked = torch.gather(x, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, D))
+        ids_restore = torch.argsort(id2res2.flatten(start_dim=1))
+        mask = mask.flatten(start_dim=1)
+        return x_masked, mask, ids_restore
+    def forward_encoder(self, x, mask_ratio, mask_2d=False):
+        # embed patches
+        x = self.patch_embed(x)
+        # add pos embed w/o cls token
+        x = x + self.pos_embed[:, 1:, :]
+        # masking: length -> length * mask_ratio
+        if mask_2d:
+            x, mask, ids_restore = self.random_masking_2d(
+                x, mask_t_prob=self.mask_t_prob, mask_f_prob=self.mask_f_prob
+            )
+        else:
+            x, mask, ids_restore = self.random_masking(x, mask_ratio)
+        # append cls token
+        cls_token = self.cls_token + self.pos_embed[:, :1, :]
+        cls_tokens = cls_token.expand(x.shape[0], -1, -1)
+        x = torch.cat((cls_tokens, x), dim=1)
+        # apply Transformer blocks
+        for blk in self.blocks:
+            x = blk(x)
+        x = self.norm(x)
+        return x, mask, ids_restore, None
+    def forward_encoder_no_random_mask_no_average(self, x):
+        # embed patches
+        x = self.patch_embed(x)
+        # add pos embed w/o cls token
+        x = x + self.pos_embed[:, 1:, :]
+        # masking: length -> length * mask_ratio
+        # if mask_2d:
+        #     x, mask, ids_restore = self.random_masking_2d(x, mask_t_prob=self.mask_t_prob, mask_f_prob=self.mask_f_prob)
+        # else:
+        #     x, mask, ids_restore = self.random_masking(x, mask_ratio)
+        # append cls token
+        cls_token = self.cls_token + self.pos_embed[:, :1, :]
+        cls_tokens = cls_token.expand(x.shape[0], -1, -1)
+        x = torch.cat((cls_tokens, x), dim=1)
+        # apply Transformer blocks
+        for blk in self.blocks:
+            x = blk(x)
+        x = self.norm(x)
+        return x
+    def forward_encoder_no_mask(self, x):
+        # embed patches
+        x = self.patch_embed(x)
+        # add pos embed w/o cls token
+        x = x + self.pos_embed[:, 1:, :]
+        # masking: length -> length * mask_ratio
+        # x, mask, ids_restore = self.random_masking(x, mask_ratio)
+        # append cls token
+        cls_token = self.cls_token + self.pos_embed[:, :1, :]
+        cls_tokens = cls_token.expand(x.shape[0], -1, -1)
+        x = torch.cat((cls_tokens, x), dim=1)
+        # apply Transformer blocks
+        contextual_embs = []
+        for n, blk in enumerate(self.blocks):
+            x = blk(x)
+            if n > self.contextual_depth:
+                contextual_embs.append(self.norm(x))
+        # x = self.norm(x)
+        contextual_emb = torch.stack(contextual_embs, dim=0).mean(dim=0)
+        return contextual_emb
+    def forward_decoder(self, x, ids_restore):
+        # embed tokens
+        x = self.decoder_embed(x)
+        # append mask tokens to sequence
+        mask_tokens = self.mask_token.repeat(
+            x.shape[0], ids_restore.shape[1] + 1 - x.shape[1], 1
+        )
+        x_ = torch.cat([x[:, 1:, :], mask_tokens], dim=1)  # no cls token
+        x_ = torch.gather(
+            x_, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, x.shape[2])
+        )  # unshuffle
+        x = torch.cat([x[:, :1, :], x_], dim=1)  # append cls token
+        # add pos embed
+        x = x + self.decoder_pos_embed
+        if self.decoder_mode != 0:
+            B, L, D = x.shape
+            x = x[:, 1:, :]
+            if self.use_custom_patch:
+                x = x.reshape(B, 101, 12, D)
+                x = torch.cat([x, x[:, -1, :].unsqueeze(1)], dim=1)  # hack
+                x = x.reshape(B, 1224, D)
+        if self.decoder_mode > 3:  # mvit
+            x = self.decoder_blocks(x)
+        else:
+            # apply Transformer blocks
+            for blk in self.decoder_blocks:
+                x = blk(x)
+        x = self.decoder_norm(x)
+        # predictor projection
+        pred = self.decoder_pred(x)
+        # remove cls token
+        if self.decoder_mode != 0:
+            if self.use_custom_patch:
+                pred = pred.reshape(B, 102, 12, 256)
+                pred = pred[:, :101, :, :]
+                pred = pred.reshape(B, 1212, 256)
+            else:
+                pred = pred
+        else:
+            pred = pred[:, 1:, :]
+        return pred, None, None  # emb, emb_pixel
+    def forward_loss(self, imgs, pred, mask, norm_pix_loss=False):
+        """
+        imgs: [N, 3, H, W]
+        pred: [N, L, p*p*3]
+        mask: [N, L], 0 is keep, 1 is remove,
+        """
+        target = self.patchify(imgs)
+        if norm_pix_loss:
+            mean = target.mean(dim=-1, keepdim=True)
+            var = target.var(dim=-1, keepdim=True)
+            target = (target - mean) / (var + 1.0e-6) ** 0.5
+        loss = (pred - target) ** 2
+        loss = loss.mean(dim=-1)  # [N, L], mean loss per patch
+        loss = (loss * mask).sum() / mask.sum()  # mean loss on removed patches
+        return loss
+    def forward(self, imgs, mask_ratio=0.8):
+        emb_enc, mask, ids_restore, _ = self.forward_encoder(
+            imgs, mask_ratio, mask_2d=self.mask_2d
+        )
+        pred, _, _ = self.forward_decoder(emb_enc, ids_restore)  # [N, L, p*p*3]
+        loss_recon = self.forward_loss(
+            imgs, pred, mask, norm_pix_loss=self.norm_pix_loss
+        )
+        loss_contrastive = torch.FloatTensor([0.0]).cuda()
+        return loss_recon, pred, mask, loss_contrastive
+def mae_vit_small_patch16_dec512d8b(**kwargs):
+    model = MaskedAutoencoderViT(
+        patch_size=16,
+        embed_dim=384,
+        depth=12,
+        num_heads=6,
+        decoder_embed_dim=512,
+        decoder_num_heads=16,
+        mlp_ratio=4,
+        norm_layer=partial(nn.LayerNorm, eps=1e-6),
+        **kwargs,
+    )
+    return model
+def mae_vit_base_patch16_dec512d8b(**kwargs):
+    model = MaskedAutoencoderViT(
+        patch_size=16,
+        embed_dim=768,
+        depth=12,
+        num_heads=12,
+        decoder_embed_dim=512,
+        decoder_num_heads=16,
+        mlp_ratio=4,
+        norm_layer=partial(nn.LayerNorm, eps=1e-6),
+        **kwargs,
+    )
+    return model
+def mae_vit_large_patch16_dec512d8b(**kwargs):
+    model = MaskedAutoencoderViT(
+        patch_size=16,
+        embed_dim=1024,
+        depth=24,
+        num_heads=16,
+        decoder_embed_dim=512,
+        decoder_num_heads=16,
+        mlp_ratio=4,
+        norm_layer=partial(nn.LayerNorm, eps=1e-6),
+        **kwargs,
+    )
+    return model
+def mae_vit_huge_patch14_dec512d8b(**kwargs):
+    model = MaskedAutoencoderViT(
+        patch_size=14,
+        embed_dim=1280,
+        depth=32,
+        num_heads=16,
+        decoder_embed_dim=512,
+        decoder_num_heads=16,
+        mlp_ratio=4,
+        norm_layer=partial(nn.LayerNorm, eps=1e-6),
+        **kwargs,
+    )
+    return model
+# Short aliases for the recommended architectures. Every factory passes a
+# 512-dim decoder explicitly and relies on the class default decoder_depth=8.
+mae_vit_base_patch16 = mae_vit_base_patch16_dec512d8b  # decoder: 512 dim, 8 blocks
+mae_vit_large_patch16 = mae_vit_large_patch16_dec512d8b  # decoder: 512 dim, 8 blocks
+mae_vit_huge_patch14 = mae_vit_huge_patch14_dec512d8b  # decoder: 512 dim, 8 blocks
+mae_vit_small_patch16 = mae_vit_small_patch16_dec512d8b  # decoder: 512 dim, 8 blocks

qa_mdt/audioldm_train/modules/audiomae/models_vit.py ADDED Viewed

	@@ -0,0 +1,252 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# --------------------------------------------------------
+# References:
+# timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm
+# DeiT: https://github.com/facebookresearch/deit
+# --------------------------------------------------------
+from functools import partial
+import torch
+import torch.nn as nn
+import numpy as np
+import timm.models.vision_transformer
+from timm.models.vision_transformer import PatchEmbed, Block
+from qa_mdt.audioldm_train.modules.audiomae.util.patch_embed import (
+    PatchEmbed_new,
+    PatchEmbed3D_new,
+)
+class VisionTransformer(timm.models.vision_transformer.VisionTransformer):
+    """Vision Transformer with support for global average pooling.
+
+    Extends the timm ViT with an optional mean-pooled output (``global_pool``)
+    and with random token masking (1D over the token sequence, or 2D over a
+    time/frequency patch grid) applied before the Transformer blocks.
+    """
+    def __init__(
+        self, global_pool=False, mask_2d=True, use_custom_patch=False, **kwargs
+    ):
+        """Build the timm backbone and store the pooling / masking options.
+
+        Args:
+            global_pool: If True, the output is the mean of the patch tokens
+                (class token excluded) passed through a new ``fc_norm`` layer.
+                ``kwargs`` must then contain ``norm_layer`` and ``embed_dim``.
+            mask_2d: If True, masked forward passes use ``random_masking_2d``;
+                otherwise they use ``random_masking``.
+            use_custom_patch: Selects which hard-coded patch-grid size
+                ``random_masking_2d`` assumes.
+            **kwargs: Forwarded unchanged to the timm ``VisionTransformer``.
+        """
+        super(VisionTransformer, self).__init__(**kwargs)
+        self.global_pool = global_pool
+        if self.global_pool:
+            norm_layer = kwargs["norm_layer"]
+            embed_dim = kwargs["embed_dim"]
+            self.fc_norm = norm_layer(embed_dim)
+        # NOTE(review): the original norm is removed unconditionally, yet the
+        # ``global_pool=False`` path still calls ``self.norm`` and would raise
+        # AttributeError. Left untouched so the set of registered parameters
+        # does not change -- confirm the intended behaviour.
+        del self.norm  # remove the original norm
+        self.mask_2d = mask_2d
+        self.use_custom_patch = use_custom_patch
+    def _encode_tokens(self, x):
+        """Prepend the class token, run the Transformer blocks and pool.
+
+        Args:
+            x: [N, L, D] patch tokens that already carry their position
+                embedding (and are possibly masked).
+
+        Returns:
+            The ``fc_norm``-ed mean of the patch tokens when ``global_pool`` is
+            set, otherwise the normed class token.
+        """
+        cls_token = self.cls_token + self.pos_embed[:, :1, :]
+        cls_tokens = cls_token.expand(
+            x.shape[0], -1, -1
+        )  # stole cls_tokens impl from Phil Wang, thanks
+        x = torch.cat((cls_tokens, x), dim=1)
+        x = self.pos_drop(x)
+        # apply Transformer blocks
+        for blk in self.blocks:
+            x = blk(x)
+        if self.global_pool:
+            x = x[:, 1:, :].mean(dim=1)  # global pool without cls token
+            return self.fc_norm(x)
+        x = self.norm(x)
+        return x[:, 0]
+    def forward_features(self, x):
+        """Embed ``x`` into patches and encode them without any masking."""
+        x = self.patch_embed(x)
+        x = x + self.pos_embed[:, 1:, :]
+        return self._encode_tokens(x)
+    def random_masking(self, x, mask_ratio):
+        """
+        Perform per-sample random masking by per-sample shuffling.
+        Per-sample shuffling is done by argsort random noise.
+
+        Args:
+            x: [N, L, D], sequence
+            mask_ratio: fraction of the L tokens to drop.
+
+        Returns:
+            x_masked: [N, len_keep, D] kept tokens, with
+                ``len_keep = int(L * (1 - mask_ratio))``.
+            mask: [N, L] mask in the original token order, 0 is keep, 1 is remove.
+            ids_restore: [N, L] indices that undo the shuffle.
+        """
+        N, L, D = x.shape  # batch, length, dim
+        len_keep = int(L * (1 - mask_ratio))
+        noise = torch.rand(N, L, device=x.device)  # noise in [0, 1]
+        # sort noise for each sample
+        ids_shuffle = torch.argsort(
+            noise, dim=1
+        )  # ascend: small is keep, large is remove
+        ids_restore = torch.argsort(ids_shuffle, dim=1)
+        # keep the first subset
+        ids_keep = ids_shuffle[:, :len_keep]
+        x_masked = torch.gather(x, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, D))
+        # generate the binary mask: 0 is keep, 1 is remove
+        mask = torch.ones([N, L], device=x.device)
+        mask[:, :len_keep] = 0
+        # unshuffle to get the binary mask
+        mask = torch.gather(mask, dim=1, index=ids_restore)
+        return x_masked, mask, ids_restore
+    def random_masking_2d(self, x, mask_t_prob, mask_f_prob):
+        """
+        2D: Spectrogram (masking t and f under mask_t_prob and mask_f_prob)
+        Perform per-sample random masking by per-sample shuffling.
+        Per-sample shuffling is done by argsort random noise.
+
+        Args:
+            x: [N, L, D], sequence; L must equal the hard-coded T * F grid
+                selected below, otherwise the reshape fails.
+            mask_t_prob: fraction of the T rows to drop.
+            mask_f_prob: fraction of the F columns to drop.
+
+        Returns:
+            ``(x_masked, None, None)`` where x_masked is [N, T' * F', D]; no
+            mask or restore indices are produced by the 2D variant.
+        """
+        N, L, D = x.shape  # batch, length, dim
+        if self.use_custom_patch:
+            # for AS (ESC: T=50, F=12; SPC: T=12, F=12)
+            T = 101  # 64,101
+            F = 12  # 8,12
+        else:
+            # for AS (ESC: T=32, F=8; SPC: T=8, F=8)
+            T = 64
+            F = 8
+        # mask T
+        x = x.reshape(N, T, F, D)
+        len_keep_T = int(T * (1 - mask_t_prob))
+        noise = torch.rand(N, T, device=x.device)  # noise in [0, 1]
+        # sort noise for each sample
+        ids_shuffle = torch.argsort(
+            noise, dim=1
+        )  # ascend: small is keep, large is remove
+        ids_keep = ids_shuffle[:, :len_keep_T]
+        index = ids_keep.unsqueeze(-1).unsqueeze(-1).repeat(1, 1, F, D)
+        x = torch.gather(x, dim=1, index=index)  # N, len_keep_T(T'), F, D
+        # mask F
+        x = x.permute(0, 2, 1, 3)  # N T' F D => N F T' D
+        len_keep_F = int(F * (1 - mask_f_prob))
+        noise = torch.rand(N, F, device=x.device)  # noise in [0, 1]
+        # sort noise for each sample
+        ids_shuffle = torch.argsort(
+            noise, dim=1
+        )  # ascend: small is keep, large is remove
+        ids_keep = ids_shuffle[:, :len_keep_F]
+        index = ids_keep.unsqueeze(-1).unsqueeze(-1).repeat(1, 1, len_keep_T, D)
+        x_masked = torch.gather(x, dim=1, index=index)
+        x_masked = x_masked.permute(0, 2, 1, 3)  # N F' T' D => N T' F' D
+        x_masked = x_masked.reshape(N, len_keep_F * len_keep_T, D)
+        return x_masked, None, None
+    def forward_features_mask(self, x, mask_t_prob, mask_f_prob):
+        """Embed ``x`` into patches, randomly drop tokens, then encode the rest.
+
+        With ``mask_2d`` set, both probabilities are used by
+        ``random_masking_2d``; otherwise ``mask_t_prob`` alone is used as the
+        1D mask ratio and ``mask_f_prob`` is ignored.
+        """
+        x = self.patch_embed(x)
+        x = x + self.pos_embed[:, 1:, :]
+        # Branch on the configuration flag; the previous check tested the bound
+        # method ``self.random_masking_2d``, which is always truthy, so the 1D
+        # branch could never run.
+        if self.mask_2d:
+            x, mask, ids_restore = self.random_masking_2d(x, mask_t_prob, mask_f_prob)
+        else:
+            x, mask, ids_restore = self.random_masking(x, mask_t_prob)
+        return self._encode_tokens(x)
+    # overwrite original timm
+    def forward(self, x, v=None, mask_t_prob=0.0, mask_f_prob=0.0):
+        """Encode ``x`` (masked if either probability is > 0) and apply the head.
+
+        ``v`` is accepted for interface compatibility and is not used.
+        """
+        if mask_t_prob > 0.0 or mask_f_prob > 0.0:
+            x = self.forward_features_mask(
+                x, mask_t_prob=mask_t_prob, mask_f_prob=mask_f_prob
+            )
+        else:
+            x = self.forward_features(x)
+        x = self.head(x)
+        return x
+def vit_small_patch16(**kwargs):
+    """Build a VisionTransformer with patch 16, width 384, 12 blocks, 6 heads.
+
+    Extra keyword arguments are forwarded to ``VisionTransformer``.
+    """
+    layer_norm = partial(nn.LayerNorm, eps=1e-6)
+    return VisionTransformer(
+        norm_layer=layer_norm,
+        qkv_bias=True,
+        mlp_ratio=4,
+        num_heads=6,
+        depth=12,
+        embed_dim=384,
+        patch_size=16,
+        **kwargs
+    )
+def vit_base_patch16(**kwargs):
+    """Build a VisionTransformer with patch 16, width 768, 12 blocks, 12 heads.
+
+    Extra keyword arguments are forwarded to ``VisionTransformer``.
+    """
+    layer_norm = partial(nn.LayerNorm, eps=1e-6)
+    return VisionTransformer(
+        norm_layer=layer_norm,
+        qkv_bias=True,
+        mlp_ratio=4,
+        num_heads=12,
+        depth=12,
+        embed_dim=768,
+        patch_size=16,
+        **kwargs
+    )
+def vit_large_patch16(**kwargs):
+    """Build a VisionTransformer with patch 16, width 1024, 24 blocks, 16 heads.
+
+    Extra keyword arguments are forwarded to ``VisionTransformer``.
+    """
+    layer_norm = partial(nn.LayerNorm, eps=1e-6)
+    return VisionTransformer(
+        norm_layer=layer_norm,
+        qkv_bias=True,
+        mlp_ratio=4,
+        num_heads=16,
+        depth=24,
+        embed_dim=1024,
+        patch_size=16,
+        **kwargs
+    )
+def vit_huge_patch14(**kwargs):
+    """Build a VisionTransformer with patch 14, width 1280, 32 blocks, 16 heads.
+
+    Extra keyword arguments are forwarded to ``VisionTransformer``.
+    """
+    layer_norm = partial(nn.LayerNorm, eps=1e-6)
+    return VisionTransformer(
+        norm_layer=layer_norm,
+        qkv_bias=True,
+        mlp_ratio=4,
+        num_heads=16,
+        depth=32,
+        embed_dim=1280,
+        patch_size=14,
+        **kwargs
+    )

qa_mdt/audioldm_train/modules/audiomae/sequence_gen/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from .sequence_input import Sequence2AudioMAE
2	+ from .model import CLAP2AudioMAE

qa_mdt/audioldm_train/modules/audiomae/sequence_gen/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (264 Bytes). View file

qa_mdt/audioldm_train/modules/audiomae/sequence_gen/__pycache__/model.cpython-310.pyc ADDED Viewed

Binary file (7.18 kB). View file

qa_mdt/audioldm_train/modules/audiomae/sequence_gen/__pycache__/sequence_input.cpython-310.pyc ADDED Viewed

Binary file (13.6 kB). View file

qa_mdt/audioldm_train/modules/audiomae/sequence_gen/model.py ADDED Viewed

	@@ -0,0 +1,329 @@

+import torch
+import torch.nn as nn
+import pytorch_lightning as pl
+from qa_mdt.audioldm_train.utilities.model_util import (
+    exists,
+    default,
+    mean_flat,
+    count_params,
+    instantiate_from_config,
+)
+from transformers import GPT2Config, GPT2Model
+import torch.optim.lr_scheduler as lr_scheduler
+class Prenet(nn.Module):
+    """Stack of ``Linear -> ReLU -> Dropout`` layers.
+
+    Args:
+        in_dim: Size of the last dimension of the input.
+        sizes: Output size of each linear layer, in order. Any sequence is
+            accepted; it is copied, so the caller's object is never shared.
+        dropout_rate: Dropout probability applied after every layer.
+    """
+    def __init__(self, in_dim, sizes=(256, 128), dropout_rate=0.5):
+        super(Prenet, self).__init__()
+        # Immutable default plus a local copy: avoids the shared mutable
+        # default-argument pitfall and lets tuples be passed in.
+        sizes = list(sizes)
+        in_sizes = [in_dim] + sizes[:-1]
+        self.layers = nn.ModuleList(
+            [
+                nn.Linear(in_size, out_size)
+                for (in_size, out_size) in zip(in_sizes, sizes)
+            ]
+        )
+        self.relu = nn.ReLU()
+        self.dropout = nn.Dropout(dropout_rate)
+    def forward(self, inputs):
+        """Apply every layer in turn and return the final activation."""
+        for linear in self.layers:
+            inputs = self.dropout(self.relu(linear(inputs)))
+        return inputs
+class CLAP2AudioMAE(pl.LightningModule):
+    """GPT-2 based regressor from a projected conditioning embedding to a token sequence.
+
+    The ``film_clap_cond1`` conditioning is projected from 512 to 768
+    dimensions and used as the first input step of a pretrained GPT-2. The
+    model is trained with an MSE loss to predict, step by step, the embeddings
+    stored under ``crossattn_audiomae_pooled`` (or
+    ``crossattn_audiomae_pooled_44`` when that key is present).
+    """
+    def __init__(
+        self,
+        sequence_gen_length,
+        base_learning_rate,
+        cond_stage_config,
+        use_audiomae_linear=False,
+        **kwargs
+    ):
+        """Create the conditioning models, the GPT-2 backbone and the projection.
+
+        Args:
+            sequence_gen_length: Number of steps produced by ``generate``.
+            base_learning_rate: Learning rate handed to the optimizer.
+            cond_stage_config: Mapping from conditioning name to a config that
+                ``instantiate_from_config`` can build; each entry must provide
+                ``cond_stage_key`` and ``conditioning_key``.
+            use_audiomae_linear: Must be False (asserted).
+            **kwargs: Accepted and ignored.
+        """
+        super().__init__()
+        assert use_audiomae_linear == False
+        self.learning_rate = base_learning_rate
+        self.cond_stage_config = cond_stage_config
+        self.use_audiomae_linear = use_audiomae_linear
+        self.mae_token_num = sequence_gen_length  # 4*4 pooling of the audiomae latent
+        self.cond_stage_models = nn.ModuleList([])
+        self.instantiate_cond_stage(cond_stage_config)
+        self.model = GPT2Model.from_pretrained("gpt2")
+        self.linear_clap = nn.Linear(512, 768)
+        if use_audiomae_linear:
+            # Unreachable while the assert above is in place.
+            # self.linear_audiomae = nn.Linear(768, 768) # TODO remove linear_audiomae
+            self.linear_audiomae = None  # TODO remove linear_audiomae
+        self.loss_fn = nn.MSELoss()
+        self.logger_save_dir = None
+        self.logger_exp_name = None
+        self.logger_exp_group_name = None
+        self.logger_version = None
+    def set_log_dir(self, save_dir, exp_group_name, exp_name):
+        """Record where the logger should write (directory, group and run name)."""
+        self.logger_save_dir = save_dir
+        self.logger_exp_group_name = exp_group_name
+        self.logger_exp_name = exp_name
+    def cfg_uncond(self, batch_size):
+        """Return the unconditional conditioning of every conditioning model.
+
+        The ``crossattn_audiomae_pooled`` entry is additionally exposed under
+        ``crossattn_clap_to_audiomae_feature``.
+        """
+        unconditional_conditioning = {}
+        for key in self.cond_stage_model_metadata:
+            model_idx = self.cond_stage_model_metadata[key]["model_idx"]
+            unconditional_conditioning[key] = self.cond_stage_models[
+                model_idx
+            ].get_unconditional_condition(batch_size)
+        assert (
+            "crossattn_audiomae_pooled" in unconditional_conditioning.keys()
+        ), "The module is not initialized with AudioMAE"
+        unconditional_conditioning[
+            "crossattn_clap_to_audiomae_feature"
+        ] = unconditional_conditioning["crossattn_audiomae_pooled"]
+        return unconditional_conditioning
+    def configure_optimizers(self):
+        """Return AdamW over GPT-2 and the projection, with a StepLR schedule."""
+        lr = float(self.learning_rate)
+        params = list(self.model.parameters()) + list(self.linear_clap.parameters())
+        if self.use_audiomae_linear:
+            params += list(self.linear_audiomae.parameters())
+        opt = torch.optim.AdamW(params, lr=lr)
+        scheduler = lr_scheduler.StepLR(opt, step_size=1, gamma=0.9)
+        return [opt], [scheduler]
+    def _teacher_forced_loss(self, cond_dict):
+        """Compute the teacher-forced regression loss for one batch.
+
+        The projected conditioning followed by the ground-truth target
+        embeddings is fed to GPT-2; the output at step ``t`` is compared with
+        the target at step ``t``, so the last output step is dropped.
+
+        Returns:
+            ``(loss, target)`` where ``target`` is the target embedding tensor.
+        """
+        input_embeds, target_embeds = (
+            cond_dict["film_clap_cond1"],
+            cond_dict["crossattn_audiomae_pooled"][0],
+        )
+        # Some times if the pooling factor is random, the length of crossattn_audiomae_pooled is not necessary 32, so need to calculate separately
+        if "crossattn_audiomae_pooled_44" in cond_dict.keys():
+            target_embeds = cond_dict["crossattn_audiomae_pooled_44"][0]
+        if self.use_audiomae_linear:
+            input_embeds = torch.cat(
+                [self.linear_clap(input_embeds), self.linear_audiomae(target_embeds)],
+                dim=1,
+            )
+        else:
+            input_embeds = torch.cat(
+                [self.linear_clap(input_embeds), target_embeds], dim=1
+            )
+        output_embeds = self.model(inputs_embeds=input_embeds)["last_hidden_state"]
+        output = output_embeds[:, :-1]
+        return self.loss_fn(output, target_embeds), target_embeds
+    def training_step(self, batch, batch_idx=None, cond_dict=None):
+        """Compute and log the teacher-forced loss for one training batch.
+
+        Args:
+            batch: Raw batch; only used when ``cond_dict`` is None.
+            batch_idx: Unused.
+            cond_dict: Optional precomputed conditioning dictionary.
+        """
+        if cond_dict is None:
+            cond_dict = self.get_input(batch)
+        loss, _ = self._teacher_forced_loss(cond_dict)
+        self.log(
+            "train/loss_clap_2_audiomae",
+            loss,
+            prog_bar=True,
+            logger=True,
+            on_step=True,
+            on_epoch=False,
+            sync_dist=True,
+        )
+        self.log(
+            "global_step_audiomae",
+            float(self.global_step),
+            prog_bar=True,
+            logger=True,
+            on_step=True,
+            on_epoch=False,
+            sync_dist=True,
+        )
+        return loss
+    def generate(self, batch, cond_dict=None, no_grad=False):
+        """Autoregressively generate ``mae_token_num`` embedding steps.
+
+        Args:
+            batch: Raw batch; only used when ``cond_dict`` is None.
+            cond_dict: Optional precomputed conditioning dictionary.
+            no_grad: If True, gradients are disabled during generation;
+                otherwise the ambient autograd mode is left untouched.
+
+        Returns:
+            ``(generated, cond_dict)`` where ``generated`` holds the generated
+            steps without the leading conditioning step.
+        """
+        if cond_dict is None:
+            cond_dict = self.get_input(batch)
+        input_embeds = cond_dict["film_clap_cond1"]
+        steps = self.mae_token_num
+        # Only ever switch gradients off; never force them on inside an outer
+        # no-grad context.
+        with torch.set_grad_enabled(torch.is_grad_enabled() and not no_grad):
+            model_input = self.linear_clap(input_embeds)
+            for _ in range(steps):
+                output = self.model(inputs_embeds=model_input)["last_hidden_state"]
+                model_input = torch.cat([model_input, output[:, -1:, :]], dim=1)
+        return model_input[:, 1:], cond_dict
+    # def on_validation_epoch_start(self) -> None:
+    #     # Use text as condition during validation
+    #     for key in self.cond_stage_model_metadata.keys():
+    #         metadata = self.cond_stage_model_metadata[key]
+    #         model_idx, cond_stage_key, conditioning_key = metadata["model_idx"], metadata["cond_stage_key"], metadata["conditioning_key"]
+    #         # If we use CLAP as condition, we might use audio for training, but we also must use text for evaluation
+    #         # if(isinstance(self.cond_stage_models[model_idx], CLAPAudioEmbeddingClassifierFreev2)):
+    #         #     self.cond_stage_model_metadata[key]["cond_stage_key_orig"] = self.cond_stage_model_metadata[key]["cond_stage_key"]
+    #         #     self.cond_stage_model_metadata[key]["embed_mode_orig"] = self.cond_stage_models[model_idx].embed_mode
+    #         #     print("Change the model original cond_keyand embed_mode %s, %s to text during evaluation" % (self.cond_stage_model_metadata[key]["cond_stage_key_orig"], self.cond_stage_model_metadata[key]["embed_mode_orig"]))
+    #         #     self.cond_stage_model_metadata[key]["cond_stage_key"] = "text"
+    #         #     self.cond_stage_models[model_idx].embed_mode = "text"
+    #     return super().on_validation_epoch_start()
+    def validation_step(self, batch, batch_idx):
+        """Log the teacher-forced loss and the free-running generation loss.
+
+        Returns:
+            Dict with ``loss`` (teacher-forced) and ``ar_gen_loss`` (loss of
+            the autoregressively generated sequence against the same target).
+        """
+        cond_dict = self.get_input(batch)
+        # cond_dict['film_clap_cond1']: [2,1,512]
+        # cond_dict['crossattn_audiomae_pooled']: [2, 128, 768]
+        loss, target = self._teacher_forced_loss(cond_dict)
+        self.log(
+            "val/loss",
+            loss,
+            prog_bar=True,
+            logger=True,
+            on_step=True,
+            sync_dist=True,
+            on_epoch=True,
+        )
+        # Reuse the conditioning computed above instead of running every
+        # conditioning model a second time inside generate().
+        generation_output, _ = self.generate(batch, cond_dict=cond_dict)
+        ar_gen_loss = self.loss_fn(generation_output, target)
+        self.log(
+            "val/ar_gen_loss",
+            ar_gen_loss,
+            prog_bar=True,
+            logger=True,
+            on_step=True,
+            sync_dist=True,
+            on_epoch=True,
+        )
+        return {"loss": loss, "ar_gen_loss": ar_gen_loss}
+    def get_input_item(self, batch, k):
+        """Return entry ``k`` of the batch after normalising the known fields.
+
+        ``fbank`` (from ``log_mel_spec``, with an added dimension at index 1),
+        ``stft`` and ``waveform`` are converted to contiguous float tensors and
+        ``text`` to a list; every other key is passed through unchanged. The
+        batch must contain all of ``fname``, ``text``, ``label_vector``,
+        ``waveform``, ``stft`` and ``log_mel_spec`` regardless of ``k``.
+        """
+        fname, text, label_indices, waveform, stft, fbank = (
+            batch["fname"],
+            batch["text"],
+            batch["label_vector"],
+            batch["waveform"],
+            batch["stft"],
+            batch["log_mel_spec"],
+        )
+        ret = {}
+        ret["fbank"] = (
+            fbank.unsqueeze(1).to(memory_format=torch.contiguous_format).float()
+        )
+        ret["stft"] = stft.to(memory_format=torch.contiguous_format).float()
+        # ret["clip_label"] = clip_label.to(memory_format=torch.contiguous_format).float()
+        ret["waveform"] = waveform.to(memory_format=torch.contiguous_format).float()
+        ret["text"] = list(text)
+        ret["fname"] = fname
+        for key in batch.keys():
+            if key not in ret.keys():
+                ret[key] = batch[key]
+        return ret[k]
+    def get_input(self, batch):
+        """Run every conditioning model on its input and collect the results.
+
+        Returns:
+            Dict mapping each conditioning name to that model's output.
+        """
+        cond_dict = {}
+        unconditional_cfg = False
+        for cond_model_key, metadata in self.cond_stage_model_metadata.items():
+            # The original data for conditioning
+            xc = self.get_input_item(batch, metadata["cond_stage_key"])
+            if isinstance(xc, torch.Tensor):
+                xc = xc.to(self.device)
+            cond_dict[cond_model_key] = self.get_learned_conditioning(
+                xc, key=cond_model_key, unconditional_cfg=unconditional_cfg
+            )
+        return cond_dict
+    def instantiate_cond_stage(self, config):
+        """Build one conditioning model per config entry and index its metadata."""
+        self.cond_stage_model_metadata = {}
+        for i, cond_model_key in enumerate(config.keys()):
+            model = instantiate_from_config(config[cond_model_key])
+            self.cond_stage_models.append(model)
+            self.cond_stage_model_metadata[cond_model_key] = {
+                "model_idx": i,
+                "cond_stage_key": config[cond_model_key]["cond_stage_key"],
+                "conditioning_key": config[cond_model_key]["conditioning_key"],
+            }
+    def get_learned_conditioning(self, c, key, unconditional_cfg):
+        """Encode ``c`` with the conditioning model registered under ``key``.
+
+        With ``unconditional_cfg`` set, the model's unconditional condition for
+        the batch size of ``c`` (a tensor or a list) is returned instead.
+
+        Raises:
+            NotImplementedError: If ``unconditional_cfg`` is set and ``c`` is
+                neither a tensor nor a list.
+        """
+        assert key in self.cond_stage_model_metadata.keys()
+        cond_model = self.cond_stage_models[
+            self.cond_stage_model_metadata[key]["model_idx"]
+        ]
+        # Classifier-free guidance
+        if not unconditional_cfg:
+            return cond_model(c)
+        if isinstance(c, torch.Tensor):
+            batchsize = c.size(0)
+        elif isinstance(c, list):
+            batchsize = len(c)
+        else:
+            raise NotImplementedError()
+        return cond_model.get_unconditional_condition(batchsize)

qa_mdt/audioldm_train/modules/audiomae/sequence_gen/sequence_input.py ADDED Viewed

	@@ -0,0 +1,737 @@

+import torch
+import torch.nn as nn
+import numpy as np
+import pytorch_lightning as pl
+from qa_mdt.audioldm_train.utilities.model_util import (
+    exists,
+    default,
+    mean_flat,
+    count_params,
+    instantiate_from_config,
+)
+from torch.optim import *
+from transformers import GPT2Config, GPT2Model, GPTJConfig, GPTJModel
+import torch.optim.lr_scheduler as lr_scheduler
+class Sequence2AudioMAE(pl.LightningModule):
+    def __init__(
+        self,
+        base_learning_rate,
+        sequence_gen_length,
+        sequence_input_key,
+        sequence_input_embed_dim,
+        cond_stage_config,
+        optimizer_type="AdamW",
+        use_warmup=True,
+        use_ar_gen_loss=False,
+        use_audiomae_linear=False,
+        target_tokens_mask_ratio=0.0,
+        random_mask_ratio=False,
+        **kwargs
+    ):
+        """Create the conditioning models, input projections and GPT-2 backbone.
+
+        Args:
+            base_learning_rate: Stored as ``learning_rate`` for the optimizer
+                and the warm-up schedule.
+            sequence_gen_length: Stored as ``mae_token_num``; the number of
+                target steps the training step expects.
+            sequence_input_key: Iterable of conditioning names that are
+                concatenated, in order, into the input sequence.
+            sequence_input_embed_dim: Iterable of input sizes; one
+                ``Linear(dim, 768)`` projection is created per entry.
+            cond_stage_config: Passed to ``instantiate_cond_stage``.
+            optimizer_type: Name of the optimizer class built in
+                ``configure_optimizers``.
+            use_warmup: If True, ``training_step`` calls ``warmup_step``.
+            use_ar_gen_loss: If True, ``training_step`` adds the loss from
+                ``calculate_ahead_k_step_loss``.
+            use_audiomae_linear: Must be False (asserted).
+            target_tokens_mask_ratio: Mask ratio used by
+                ``mask_target_sequence``.
+            random_mask_ratio: If True, ``mask_target_sequence`` scales the
+                mask ratio by a fresh random factor on each call.
+            **kwargs: Accepted and not used in this constructor.
+        """
+        super().__init__()
+        assert use_audiomae_linear == False
+        self.random_mask_ratio = random_mask_ratio
+        self.learning_rate = base_learning_rate
+        self.cond_stage_config = cond_stage_config
+        self.use_audiomae_linear = use_audiomae_linear
+        self.optimizer_type = optimizer_type
+        self.use_warmup = use_warmup
+        self.use_ar_gen_loss = use_ar_gen_loss
+        # Even though the LDM can be conditioned on multiple pooling rates,
+        # our model always predicts the highest pooling rate
+        self.mae_token_num = sequence_gen_length
+        self.sequence_input_key = sequence_input_key
+        self.sequence_input_embed_dim = sequence_input_embed_dim
+        self.target_tokens_mask_ratio = target_tokens_mask_ratio
+        # Learned start / end marker embeddings, looked up by the position of
+        # the conditioning key in ``sequence_input_key`` (see add_sos_eos_tokens).
+        self.start_of_sequence_tokens = nn.Embedding(32, 768)
+        self.end_of_sequence_tokens = nn.Embedding(32, 768)
+        self.input_sequence_embed_linear = nn.ModuleList([])
+        # Filled in lazily by warmup_step on its first call.
+        self.initial_learning_rate = None
+        # One projection to the 768-dim model width per conditioning input.
+        for dim in self.sequence_input_embed_dim:
+            self.input_sequence_embed_linear.append(nn.Linear(dim, 768))
+        self.cond_stage_models = nn.ModuleList([])
+        self.instantiate_cond_stage(cond_stage_config)
+        self.initialize_param_check_toolkit()
+        # Step counter owned by this module; used to throttle the progress print.
+        self.private_training_step = 0
+        # configuration = GPT2Config(n_layer=1) # TODO
+        # self.model=GPT2Model(configuration)
+        ###################
+        # self.model=nn.Linear(768,768, bias=False) # TODO change the model
+        # with torch.no_grad():
+        #     self.model.weight.copy_(torch.eye(768))
+        ###################
+        self.model = GPT2Model.from_pretrained("gpt2")
+        ###################
+        # self.model = nn.LSTM(input_size=768, hidden_size=768, num_layers=1,bias=False) # TODO
+        # self.loss_fn = nn.MSELoss()
+        self.loss_fn = nn.L1Loss()
+        self.logger_save_dir = None
+        self.logger_exp_name = None
+        self.logger_exp_group_name = None
+        self.logger_version = None
+    def set_log_dir(self, save_dir, exp_group_name, exp_name):
+        """Remember the logging directory, experiment group and experiment name."""
+        self.logger_exp_name = exp_name
+        self.logger_exp_group_name = exp_group_name
+        self.logger_save_dir = save_dir
+    def cfg_uncond(self, batch_size):
+        """Collect the unconditional condition of every conditioning model.
+
+        The ``crossattn_audiomae_pooled`` entry must exist; it is also exposed
+        under ``crossattn_clap_to_audiomae_feature``.
+        """
+        uncond = {}
+        for name, metadata in self.cond_stage_model_metadata.items():
+            cond_model = self.cond_stage_models[metadata["model_idx"]]
+            uncond[name] = cond_model.get_unconditional_condition(batch_size)
+        assert (
+            "crossattn_audiomae_pooled" in uncond
+        ), "The module is not initialized with AudioMAE"
+        uncond["crossattn_clap_to_audiomae_feature"] = uncond[
+            "crossattn_audiomae_pooled"
+        ]
+        return uncond
+    def configure_optimizers(self):
+        """Build the optimizer named by ``optimizer_type`` and a StepLR schedule.
+
+        The optimizer class is looked up by name in ``torch.optim`` rather than
+        passed through ``eval``, so an arbitrary configuration string can no
+        longer execute code. A dotted name such as ``torch.optim.AdamW`` is
+        accepted; only its last component is used.
+
+        Returns:
+            ``([optimizer], [scheduler])`` as expected by PyTorch Lightning.
+
+        Raises:
+            ValueError: If ``optimizer_type`` does not name an optimizer class
+                in ``torch.optim``.
+        """
+        lr = float(self.learning_rate)
+        params = list(self.parameters())
+        optimizer_name = str(self.optimizer_type).strip().rsplit(".", 1)[-1]
+        optimizer_cls = getattr(torch.optim, optimizer_name, None)
+        if not (
+            isinstance(optimizer_cls, type)
+            and issubclass(optimizer_cls, torch.optim.Optimizer)
+        ):
+            raise ValueError(
+                "Unknown optimizer type %r; expected the name of a class in torch.optim"
+                % (self.optimizer_type,)
+            )
+        opt = optimizer_cls(params, lr=lr)
+        scheduler = lr_scheduler.StepLR(opt, step_size=10, gamma=0.8)
+        return [opt], [scheduler]
+    def add_sos_eos_tokens(self, _id, sequence, attn_mask):
+        """Wrap ``sequence`` in the learned start / end embeddings for ``_id``.
+
+        Args:
+            _id: Index used to look up the start and end embeddings.
+            sequence: Embedding sequence; dimension 1 is the time axis.
+            attn_mask: Attention mask matching ``sequence`` along dims 0 and 1.
+
+        Returns:
+            ``(new_sequence, new_attn_mask)``, each two steps longer than the
+            inputs; the two added mask steps are ones.
+        """
+        device = sequence.device
+        n_batch = sequence.size(0)
+        token_index = torch.tensor([_id], device=device)
+        # Add two more tokens in the sequence
+        leading = self.start_of_sequence_tokens(token_index).expand(n_batch, 1, -1)
+        trailing = self.end_of_sequence_tokens(token_index).expand(n_batch, 1, -1)
+        extended_sequence = torch.cat([leading, sequence, trailing], dim=1)
+        # Add two more steps to attn mask
+        ones_step = torch.ones((n_batch, 1), device=device)
+        extended_mask = torch.cat([ones_step, attn_mask, ones_step], dim=1)
+        return extended_sequence, extended_mask
+    def truncate_sequence_and_mask(self, sequence, mask, max_len=512):
+        """Clip ``sequence`` and ``mask`` to at most ``max_len`` steps along dim 1.
+
+        A message is printed whenever clipping actually happens.
+        """
+        current_len = sequence.size(1)
+        if current_len <= max_len:
+            return sequence, mask
+        print(
+            "The input sequence length to GPT-2 model is too long:",
+            current_len,
+        )
+        return sequence[:, :max_len], mask[:, :max_len]
+    def get_input_sequence_and_mask(self, cond_dict):
+        """Assemble the conditioning input sequence and its attention mask.
+
+        For every key in ``sequence_input_key`` the conditioning is projected
+        by its own linear layer, wrapped in start / end tokens, and appended
+        along the time axis. A conditioning given as a list is read as
+        ``[embedding, attention_mask]``; a bare tensor gets an all-ones mask.
+        The result is truncated to ``1024 - mae_token_num`` steps.
+
+        Returns:
+            ``(input_embeds, input_embeds_attn_mask, cond_sequence_end_time_idx)``
+            where the last item is the length of the (truncated) sequence.
+        """
+        embed_chunks = []
+        mask_chunks = []
+        for position, name in enumerate(self.sequence_input_key):
+            assert name in cond_dict.keys(), (
+                "Invalid sequence key %s" % name
+            )
+            conditioning = cond_dict[name]
+            if isinstance(conditioning, list):
+                assert (
+                    len(conditioning) == 2
+                ), "The crossattn returned list should have length 2, including embed and attn_mask"
+                chunk_embeds, chunk_mask = conditioning
+                chunk_embeds = self.input_sequence_embed_linear[position](
+                    chunk_embeds
+                )
+            else:
+                assert isinstance(conditioning, torch.Tensor)
+                chunk_embeds = self.input_sequence_embed_linear[position](
+                    conditioning
+                )
+                chunk_mask = torch.ones(
+                    (chunk_embeds.size(0), chunk_embeds.size(1))
+                ).to(chunk_embeds.device)
+            chunk_embeds, chunk_mask = self.add_sos_eos_tokens(
+                position, chunk_embeds, chunk_mask
+            )
+            embed_chunks.append(chunk_embeds)
+            mask_chunks.append(chunk_mask)
+        assert embed_chunks and mask_chunks
+        if len(embed_chunks) == 1:
+            input_embeds, input_embeds_attn_mask = embed_chunks[0], mask_chunks[0]
+        else:
+            # The 1-st dimension is time steps
+            input_embeds = torch.cat(embed_chunks, dim=1)
+            input_embeds_attn_mask = torch.cat(mask_chunks, dim=1)
+        input_embeds, input_embeds_attn_mask = self.truncate_sequence_and_mask(
+            input_embeds, input_embeds_attn_mask, int(1024 - self.mae_token_num)
+        )
+        # The index that we start to collect the output embeds
+        cond_sequence_end_time_idx = input_embeds.size(1)
+        return input_embeds, input_embeds_attn_mask, cond_sequence_end_time_idx
+    def warmup_step(self):
+        """Set the learning rate of the first parameter group for this step.
+
+        The rate grows linearly from 0 to the initial learning rate over the
+        first 1000 global steps and is pinned to the initial rate afterwards.
+        """
+        if self.initial_learning_rate is None:
+            self.initial_learning_rate = float(self.learning_rate)
+        step = self.global_step
+        if step > 1000:
+            # TODO set learning rate here
+            new_lr = self.initial_learning_rate
+        else:
+            if step == 0:
+                print(
+                    "Warming up learning rate start with %s"
+                    % self.initial_learning_rate
+                )
+            new_lr = (step / 1000) * self.initial_learning_rate
+        # Only the first parameter group
+        self.trainer.optimizers[0].param_groups[0]["lr"] = new_lr
+    def mask_target_sequence(self, target_embeds, target_embeds_attn_mask):
+        """Randomly zero out time steps of the target and of its attention mask.
+
+        Nothing is masked unless ``target_tokens_mask_ratio`` exceeds 1e-4. With
+        ``random_mask_ratio`` set, the ratio is scaled by a random factor in
+        [0, 1) on each call.
+
+        Returns:
+            ``(target_embeds, target_embeds_attn_mask, time_seq_mask)`` where
+            ``time_seq_mask`` is the boolean keep-mask, or None if no masking
+            was applied.
+        """
+        if not (self.target_tokens_mask_ratio > 1e-4):
+            return target_embeds, target_embeds_attn_mask, None
+        n_batch, _, _ = target_embeds.size()
+        _, n_steps = target_embeds_attn_mask.size()
+        # Generate random mask
+        ratio = (
+            torch.rand(1).item() * self.target_tokens_mask_ratio
+            if self.random_mask_ratio
+            else self.target_tokens_mask_ratio
+        )
+        keep = torch.rand((n_batch, n_steps)) > ratio
+        keep = keep.to(target_embeds.device)
+        # Mask the target embedding
+        masked_embeds = target_embeds * keep.unsqueeze(-1)
+        masked_attn = target_embeds_attn_mask * keep
+        return masked_embeds, masked_attn, keep
+    def training_step(self, batch, batch_idx=None, cond_dict=None, return_output=False):
+        """Run one teacher-forced training step and log its metrics.
+
+        The conditioning sequence followed by the ground-truth target
+        embeddings is fed to GPT-2; the outputs aligned with the target steps
+        are compared with the target using ``loss_fn``.
+
+        Args:
+            batch: Raw batch; only used when ``cond_dict`` is None (and by the
+                optional autoregressive loss).
+            batch_idx: Forwarded to ``calculate_ahead_k_step_loss``.
+            cond_dict: Optional precomputed conditioning dictionary.
+            return_output: If True, also return the predicted embeddings.
+
+        Returns:
+            ``loss + ar_gen_loss``, or ``(loss + ar_gen_loss, output)`` when
+            ``return_output`` is set. Without ``use_ar_gen_loss`` the second
+            term equals the first.
+        """
+        # cond_dict['film_clap_cond1']: [2,1,512]
+        # cond_dict['crossattn_audiomae_pooled']: [2, 128, 768]
+        if self.use_warmup:
+            self.warmup_step()
+        if cond_dict is None:
+            cond_dict = self.get_input(batch)
+        target_embeds, target_embeds_attn_mask = (
+            cond_dict["crossattn_audiomae_pooled"][0],
+            cond_dict["crossattn_audiomae_pooled"][1],
+        )
+        (
+            input_embeds,
+            input_embeds_attn_mask,
+            cond_sequence_end_time_idx,
+        ) = self.get_input_sequence_and_mask(cond_dict)
+        # Some times if the pooling factor is random, the length of crossattn_audiomae_pooled is not necessary 32, so need to calculate separately
+        # NOTE(review): the attention mask is still the one of
+        # "crossattn_audiomae_pooled" -- confirm both entries have equal length.
+        if "crossattn_audiomae_pooled_44" in cond_dict.keys():
+            target_embeds = cond_dict["crossattn_audiomae_pooled_44"][0]
+        final_input_embeds = torch.cat([input_embeds, target_embeds], dim=1)
+        final_input_embeds_attn_mask = torch.cat(
+            [input_embeds_attn_mask, target_embeds_attn_mask], dim=1
+        )
+        ########################### GPT-2
+        output_embeds = self.model(
+            inputs_embeds=final_input_embeds,
+            attention_mask=final_input_embeds_attn_mask,
+        )["last_hidden_state"]
+        target = target_embeds
+        # The output at step t is the prediction for step t + 1, hence the
+        # slice starts one step before the end of the conditioning sequence.
+        output = output_embeds[:, cond_sequence_end_time_idx - 1 : -1]
+        assert target.size(1) == self.mae_token_num
+        loss = self.loss_fn(output, target)
+        if self.use_ar_gen_loss:
+            ar_gen_loss = self.calculate_ahead_k_step_loss(batch, batch_idx, cond_dict)
+        else:
+            ar_gen_loss = loss
+        if self.private_training_step % 500 == 0:
+            print(
+                "AudioMAE prediction module:", "loss", loss, "ar_gen_loss", ar_gen_loss
+            )
+        step_log_kwargs = dict(
+            prog_bar=True,
+            logger=True,
+            on_step=True,
+            on_epoch=False,
+            sync_dist=True,
+        )
+        try:
+            learning_rate = self.trainer.optimizers[0].param_groups[0]["lr"]
+            self.log("train/lr_audiomae_pred", learning_rate, **step_log_kwargs)
+        except Exception:
+            # Best effort: skip the learning-rate metric when it cannot be read
+            # or logged. KeyboardInterrupt / SystemExit are no longer swallowed.
+            pass
+        self.log("train/loss_clap_2_audiomae", loss, **step_log_kwargs)
+        self.log("train/loss_ar_gen_loss", ar_gen_loss, **step_log_kwargs)
+        self.log(
+            "global_step_audiomae", float(self.global_step), **step_log_kwargs
+        )
+        self.private_training_step += 1
+        if return_output:
+            return loss + ar_gen_loss, output
+        else:
+            return loss + ar_gen_loss
+    def calculate_ahead_k_step_loss(self, batch, batch_idx=None, cond_dict=None):
+        """Compute a loss on a short autoregressive rollout of ``self.model``.
+
+        A random window of the target sequence is selected. Everything before
+        the window is appended to the conditioning input as ground truth, and
+        the window itself is generated one step at a time, each step feeding
+        the model's own last output back in as the next input.
+
+        Args:
+            batch: Input batch; only used to build ``cond_dict`` when that is None.
+            batch_idx: Not used by this method.
+            cond_dict: Optional conditioning dict as returned by ``self.get_input``.
+                Must contain ``"crossattn_audiomae_pooled"`` as an
+                (embeddings, attention mask) pair.
+
+        Returns:
+            ``self.loss_fn`` applied to the generated window and the matching
+            slice of the target embeddings.
+
+        Raises:
+            AssertionError: If any target attention-mask entry is below 0.1.
+        """
+        if cond_dict is None:
+            cond_dict = self.get_input(batch)
+        target_embeds, target_embeds_attn_mask = (
+            cond_dict["crossattn_audiomae_pooled"][0],
+            cond_dict["crossattn_audiomae_pooled"][1],
+        )
+        # The rollout below appends all-ones masks, so a partially masked target
+        # is rejected up front.
+        assert (
+            torch.sum(target_embeds_attn_mask < 0.1) < 1
+        ), "This function only works for AudioMAE prediction, which should have all one atten_mask"
+        (
+            input_embeds,
+            input_embeds_attn_mask,
+            cond_sequence_end_time_idx,
+        ) = self.get_input_sequence_and_mask(cond_dict)
+        target_total_time_steps = target_embeds.size(1)
+        # Window length: a random integer in [0, 8], capped by the target length,
+        # then raised to at least 2.
+        # NOTE(review): if the target has fewer than 2 steps the window is still 2
+        # and the target slice below will be shorter than the generation - confirm
+        # this cannot happen in practice.
+        steps = min(round(torch.rand(1).item() * 8), target_total_time_steps)
+        if steps < 2:
+            steps = 2
+        # Random window start, shifted down by one and clamped at 0.
+        start_idx = max(
+            0, round(torch.rand(1).item() * (target_total_time_steps - steps)) - 1
+        )
+        model_input = input_embeds
+        model_input_mask = input_embeds_attn_mask
+        # Ground truth for the window that will be generated.
+        target_embeds_ar_gen = target_embeds[:, start_idx : start_idx + steps, :]
+        generation = []
+        if start_idx > 0:
+            # Teacher-force the target steps that precede the window.
+            model_input = torch.cat(
+                [input_embeds, target_embeds[:, :start_idx, :]], dim=1
+            )
+            attention_mask_known_steps = torch.ones(
+                (model_input_mask.size(0), start_idx)
+            ).to(model_input.device)
+            model_input_mask = torch.cat(
+                [input_embeds_attn_mask, attention_mask_known_steps], dim=1
+            )
+        for _ in range(steps):
+            output = self.model(
+                inputs_embeds=model_input, attention_mask=model_input_mask
+            )["last_hidden_state"]
+            # Update the model input
+            # The last output position is the prediction for the next step; it is
+            # both collected for the loss and appended to the input.
+            generation.append(output[:, -1:, :])
+            model_input = torch.cat([model_input, output[:, -1:, :]], dim=1)
+            # Update the attention mask
+            attention_mask_new_step = torch.ones((model_input_mask.size(0), 1)).to(
+                model_input.device
+            )
+            model_input_mask = torch.cat(
+                [model_input_mask, attention_mask_new_step], dim=1
+            )
+        generation = torch.cat(generation, dim=1)
+        return self.loss_fn(generation, target_embeds_ar_gen)
+    def generate_partial(self, batch, cond_dict=None, no_grad=False):
+        if cond_dict is None:
+            cond_dict = self.get_input(batch)
+        print("Generate partially prompted audio with in-context learning")
+        # self.model.train()
+        # assert self.model.training==True
+        target_embeds, target_embeds_attn_mask = (
+            cond_dict["crossattn_audiomae_pooled"][0],
+            cond_dict["crossattn_audiomae_pooled"][1],
+        )
+        target_time_steps = target_embeds.size(1)
+        (
+            input_embeds,
+            input_embeds_attn_mask,
+            cond_sequence_end_time_idx,
+        ) = self.get_input_sequence_and_mask(cond_dict)
+        model_input = torch.cat(
+            [input_embeds, target_embeds[:, : target_time_steps // 4, :]], dim=1
+        )
+        model_input_mask = torch.cat(
+            [
+                input_embeds_attn_mask,
+                target_embeds_attn_mask[:, : target_time_steps // 4],
+            ],
+            dim=1,
+        )
+        steps = self.mae_token_num
+        for _ in range(3 * steps // 4):
+            output = self.model(
+                inputs_embeds=model_input, attention_mask=model_input_mask
+            )["last_hidden_state"]
+            # Update the model input
+            model_input = torch.cat([model_input, output[:, -1:, :]], dim=1)
+            # Update the attention mask
+            attention_mask_new_step = torch.ones((model_input_mask.size(0), 1)).to(
+                model_input.device
+            )
+            model_input_mask = torch.cat(
+                [model_input_mask, attention_mask_new_step], dim=1
+            )
+        output = model_input[:, cond_sequence_end_time_idx:]
+        return output, cond_dict
+    def generate(self, batch, cond_dict=None, no_grad=False):
+        if cond_dict is None:
+            cond_dict = self.get_input(batch)
+        # self.model.train()
+        # print("!!!!!!!!!!!!!train")
+        (
+            input_embeds,
+            input_embeds_attn_mask,
+            cond_sequence_end_time_idx,
+        ) = self.get_input_sequence_and_mask(cond_dict)
+        model_input = input_embeds
+        model_input_mask = input_embeds_attn_mask
+        steps = self.mae_token_num
+        for _ in range(steps):
+            output = self.model(
+                inputs_embeds=model_input, attention_mask=model_input_mask
+            )["last_hidden_state"]
+            # Update the model input
+            model_input = torch.cat([model_input, output[:, -1:, :]], dim=1)
+            # Update the attention mask
+            attention_mask_new_step = torch.ones((model_input_mask.size(0), 1)).to(
+                model_input.device
+            )
+            model_input_mask = torch.cat(
+                [model_input_mask, attention_mask_new_step], dim=1
+            )
+        return model_input[:, cond_sequence_end_time_idx:], cond_dict
+    # def on_validation_epoch_start(self) -> None:
+    #     # Use text as condition during validation
+    #     for key in self.cond_stage_model_metadata.keys():
+    #         metadata = self.cond_stage_model_metadata[key]
+    #         model_idx, cond_stage_key, conditioning_key = metadata["model_idx"], metadata["cond_stage_key"], metadata["conditioning_key"]
+    #         # If we use CLAP as condition, we might use audio for training, but we also must use text for evaluation
+    #         # if(isinstance(self.cond_stage_models[model_idx], CLAPAudioEmbeddingClassifierFreev2)):
+    #         #     self.cond_stage_model_metadata[key]["cond_stage_key_orig"] = self.cond_stage_model_metadata[key]["cond_stage_key"]
+    #         #     self.cond_stage_model_metadata[key]["embed_mode_orig"] = self.cond_stage_models[model_idx].embed_mode
+    #         #     print("Change the model original cond_keyand embed_mode %s, %s to text during evaluation" % (self.cond_stage_model_metadata[key]["cond_stage_key_orig"], self.cond_stage_model_metadata[key]["embed_mode_orig"]))
+    #         #     self.cond_stage_model_metadata[key]["cond_stage_key"] = "text"
+    #         #     self.cond_stage_models[model_idx].embed_mode = "text"
+    #     return super().on_validation_epoch_start()
+    def validation_step(self, batch, batch_idx):
+        """Log the teacher-forced loss and the free-running generation loss.
+
+        Args:
+            batch: Validation batch passed to ``self.get_input``.
+            batch_idx: Not used by this method.
+
+        Returns:
+            Dict with ``"loss"`` (teacher-forced) and ``"ar_gen_loss"``
+            (loss of ``self.generate`` output against the same target).
+        """
+        cond_dict = self.get_input(batch)
+        # cond_dict['film_clap_cond1']: [2,1,512]
+        # cond_dict['crossattn_audiomae_pooled']: [2, 128, 768]
+        target_embeds, target_embeds_attn_mask = (
+            cond_dict["crossattn_audiomae_pooled"][0],
+            cond_dict["crossattn_audiomae_pooled"][1],
+        )
+        (
+            input_embeds,
+            input_embeds_attn_mask,
+            cond_sequence_end_time_idx,
+        ) = self.get_input_sequence_and_mask(cond_dict)
+        # When the pooling factor is random, the length of crossattn_audiomae_pooled
+        # is not necessarily 32, so a separately computed target is used if present.
+        # NOTE(review): only the embeddings are replaced here; the attention mask
+        # still comes from "crossattn_audiomae_pooled" - confirm the lengths match.
+        if "crossattn_audiomae_pooled_44" in cond_dict.keys():
+            target_embeds = cond_dict["crossattn_audiomae_pooled_44"][0]
+        # Teacher forcing: the model sees the conditioning followed by the target.
+        final_input_embeds = torch.cat([input_embeds, target_embeds], dim=1)
+        final_input_embeds_attn_mask = torch.cat(
+            [input_embeds_attn_mask, target_embeds_attn_mask], dim=1
+        )
+        output_embeds = self.model(
+            inputs_embeds=final_input_embeds,
+            attention_mask=final_input_embeds_attn_mask,
+        )["last_hidden_state"]
+        target = target_embeds
+        # Shift by one: the output at position i is compared with the target at
+        # position i + 1, starting from the last conditioning position.
+        output = output_embeds[:, cond_sequence_end_time_idx - 1 : -1]
+        loss = self.loss_fn(output, target)
+        self.log(
+            "val/loss",
+            loss,
+            prog_bar=True,
+            logger=True,
+            on_step=True,
+            sync_dist=True,
+            on_epoch=True,
+        )
+        # generate() is called without cond_dict, so it calls self.get_input(batch)
+        # again instead of reusing the conditioning computed above.
+        generation_output, _ = self.generate(batch)
+        ar_gen_loss = self.loss_fn(generation_output, target)
+        self.log(
+            "val/ar_gen_loss",
+            ar_gen_loss,
+            prog_bar=True,
+            logger=True,
+            on_step=True,
+            sync_dist=True,
+            on_epoch=True,
+        )
+        return {"loss": loss, "ar_gen_loss": ar_gen_loss}
+    def get_input_item(self, batch, k):
+        fname, text, label_indices, waveform, stft, fbank = (
+            batch["fname"],
+            batch["text"],
+            batch["label_vector"],
+            batch["waveform"],
+            batch["stft"],
+            batch["log_mel_spec"],
+        )
+        ret = {}
+        ret["fbank"] = (
+            fbank.unsqueeze(1).to(memory_format=torch.contiguous_format).float()
+        )
+        ret["stft"] = stft.to(memory_format=torch.contiguous_format).float()
+        # ret["clip_label"] = clip_label.to(memory_format=torch.contiguous_format).float()
+        ret["waveform"] = waveform.to(memory_format=torch.contiguous_format).float()
+        ret["text"] = list(text)
+        ret["fname"] = fname
+        for key in batch.keys():
+            if key not in ret.keys():
+                ret[key] = batch[key]
+        return ret[k]
+    def get_input(self, batch):
+        cond_dict = {}
+        if len(self.cond_stage_model_metadata.keys()) > 0:
+            unconditional_cfg = False
+            for cond_model_key in self.cond_stage_model_metadata.keys():
+                cond_stage_key = self.cond_stage_model_metadata[cond_model_key][
+                    "cond_stage_key"
+                ]
+                # if(not self.training):
+                #     if(isinstance(self.cond_stage_models[self.cond_stage_model_metadata[cond_model_key]["model_idx"]], CLAPAudioEmbeddingClassifierFreev2)):
+                #         assert cond_stage_key == "text" # CLAP model should use text for evaluation
+                # The original data for conditioning
+                xc = self.get_input_item(batch, cond_stage_key)
+                if type(xc) == torch.Tensor:
+                    xc = xc.to(self.device)
+                c = self.get_learned_conditioning(
+                    xc, key=cond_model_key, unconditional_cfg=unconditional_cfg
+                )
+                cond_dict[cond_model_key] = c
+        return cond_dict
+    def instantiate_cond_stage(self, config):
+        self.cond_stage_model_metadata = {}
+        for i, cond_model_key in enumerate(config.keys()):
+            model = instantiate_from_config(config[cond_model_key])
+            self.cond_stage_models.append(model)
+            self.cond_stage_model_metadata[cond_model_key] = {
+                "model_idx": i,
+                "cond_stage_key": config[cond_model_key]["cond_stage_key"],
+                "conditioning_key": config[cond_model_key]["conditioning_key"],
+            }
+    def get_learned_conditioning(self, c, key, unconditional_cfg):
+        assert key in self.cond_stage_model_metadata.keys()
+        # Classifier-free guidance
+        if not unconditional_cfg:
+            c = self.cond_stage_models[
+                self.cond_stage_model_metadata[key]["model_idx"]
+            ](c)
+        else:
+            if isinstance(c, torch.Tensor):
+                batchsize = c.size(0)
+            elif isinstance(c, list):
+                batchsize = len(c)
+            else:
+                raise NotImplementedError()
+            c = self.cond_stage_models[
+                self.cond_stage_model_metadata[key]["model_idx"]
+            ].get_unconditional_condition(batchsize)
+        return c
+    def initialize_param_check_toolkit(self):
+        self.tracked_steps = 0
+        self.param_dict = {}
+    def statistic_require_grad_tensor_number(self, module, name=None):
+        requires_grad_num = 0
+        total_num = 0
+        require_grad_tensor = None
+        for p in module.parameters():
+            if p.requires_grad:
+                requires_grad_num += 1
+                if require_grad_tensor is None:
+                    require_grad_tensor = p
+            total_num += 1
+        print(
+            "Module: [%s] have %s trainable parameters out of %s total parameters (%.2f)"
+            % (name, requires_grad_num, total_num, requires_grad_num / total_num)
+        )
+        return require_grad_tensor
+    def check_module_param_update(self):
+        if self.tracked_steps == 0:
+            print("Sequence2AudioMAE")
+            for name, module in self.named_children():
+                try:
+                    require_grad_tensor = self.statistic_require_grad_tensor_number(
+                        module, name=name
+                    )
+                    if require_grad_tensor is not None:
+                        self.param_dict[name] = require_grad_tensor.clone()
+                    else:
+                        print("==> %s does not requires grad" % name)
+                except Exception as e:
+                    print("%s does not have trainable parameters: %s" % (name, e))
+                    continue
+        if self.tracked_steps % 5000 == 0:
+            print("Sequence2AudioMAE")
+            for name, module in self.named_children():
+                try:
+                    require_grad_tensor = self.statistic_require_grad_tensor_number(
+                        module, name=name
+                    )
+                    if require_grad_tensor is not None:
+                        print(
+                            "===> Param diff %s: %s; Size: %s"
+                            % (
+                                name,
+                                torch.sum(
+                                    torch.abs(
+                                        self.param_dict[name] - require_grad_tensor
+                                    )
+                                ),
+                                require_grad_tensor.size(),
+                            )
+                        )
+                    else:
+                        print("%s does not requires grad" % name)
+                except Exception as e:
+                    print("%s does not have trainable parameters: %s" % (name, e))
+                    continue
+        self.tracked_steps += 1

qa_mdt/audioldm_train/modules/audiomae/util/__pycache__/patch_embed.cpython-310.pyc ADDED Viewed

Binary file (3.42 kB). View file

qa_mdt/audioldm_train/modules/audiomae/util/__pycache__/pos_embed.cpython-310.pyc ADDED Viewed

Binary file (4.33 kB). View file

qa_mdt/audioldm_train/modules/audiomae/util/crop.py ADDED Viewed

	@@ -0,0 +1,43 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import math
+import torch
+from torchvision import transforms
+from torchvision.transforms import functional as F
+class RandomResizedCrop(transforms.RandomResizedCrop):
+    """
+    RandomResizedCrop for matching TF/TPU implementation: no for-loop is used.
+    This may lead to results different with torchvision's version.
+    Following BYOL's TF code:
+    https://github.com/deepmind/deepmind-research/blob/master/byol/utils/dataset.py#L206
+    """
+    @staticmethod
+    def get_params(img, scale, ratio):
+        width, height = F._get_image_size(img)
+        area = height * width
+        target_area = area * torch.empty(1).uniform_(scale[0], scale[1]).item()
+        log_ratio = torch.log(torch.tensor(ratio))
+        aspect_ratio = torch.exp(
+            torch.empty(1).uniform_(log_ratio[0], log_ratio[1])
+        ).item()
+        w = int(round(math.sqrt(target_area * aspect_ratio)))
+        h = int(round(math.sqrt(target_area / aspect_ratio)))
+        w = min(w, width)
+        h = min(h, height)
+        i = torch.randint(0, height - h + 1, size=(1,)).item()
+        j = torch.randint(0, width - w + 1, size=(1,)).item()
+        return i, j, h, w

qa_mdt/audioldm_train/modules/audiomae/util/datasets.py ADDED Viewed

	@@ -0,0 +1,67 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# --------------------------------------------------------
+# References:
+# DeiT: https://github.com/facebookresearch/deit
+# --------------------------------------------------------
+import os
+import PIL
+from torchvision import datasets, transforms
+from timm.data import create_transform
+from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+def build_dataset(is_train, args):
+    transform = build_transform(is_train, args)
+    root = os.path.join(args.data_path, "train" if is_train else "val")
+    dataset = datasets.ImageFolder(root, transform=transform)
+    print(dataset)
+    return dataset
+def build_transform(is_train, args):
+    mean = IMAGENET_DEFAULT_MEAN
+    std = IMAGENET_DEFAULT_STD
+    # train transform
+    if is_train:
+        # this should always dispatch to transforms_imagenet_train
+        transform = create_transform(
+            input_size=args.input_size,
+            is_training=True,
+            color_jitter=args.color_jitter,
+            auto_augment=args.aa,
+            interpolation="bicubic",
+            re_prob=args.reprob,
+            re_mode=args.remode,
+            re_count=args.recount,
+            mean=mean,
+            std=std,
+        )
+        return transform
+    # eval transform
+    t = []
+    if args.input_size <= 224:
+        crop_pct = 224 / 256
+    else:
+        crop_pct = 1.0
+    size = int(args.input_size / crop_pct)
+    t.append(
+        transforms.Resize(
+            size, interpolation=PIL.Image.BICUBIC
+        ),  # to maintain same ratio w.r.t. 224 images
+    )
+    t.append(transforms.CenterCrop(args.input_size))
+    t.append(transforms.ToTensor())
+    t.append(transforms.Normalize(mean, std))
+    return transforms.Compose(t)

qa_mdt/audioldm_train/modules/audiomae/util/lars.py ADDED Viewed

	@@ -0,0 +1,60 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# --------------------------------------------------------
+# LARS optimizer, implementation from MoCo v3:
+# https://github.com/facebookresearch/moco-v3
+# --------------------------------------------------------
+import torch
+class LARS(torch.optim.Optimizer):
+    """
+    LARS optimizer, no rate scaling or weight decay for parameters <= 1D.
+    """
+    def __init__(
+        self, params, lr=0, weight_decay=0, momentum=0.9, trust_coefficient=0.001
+    ):
+        defaults = dict(
+            lr=lr,
+            weight_decay=weight_decay,
+            momentum=momentum,
+            trust_coefficient=trust_coefficient,
+        )
+        super().__init__(params, defaults)
+    @torch.no_grad()
+    def step(self):
+        for g in self.param_groups:
+            for p in g["params"]:
+                dp = p.grad
+                if dp is None:
+                    continue
+                if p.ndim > 1:  # if not normalization gamma/beta or bias
+                    dp = dp.add(p, alpha=g["weight_decay"])
+                    param_norm = torch.norm(p)
+                    update_norm = torch.norm(dp)
+                    one = torch.ones_like(param_norm)
+                    q = torch.where(
+                        param_norm > 0.0,
+                        torch.where(
+                            update_norm > 0,
+                            (g["trust_coefficient"] * param_norm / update_norm),
+                            one,
+                        ),
+                        one,
+                    )
+                    dp = dp.mul(q)
+                param_state = self.state[p]
+                if "mu" not in param_state:
+                    param_state["mu"] = torch.zeros_like(p)
+                mu = param_state["mu"]
+                mu.mul_(g["momentum"]).add_(dp)
+                p.add_(mu, alpha=-g["lr"])

qa_mdt/audioldm_train/modules/audiomae/util/lr_decay.py ADDED Viewed

	@@ -0,0 +1,78 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# --------------------------------------------------------
+# References:
+# ELECTRA https://github.com/google-research/electra
+# BEiT: https://github.com/microsoft/unilm/tree/master/beit
+# --------------------------------------------------------
+import json
+def param_groups_lrd(
+    model, weight_decay=0.05, no_weight_decay_list=[], layer_decay=0.75
+):
+    """
+    Parameter groups for layer-wise lr decay
+    Following BEiT: https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py#L58
+    """
+    param_group_names = {}
+    param_groups = {}
+    num_layers = len(model.blocks) + 1
+    layer_scales = list(layer_decay ** (num_layers - i) for i in range(num_layers + 1))
+    for n, p in model.named_parameters():
+        if not p.requires_grad:
+            continue
+        # no decay: all 1D parameters and model specific ones
+        if p.ndim == 1 or n in no_weight_decay_list:
+            g_decay = "no_decay"
+            this_decay = 0.0
+        else:
+            g_decay = "decay"
+            this_decay = weight_decay
+        layer_id = get_layer_id_for_vit(n, num_layers)
+        group_name = "layer_%d_%s" % (layer_id, g_decay)
+        if group_name not in param_group_names:
+            this_scale = layer_scales[layer_id]
+            param_group_names[group_name] = {
+                "lr_scale": this_scale,
+                "weight_decay": this_decay,
+                "params": [],
+            }
+            param_groups[group_name] = {
+                "lr_scale": this_scale,
+                "weight_decay": this_decay,
+                "params": [],
+            }
+        param_group_names[group_name]["params"].append(n)
+        param_groups[group_name]["params"].append(p)
+    # print("parameter groups: \n%s" % json.dumps(param_group_names, indent=2))
+    return list(param_groups.values())
+def get_layer_id_for_vit(name, num_layers):
+    """
+    Assign a parameter with its layer id
+    Following BEiT: https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py#L33
+    """
+    if name in ["cls_token", "pos_embed"]:
+        return 0
+    elif name.startswith("patch_embed"):
+        return 0
+    elif name.startswith("blocks"):
+        return int(name.split(".")[1]) + 1
+    else:
+        return num_layers

qa_mdt/audioldm_train/modules/audiomae/util/lr_sched.py ADDED Viewed

	@@ -0,0 +1,28 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import math
+def adjust_learning_rate(optimizer, epoch, args):
+    """Decay the learning rate with half-cycle cosine after warmup"""
+    if epoch < args.warmup_epochs:
+        lr = args.lr * epoch / args.warmup_epochs
+    else:
+        lr = args.min_lr + (args.lr - args.min_lr) * 0.5 * (
+            1.0
+            + math.cos(
+                math.pi
+                * (epoch - args.warmup_epochs)
+                / (args.epochs - args.warmup_epochs)
+            )
+        )
+    for param_group in optimizer.param_groups:
+        if "lr_scale" in param_group:
+            param_group["lr"] = lr * param_group["lr_scale"]
+        else:
+            param_group["lr"] = lr
+    return lr

qa_mdt/audioldm_train/modules/audiomae/util/misc.py ADDED Viewed

	@@ -0,0 +1,454 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# --------------------------------------------------------
+# References:
+# DeiT: https://github.com/facebookresearch/deit
+# BEiT: https://github.com/microsoft/unilm/tree/master/beit
+# --------------------------------------------------------
+import builtins
+import datetime
+import os
+import time
+from collections import defaultdict, deque
+from pathlib import Path
+import torch
+import torch.distributed as dist
+from torch._six import inf
+class SmoothedValue(object):
+    """Track a series of values and provide access to smoothed values over a
+    window or the global series average.
+    """
+    def __init__(self, window_size=20, fmt=None):
+        if fmt is None:
+            fmt = "{median:.4f} ({global_avg:.4f})"
+        self.deque = deque(maxlen=window_size)
+        self.total = 0.0
+        self.count = 0
+        self.fmt = fmt
+    def update(self, value, n=1):
+        self.deque.append(value)
+        self.count += n
+        self.total += value * n
+    def synchronize_between_processes(self):
+        """
+        Warning: does not synchronize the deque!
+        """
+        if not is_dist_avail_and_initialized():
+            return
+        t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda")
+        dist.barrier()
+        dist.all_reduce(t)
+        t = t.tolist()
+        self.count = int(t[0])
+        self.total = t[1]
+    @property
+    def median(self):
+        d = torch.tensor(list(self.deque))
+        return d.median().item()
+    @property
+    def avg(self):
+        d = torch.tensor(list(self.deque), dtype=torch.float32)
+        return d.mean().item()
+    @property
+    def global_avg(self):
+        return self.total / self.count
+    @property
+    def max(self):
+        return max(self.deque)
+    @property
+    def value(self):
+        return self.deque[-1]
+    def __str__(self):
+        return self.fmt.format(
+            median=self.median,
+            avg=self.avg,
+            global_avg=self.global_avg,
+            max=self.max,
+            value=self.value,
+        )
+class MetricLogger(object):
+    def __init__(self, delimiter="\t"):
+        self.meters = defaultdict(SmoothedValue)
+        self.delimiter = delimiter
+    def update(self, **kwargs):
+        for k, v in kwargs.items():
+            if v is None:
+                continue
+            if isinstance(v, torch.Tensor):
+                v = v.item()
+            assert isinstance(v, (float, int))
+            self.meters[k].update(v)
+    def __getattr__(self, attr):
+        if attr in self.meters:
+            return self.meters[attr]
+        if attr in self.__dict__:
+            return self.__dict__[attr]
+        raise AttributeError(
+            "'{}' object has no attribute '{}'".format(type(self).__name__, attr)
+        )
+    def __str__(self):
+        loss_str = []
+        for name, meter in self.meters.items():
+            loss_str.append("{}: {}".format(name, str(meter)))
+        return self.delimiter.join(loss_str)
+    def synchronize_between_processes(self):
+        for meter in self.meters.values():
+            meter.synchronize_between_processes()
+    def add_meter(self, name, meter):
+        self.meters[name] = meter
+    def log_every(self, iterable, print_freq, header=None):
+        i = 0
+        if not header:
+            header = ""
+        start_time = time.time()
+        end = time.time()
+        iter_time = SmoothedValue(fmt="{avg:.4f}")
+        data_time = SmoothedValue(fmt="{avg:.4f}")
+        space_fmt = ":" + str(len(str(len(iterable)))) + "d"
+        log_msg = [
+            header,
+            "[{0" + space_fmt + "}/{1}]",
+            "eta: {eta}",
+            "{meters}",
+            "time: {time}",
+            "data: {data}",
+        ]
+        if torch.cuda.is_available():
+            log_msg.append("max mem: {memory:.0f}")
+        log_msg = self.delimiter.join(log_msg)
+        MB = 1024.0 * 1024.0
+        for obj in iterable:
+            data_time.update(time.time() - end)
+            yield obj
+            iter_time.update(time.time() - end)
+            if i % print_freq == 0 or i == len(iterable) - 1:
+                eta_seconds = iter_time.global_avg * (len(iterable) - i)
+                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
+                if torch.cuda.is_available():
+                    print(
+                        log_msg.format(
+                            i,
+                            len(iterable),
+                            eta=eta_string,
+                            meters=str(self),
+                            time=str(iter_time),
+                            data=str(data_time),
+                            memory=torch.cuda.max_memory_allocated() / MB,
+                        )
+                    )
+                else:
+                    print(
+                        log_msg.format(
+                            i,
+                            len(iterable),
+                            eta=eta_string,
+                            meters=str(self),
+                            time=str(iter_time),
+                            data=str(data_time),
+                        )
+                    )
+            i += 1
+            end = time.time()
+        total_time = time.time() - start_time
+        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+        print(
+            "{} Total time: {} ({:.4f} s / it)".format(
+                header, total_time_str, total_time / len(iterable)
+            )
+        )
+def setup_for_distributed(is_master):
+    """
+    This function disables printing when not in master process
+    """
+    builtin_print = builtins.print
+    def print(*args, **kwargs):
+        force = kwargs.pop("force", False)
+        force = force or (get_world_size() > 8)
+        if is_master or force:
+            now = datetime.datetime.now().time()
+            builtin_print("[{}] ".format(now), end="")  # print with time stamp
+            builtin_print(*args, **kwargs)
+    builtins.print = print
+def is_dist_avail_and_initialized():
+    if not dist.is_available():
+        return False
+    if not dist.is_initialized():
+        return False
+    return True
+def get_world_size():
+    if not is_dist_avail_and_initialized():
+        return 1
+    return dist.get_world_size()
+def get_rank():
+    if not is_dist_avail_and_initialized():
+        return 0
+    return dist.get_rank()
+def is_main_process():
+    return get_rank() == 0
+def save_on_master(*args, **kwargs):
+    if is_main_process():
+        torch.save(*args, **kwargs)
+def init_distributed_mode(args):
+    """Initialise torch.distributed from the environment and record it on ``args``.
+
+    Rank, world size and GPU index are taken from the first source that matches:
+    OpenMPI variables (when ``args.dist_on_itp`` is set), the
+    ``RANK`` / ``WORLD_SIZE`` / ``LOCAL_RANK`` variables, or ``SLURM_PROCID``.
+    When none matches, ``args.distributed`` is set to False and the function
+    returns without creating a process group.
+
+    Args:
+        args: Mutable namespace. Reads ``dist_on_itp`` and, depending on the
+            branch, ``dist_url`` and ``world_size``; writes ``rank``,
+            ``world_size``, ``gpu``, ``dist_url``, ``distributed`` and
+            ``dist_backend``.
+    """
+    if args.dist_on_itp:
+        args.rank = int(os.environ["OMPI_COMM_WORLD_RANK"])
+        args.world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"])
+        args.gpu = int(os.environ["OMPI_COMM_WORLD_LOCAL_RANK"])
+        args.dist_url = "tcp://%s:%s" % (
+            os.environ["MASTER_ADDR"],
+            os.environ["MASTER_PORT"],
+        )
+        # Re-export under the variable names read by the second branch.
+        os.environ["LOCAL_RANK"] = str(args.gpu)
+        os.environ["RANK"] = str(args.rank)
+        os.environ["WORLD_SIZE"] = str(args.world_size)
+        # ["RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK"]
+    elif "RANK" in os.environ and "WORLD_SIZE" in os.environ:
+        # NOTE(review): args.dist_url is not set in this branch; it must already
+        # exist on args for the calls below - confirm against the caller.
+        args.rank = int(os.environ["RANK"])
+        args.world_size = int(os.environ["WORLD_SIZE"])
+        args.gpu = int(os.environ["LOCAL_RANK"])
+    elif "SLURM_PROCID" in os.environ:
+        # NOTE(review): neither args.world_size nor args.dist_url is set in this
+        # branch; both are read below - confirm they are provided by the caller.
+        args.rank = int(os.environ["SLURM_PROCID"])
+        args.gpu = args.rank % torch.cuda.device_count()
+    else:
+        print("Not using distributed mode")
+        setup_for_distributed(is_master=True)  # hack
+        args.distributed = False
+        return
+    args.distributed = True
+    torch.cuda.set_device(args.gpu)
+    args.dist_backend = "nccl"
+    print(
+        "| distributed init (rank {}): {}, gpu {}".format(
+            args.rank, args.dist_url, args.gpu
+        ),
+        flush=True,
+    )
+    torch.distributed.init_process_group(
+        backend=args.dist_backend,
+        init_method=args.dist_url,
+        world_size=args.world_size,
+        rank=args.rank,
+    )
+    # Wait for every rank before printing is restricted to rank 0.
+    torch.distributed.barrier()
+    setup_for_distributed(args.rank == 0)
+class NativeScalerWithGradNormCount:
+    """
+    Wrapper around ``torch.cuda.amp.GradScaler`` that runs the scaled backward
+    pass, optionally clips gradients, steps the optimizer and reports the
+    gradient norm.
+    """
+    state_dict_key = "amp_scaler"
+    def __init__(self):
+        self._scaler = torch.cuda.amp.GradScaler()
+    def __call__(
+        self,
+        loss,
+        optimizer,
+        clip_grad=None,
+        parameters=None,
+        create_graph=False,
+        update_grad=True,
+    ):
+        """
+        Back-propagate ``loss`` through the scaler and optionally update weights.
+        Args:
+            loss: scalar tensor to back-propagate.
+            optimizer: optimizer whose gradients are unscaled and stepped.
+            clip_grad: max gradient norm; when given, ``parameters`` is required.
+            parameters: tensors whose gradient norm is clipped and/or measured.
+            create_graph: forwarded to ``Tensor.backward``.
+            update_grad: when False only the backward pass runs (no unscale,
+                no optimizer step, no scaler update).
+        Returns:
+            The gradient norm, or None when ``update_grad`` is False or when no
+            ``parameters`` were supplied to measure.
+        """
+        self._scaler.scale(loss).backward(create_graph=create_graph)
+        if not update_grad:
+            return None
+        if clip_grad is not None:
+            assert parameters is not None
+        # unscale the gradients of optimizer's assigned params in-place
+        self._scaler.unscale_(optimizer)
+        if clip_grad is not None:
+            norm = torch.nn.utils.clip_grad_norm_(parameters, clip_grad)
+        elif parameters is not None:
+            norm = get_grad_norm_(parameters)
+        else:
+            # Previously this path called get_grad_norm_(None), which raised a
+            # TypeError, so the signature's own defaults were unusable.
+            norm = None
+        self._scaler.step(optimizer)
+        self._scaler.update()
+        return norm
+    def state_dict(self):
+        """Return the wrapped GradScaler's state dict."""
+        return self._scaler.state_dict()
+    def load_state_dict(self, state_dict):
+        """Restore the wrapped GradScaler from ``state_dict``."""
+        self._scaler.load_state_dict(state_dict)
+def get_grad_norm_(parameters, norm_type: float = 2.0) -> torch.Tensor:
+    """
+    Compute the overall ``norm_type``-norm of the gradients of ``parameters``.
+    ``parameters`` may be a single tensor or an iterable of tensors; entries whose
+    ``.grad`` is None are ignored. Returns ``torch.tensor(0.0)`` when no entry has
+    a gradient. For ``norm_type == inf`` the result is the largest absolute
+    gradient entry.
+    """
+    if isinstance(parameters, torch.Tensor):
+        parameters = [parameters]
+    grads = [p.grad for p in parameters if p.grad is not None]
+    norm_type = float(norm_type)
+    if not grads:
+        return torch.tensor(0.0)
+    # Everything is moved to the first gradient's device before being combined.
+    device = grads[0].device
+    if norm_type == inf:
+        return max(g.detach().abs().max().to(device) for g in grads)
+    per_param = [torch.norm(g.detach(), norm_type).to(device) for g in grads]
+    return torch.norm(torch.stack(per_param), norm_type)
+def save_model(args, epoch, model, model_without_ddp, optimizer, loss_scaler):
+    """
+    Write a checkpoint for ``epoch`` under ``args.output_dir``.
+    With a ``loss_scaler`` the checkpoint is a dict (model, optimizer, epoch,
+    scaler, args) saved through ``save_on_master`` as ``checkpoint-<epoch>.pth``.
+    Without one, saving is delegated to ``model.save_checkpoint`` with the tag
+    ``checkpoint-<epoch>``.
+    """
+    output_dir = Path(args.output_dir)
+    epoch_name = str(epoch)
+    if loss_scaler is None:
+        model.save_checkpoint(
+            save_dir=args.output_dir,
+            tag="checkpoint-%s" % epoch_name,
+            client_state={"epoch": epoch},
+        )
+        return
+    state = {
+        "model": model_without_ddp.state_dict(),
+        "optimizer": optimizer.state_dict(),
+        "epoch": epoch,
+        "scaler": loss_scaler.state_dict(),
+        "args": args,
+    }
+    save_on_master(state, output_dir / ("checkpoint-%s.pth" % epoch_name))
+def load_model(args, model_without_ddp, optimizer, loss_scaler):
+    """
+    Resume from ``args.resume`` when it is set; otherwise do nothing.
+    The checkpoint is fetched with ``torch.hub.load_state_dict_from_url`` when the
+    path starts with "https", else read with ``torch.load``; both map to CPU.
+    Model weights are always restored. Optimizer state, ``args.start_epoch`` and
+    (if present) the scaler state are restored only when the checkpoint holds
+    both "optimizer" and "epoch" and ``args.eval`` is not truthy.
+    """
+    if not args.resume:
+        return
+    if args.resume.startswith("https"):
+        checkpoint = torch.hub.load_state_dict_from_url(
+            args.resume, map_location="cpu", check_hash=True
+        )
+    else:
+        checkpoint = torch.load(args.resume, map_location="cpu")
+    model_without_ddp.load_state_dict(checkpoint["model"])
+    print("Resume checkpoint %s" % args.resume)
+    if not ("optimizer" in checkpoint and "epoch" in checkpoint):
+        return
+    if hasattr(args, "eval") and args.eval:
+        return
+    optimizer.load_state_dict(checkpoint["optimizer"])
+    # Training continues with the epoch after the one stored in the checkpoint.
+    args.start_epoch = checkpoint["epoch"] + 1
+    if "scaler" in checkpoint:
+        loss_scaler.load_state_dict(checkpoint["scaler"])
+    print("With optim & sched!")
+def all_reduce_mean(x):
+    """
+    Average ``x`` over all processes.
+    With a world size of 1 or less, ``x`` is returned untouched. Otherwise it is
+    wrapped in a CUDA tensor, all-reduced, divided by the world size and
+    returned via ``.item()``.
+    """
+    world_size = get_world_size()
+    if world_size <= 1:
+        return x
+    reduced = torch.tensor(x).cuda()
+    dist.all_reduce(reduced)
+    reduced /= world_size
+    return reduced.item()
+# utils
+@torch.no_grad()
+def concat_all_gather(tensor):
+    """
+    All-gather ``tensor`` from every process and concatenate along dim 0.
+    *** Warning ***: torch.distributed.all_gather has no gradient.
+    """
+    world = torch.distributed.get_world_size()
+    # One receive buffer per rank, shaped like the local tensor.
+    gathered = [torch.ones_like(tensor) for _ in range(world)]
+    torch.distributed.all_gather(gathered, tensor, async_op=False)
+    return torch.cat(gathered, dim=0)
+def merge_vmae_to_avmae(avmae_state_dict, vmae_ckpt):
+    """
+    Overwrite the un-suffixed entries of ``vmae_ckpt`` with values derived from
+    its ``*_v`` entries, in place.
+    ``avmae_state_dict`` is not used by this function and nothing is returned;
+    the result is the mutated ``vmae_ckpt``. The keys "pos_embed" and
+    "patch_embed.proj.weight" must already exist because their shapes are
+    asserted against the converted tensors.
+    """
+    vmae_ckpt["cls_token"] = vmae_ckpt["cls_token_v"]
+    vmae_ckpt["mask_token"] = vmae_ckpt["mask_token_v"]
+    # Position embedding: drop the first token, collapse the 2-slot axis, then
+    # resize the 14x14 grid to 64x8.
+    # NOTE(review): the reshape below needs exactly 2*14*14 = 392 position tokens
+    # of width 768, i.e. pos_embed_v of shape (1, 393, 768); the previous comment
+    # here said 1,589,768, which would not fit this reshape - confirm the
+    # checkpoint layout.
+    pos_embed_v = vmae_ckpt["pos_embed_v"]
+    pos_embed = pos_embed_v[:, 1:, :]  # all tokens except the first
+    cls_embed = pos_embed_v[:, 0, :].unsqueeze(1)  # first token, kept as-is
+    pos_embed = pos_embed.reshape(1, 2, 14, 14, 768).sum(dim=1)  # 1, 14, 14, 768
+    print("Position interpolate from 14,14 to 64,8")
+    pos_embed = pos_embed.permute(0, 3, 1, 2)  # 1, 14,14,768 -> 1,768,14,14
+    pos_embed = torch.nn.functional.interpolate(
+        pos_embed, size=(64, 8), mode="bicubic", align_corners=False
+    )
+    pos_embed = pos_embed.permute(0, 2, 3, 1).flatten(
+        1, 2
+    )  # 1, 768, 64, 8 -> 1, 64, 8, 768 -> 1, 512, 768
+    pos_embed = torch.cat((cls_embed, pos_embed), dim=1)  # 1, 513, 768
+    assert vmae_ckpt["pos_embed"].shape == pos_embed.shape
+    vmae_ckpt["pos_embed"] = pos_embed
+    # Patch embedding: sum the video projection weight over dim 2 and then dim 1,
+    # and re-insert a singleton dim 1. Assuming the weight is laid out as
+    # 768,3,2,16,16 (per the original note - TODO confirm) this yields 768,1,16,16.
+    v_weight = vmae_ckpt["patch_embed_v.proj.weight"]
+    new_proj_weight = torch.nn.Parameter(v_weight.sum(dim=2).sum(dim=1).unsqueeze(1))
+    assert new_proj_weight.shape == vmae_ckpt["patch_embed.proj.weight"].shape
+    vmae_ckpt["patch_embed.proj.weight"] = new_proj_weight
+    vmae_ckpt["patch_embed.proj.bias"] = vmae_ckpt["patch_embed_v.proj.bias"]
+    # hack: reuse the "_v" final-norm parameters for the un-suffixed norm
+    vmae_ckpt["norm.weight"] = vmae_ckpt["norm_v.weight"]
+    vmae_ckpt["norm.bias"] = vmae_ckpt["norm_v.bias"]
+    # Replace every "blocks.*" entry with its "blocks_v.*" counterpart. Only
+    # existing keys are reassigned, so the dict does not change size while it is
+    # being iterated. ``v`` is unused.
+    for k, v in vmae_ckpt.items():
+        if k.startswith("blocks."):
+            kk = k.replace("blocks.", "blocks_v.")
+            vmae_ckpt[k] = vmae_ckpt[kk]
+        elif k.startswith("blocks_v."):
+            pass
+        else:
+            # Debug output: every key that is not part of a transformer block.
+            print(k)
+            pass
+    # Debug output: the last key visited by the loop.
+    print(k)

qa_mdt/audioldm_train/modules/audiomae/util/patch_embed.py ADDED Viewed

	@@ -0,0 +1,127 @@

+import torch
+import torch.nn as nn
+from timm.models.layers import to_2tuple
+class PatchEmbed_org(nn.Module):
+    """Image to Patch Embedding using a convolution whose stride equals its kernel."""
+    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
+        super().__init__()
+        self.img_size = to_2tuple(img_size)
+        self.patch_size = to_2tuple(patch_size)
+        # Patch counts are taken along index 1 first, then index 0, of img_size.
+        self.patch_hw = (
+            self.img_size[1] // self.patch_size[1],
+            self.img_size[0] // self.patch_size[0],
+        )
+        self.num_patches = self.patch_hw[0] * self.patch_hw[1]
+        self.proj = nn.Conv2d(
+            in_chans, embed_dim, kernel_size=self.patch_size, stride=self.patch_size
+        )
+    def forward(self, x):
+        """Project ``x`` and return it as (batch, num_patches, embed_dim)."""
+        # The unpacking only verifies that x is 4-D; the input size is
+        # deliberately not checked against self.img_size.
+        B, C, H, W = x.shape
+        return self.proj(x).flatten(2).transpose(1, 2)
+class PatchEmbed_new(nn.Module):
+    """
+    Flexible Image to Patch Embedding.
+    Unlike ``PatchEmbed_org`` the convolution stride is independent of the patch
+    size, so patches overlap whenever ``stride`` is smaller than ``patch_size``.
+    The patch grid (``patch_hw``) and ``num_patches`` are measured by running the
+    projection once on a dummy input of size ``img_size``.
+    """
+    def __init__(
+        self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, stride=10
+    ):
+        super().__init__()
+        img_size = to_2tuple(img_size)
+        patch_size = to_2tuple(patch_size)
+        stride = to_2tuple(stride)
+        self.img_size = img_size
+        self.patch_size = patch_size
+        # Kept so get_output_shape() can build a probe with the right channel count.
+        self.in_chans = in_chans
+        self.proj = nn.Conv2d(
+            in_chans, embed_dim, kernel_size=patch_size, stride=stride
+        )  # with overlapped patches
+        _, _, h, w = self.get_output_shape(img_size)  # n, emb_dim, h, w
+        self.patch_hw = (h, w)
+        self.num_patches = h * w
+    def get_output_shape(self, img_size):
+        """Return the shape of ``self.proj`` applied to one dummy input of ``img_size``."""
+        # The probe previously had a hard-coded single channel, which made
+        # construction fail for any in_chans != 1 (including the default of 3).
+        # No gradient tracking is needed just to read a shape.
+        with torch.no_grad():
+            probe = torch.randn(1, self.in_chans, img_size[0], img_size[1])
+            return self.proj(probe).shape
+    def forward(self, x):
+        """Project ``x`` and return it as (batch, num_patches, embed_dim)."""
+        # The unpacking only verifies that x is 4-D; the input size is
+        # deliberately not checked against self.img_size.
+        B, C, H, W = x.shape
+        x = self.proj(x)  # e.g. 32, 1, 1024, 128 -> 32, 768, 101, 12
+        x = x.flatten(2)  # 32, 768, 101, 12 -> 32, 768, 1212
+        x = x.transpose(1, 2)  # 32, 768, 1212 -> 32, 1212, 768
+        return x
+class PatchEmbed3D_new(nn.Module):
+    """
+    Flexible video to patch embedding built on a 3-D convolution.
+    ``patch_thw`` and ``num_patches`` are measured by running the projection once
+    on a dummy clip of size ``video_size``.
+    """
+    def __init__(
+        self,
+        video_size=(16, 224, 224),
+        patch_size=(2, 16, 16),
+        in_chans=3,
+        embed_dim=768,
+        stride=(2, 16, 16),
+    ):
+        super().__init__()
+        self.video_size = video_size
+        self.patch_size = patch_size
+        self.in_chans = in_chans
+        self.proj = nn.Conv3d(
+            in_chans, embed_dim, kernel_size=patch_size, stride=stride
+        )
+        # Output is (n, emb_dim, t, h, w); only the trailing grid is kept.
+        grid_t, grid_h, grid_w = self.get_output_shape(video_size)[2:]
+        self.patch_thw = (grid_t, grid_h, grid_w)
+        self.num_patches = grid_t * grid_h * grid_w
+    def get_output_shape(self, video_size):
+        """Return the shape of ``self.proj`` applied to one dummy clip of ``video_size``."""
+        frames, height, width = video_size[0], video_size[1], video_size[2]
+        probe = torch.randn(1, self.in_chans, frames, height, width)
+        return self.proj(probe).shape
+    def forward(self, x):
+        """Project ``x`` and return it as (batch, num_patches, embed_dim)."""
+        B, C, T, H, W = x.shape  # verifies that x is 5-D
+        # e.g. 32, 3, 16, 224, 224 -> 32, 768, 8, 14, 14 -> 32, 768, 1568 -> 32, 1568, 768
+        return self.proj(x).flatten(2).transpose(1, 2)
+if __name__ == "__main__":
+    # Smoke test for PatchEmbed3D_new. The 2-D variant can be exercised the same way:
+    # patch_emb = PatchEmbed_new(img_size=224, patch_size=16, in_chans=1, embed_dim=64, stride=(16,16))
+    # input = torch.rand(8,1,1024,128)
+    # output = patch_emb(input)
+    # print(output.shape) # (8,512,64)
+    patch_emb = PatchEmbed3D_new(
+        video_size=(6, 224, 224),
+        patch_size=(2, 16, 16),
+        in_chans=3,
+        embed_dim=768,
+        stride=(2, 16, 16),
+    )
+    # NOTE: ``input`` shadows the builtin of the same name within this script.
+    input = torch.rand(8, 3, 6, 224, 224)
+    output = patch_emb(input)
+    # 6/2 * 224/16 * 224/16 = 3 * 14 * 14 = 588 patches of width 768
+    print(output.shape)  # torch.Size([8, 588, 768])

qa_mdt/audioldm_train/modules/audiomae/util/pos_embed.py ADDED Viewed

	@@ -0,0 +1,205 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# --------------------------------------------------------
+# Position embedding utils
+# --------------------------------------------------------
+import numpy as np
+import torch
+# --------------------------------------------------------
+# 2D sine-cosine position embedding
+# References:
+# Transformer: https://github.com/tensorflow/models/blob/master/official/nlp/transformer/model_utils.py
+# MoCo v3: https://github.com/facebookresearch/moco-v3
+# --------------------------------------------------------
+def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
+    """
+    Build a 2-D sine-cosine position embedding for a square grid.
+    grid_size: int of the grid height and width
+    return:
+    pos_embed: [grid_size*grid_size, embed_dim], or [1+grid_size*grid_size, embed_dim]
+    with a leading all-zero row when cls_token is true
+    """
+    axis = np.arange(grid_size, dtype=np.float32)
+    # meshgrid is called with the width axis first
+    grid = np.stack(np.meshgrid(axis, axis), axis=0)
+    grid = grid.reshape([2, 1, grid_size, grid_size])
+    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
+    if not cls_token:
+        return pos_embed
+    return np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
+def get_2d_sincos_pos_embed_flexible(embed_dim, grid_size, cls_token=False):
+    """
+    Build a 2-D sine-cosine position embedding for a rectangular grid.
+    grid_size: pair (height, width) of the grid
+    return:
+    pos_embed: [height*width, embed_dim], or [1+height*width, embed_dim]
+    with a leading all-zero row when cls_token is true
+    """
+    rows = np.arange(grid_size[0], dtype=np.float32)
+    cols = np.arange(grid_size[1], dtype=np.float32)
+    # meshgrid is called with the width axis first
+    grid = np.stack(np.meshgrid(cols, rows), axis=0)
+    grid = grid.reshape([2, 1, grid_size[0], grid_size[1]])
+    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
+    if not cls_token:
+        return pos_embed
+    return np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
+def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
+    """
+    Encode ``grid[0]`` and ``grid[1]`` with 1-D sine-cosine embeddings of
+    ``embed_dim // 2`` channels each and concatenate them into (H*W, embed_dim).
+    ``embed_dim`` must be even.
+    """
+    assert embed_dim % 2 == 0
+    half = embed_dim // 2
+    # one half of the channels per grid plane, each of shape (H*W, D/2)
+    halves = [get_1d_sincos_pos_embed_from_grid(half, grid[axis]) for axis in (0, 1)]
+    return np.concatenate(halves, axis=1)  # (H*W, D)
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+    """
+    Sine-cosine embedding of a set of scalar positions.
+    embed_dim: output dimension for each position (must be even)
+    pos: array of positions to be encoded; flattened to size (M,)
+    out: (M, D), sines in the first D/2 columns and cosines in the last D/2
+    """
+    assert embed_dim % 2 == 0
+    exponent = np.arange(embed_dim // 2, dtype=float)
+    exponent /= embed_dim / 2.0
+    freqs = 1.0 / 10000**exponent  # (D/2,)
+    flat_pos = pos.reshape(-1)  # (M,)
+    # outer product of positions and frequencies, (M, D/2)
+    angles = flat_pos[:, None] * freqs[None, :]
+    return np.concatenate([np.sin(angles), np.cos(angles)], axis=1)  # (M, D)
+# --------------------------------------------------------
+# Interpolate position embeddings for high-resolution
+# References:
+# DeiT: https://github.com/facebookresearch/deit
+# --------------------------------------------------------
+def interpolate_pos_embed(model, checkpoint_model):
+    """
+    Resize ``checkpoint_model["pos_embed"]`` in place to match ``model``'s patch grid.
+    Both grids are treated as square. Does nothing when the key is missing or
+    the two grid sizes already agree. Extra (non-patch) tokens at the front are
+    kept unchanged; only the patch tokens are bicubically interpolated.
+    """
+    if "pos_embed" not in checkpoint_model:
+        return
+    ckpt_embed = checkpoint_model["pos_embed"]
+    embedding_size = ckpt_embed.shape[-1]
+    num_patches = model.patch_embed.num_patches
+    num_extra_tokens = model.pos_embed.shape[-2] - num_patches
+    # height (== width) of the checkpoint grid and of the model grid
+    orig_size = int((ckpt_embed.shape[-2] - num_extra_tokens) ** 0.5)
+    new_size = int(num_patches**0.5)
+    if orig_size == new_size:
+        return
+    print(
+        "Position interpolate from %dx%d to %dx%d"
+        % (orig_size, orig_size, new_size, new_size)
+    )
+    # class_token and dist_token are kept unchanged
+    extra_tokens = ckpt_embed[:, :num_extra_tokens]
+    grid = ckpt_embed[:, num_extra_tokens:].reshape(
+        -1, orig_size, orig_size, embedding_size
+    )
+    # interpolate() wants channels first; move them back afterwards
+    grid = torch.nn.functional.interpolate(
+        grid.permute(0, 3, 1, 2),
+        size=(new_size, new_size),
+        mode="bicubic",
+        align_corners=False,
+    )
+    grid = grid.permute(0, 2, 3, 1).flatten(1, 2)
+    checkpoint_model["pos_embed"] = torch.cat((extra_tokens, grid), dim=1)
+def interpolate_pos_embed_img2audio(model, checkpoint_model, orig_size, new_size):
+    """
+    Resize ``checkpoint_model["pos_embed"]`` in place from an ``orig_size`` grid to
+    a ``new_size`` grid (each a 2-element sequence) using bicubic interpolation.
+    Does nothing when the key is missing or ``orig_size`` equals ``new_size``.
+    Extra (non-patch) tokens at the front are kept unchanged.
+    """
+    if "pos_embed" not in checkpoint_model:
+        return
+    ckpt_embed = checkpoint_model["pos_embed"]
+    embedding_size = ckpt_embed.shape[-1]
+    num_patches = model.patch_embed.num_patches
+    num_extra_tokens = model.pos_embed.shape[-2] - num_patches
+    if orig_size != new_size:
+        print(
+            "Position interpolate from %dx%d to %dx%d"
+            % (orig_size[0], orig_size[1], new_size[0], new_size[1])
+        )
+        # class_token and dist_token are kept unchanged
+        extra_tokens = ckpt_embed[:, :num_extra_tokens]
+        grid = ckpt_embed[:, num_extra_tokens:].reshape(
+            -1, orig_size[0], orig_size[1], embedding_size
+        )
+        # interpolate() wants channels first; move them back afterwards
+        grid = torch.nn.functional.interpolate(
+            grid.permute(0, 3, 1, 2),
+            size=(new_size[0], new_size[1]),
+            mode="bicubic",
+            align_corners=False,
+        )
+        grid = grid.permute(0, 2, 3, 1).flatten(1, 2)
+        checkpoint_model["pos_embed"] = torch.cat((extra_tokens, grid), dim=1)
+def interpolate_pos_embed_audio(model, checkpoint_model, orig_size, new_size):
+    """
+    Crop ``checkpoint_model["pos_embed"]`` in place from an ``orig_size`` grid so
+    its second grid axis has ``new_size[1]`` entries.
+    No interpolation is performed: the patch tokens are viewed as an
+    ``orig_size[0]`` x ``orig_size[1]`` grid and sliced. The first token is kept
+    as-is. Does nothing when the key is missing or the sizes are equal.
+    """
+    if "pos_embed" not in checkpoint_model:
+        return
+    ckpt_embed = checkpoint_model["pos_embed"]
+    embedding_size = ckpt_embed.shape[-1]
+    # Not used below, but still read so a model lacking these attributes
+    # fails here exactly as before.
+    num_patches = model.patch_embed.num_patches
+    num_extra_tokens = model.pos_embed.shape[-2] - num_patches
+    if orig_size != new_size:
+        print(
+            "Position interpolate from %dx%d to %dx%d"
+            % (orig_size[0], orig_size[1], new_size[0], new_size[1])
+        )
+        cls_token = ckpt_embed[:, 0, :].unsqueeze(1)
+        grid = ckpt_embed[:, 1:, :].reshape(
+            -1, orig_size[0], orig_size[1], embedding_size
+        )
+        grid = grid[:, :, : new_size[1], :]  # assume only time diff
+        checkpoint_model["pos_embed"] = torch.cat(
+            (cls_token, grid.flatten(1, 2)), dim=1
+        )
+def interpolate_patch_embed_audio(
+    model,
+    checkpoint_model,
+    orig_channel,
+    new_channel=1,
+    kernel_size=(16, 16),
+    stride=(16, 16),
+    padding=(0, 0),
+):
+    """
+    Collapse the input-channel axis of ``checkpoint_model["patch_embed.proj.weight"]``
+    to a single channel by summation, in place.
+    Does nothing when ``orig_channel == new_channel`` or the key is absent.
+    ``model``, ``kernel_size``, ``stride`` and ``padding`` are accepted but unused.
+    """
+    if orig_channel == new_channel:
+        return
+    key = "patch_embed.proj.weight"
+    if key not in checkpoint_model:
+        return
+    # aggregate the input channels (e.g. 3 for an rgb ckpt) to 1 channel for audio
+    summed = torch.sum(checkpoint_model[key], dim=1).unsqueeze(1)
+    checkpoint_model[key] = torch.nn.Parameter(summed)

qa_mdt/audioldm_train/modules/audiomae/util/stat.py ADDED Viewed

	@@ -0,0 +1,77 @@

+import numpy as np
+from scipy import stats
+from sklearn import metrics
+import torch
+def d_prime(auc):
+    """Return sqrt(2) times the standard-normal quantile (ppf) of ``auc``."""
+    return stats.norm().ppf(auc) * np.sqrt(2.0)
+@torch.no_grad()
+def concat_all_gather(tensor):
+    """
+    All-gather ``tensor`` from every process and concatenate along dim 0.
+    *** Warning ***: torch.distributed.all_gather has no gradient.
+    """
+    world = torch.distributed.get_world_size()
+    # One receive buffer per rank, shaped like the local tensor.
+    gathered = [torch.ones_like(tensor) for _ in range(world)]
+    torch.distributed.all_gather(gathered, tensor, async_op=False)
+    return torch.cat(gathered, dim=0)
+def calculate_stats(output, target):
+    """Calculate per-class statistics (AP, sampled PR and ROC points, accuracy).
+    Args:
+      output: 2d array, (samples_num, classes_num)
+      target: 2d array, (samples_num, classes_num)
+    Returns:
+      list with one dict per class, keyed by "precisions", "recalls", "AP",
+      "fpr", "fnr" and "acc".
+    """
+    classes_num = target.shape[-1]
+    # Accuracy, only used for single-label classification such as esc-50, not for multiple label one such as AudioSet
+    acc = metrics.accuracy_score(np.argmax(target, 1), np.argmax(output, 1))
+    save_every_steps = 1000  # Sample statistics to reduce size
+    per_class = []
+    for k in range(classes_num):
+        truth = target[:, k]
+        scores = output[:, k]
+        avg_precision = metrics.average_precision_score(truth, scores, average=None)
+        precisions, recalls, _ = metrics.precision_recall_curve(truth, scores)
+        fpr, tpr, _ = metrics.roc_curve(truth, scores)
+        per_class.append(
+            {
+                "precisions": precisions[0::save_every_steps],
+                "recalls": recalls[0::save_every_steps],
+                "AP": avg_precision,
+                "fpr": fpr[0::save_every_steps],
+                "fnr": 1.0 - tpr[0::save_every_steps],
+                # note acc is not class-wise, this is just to keep consistent with other metrics
+                "acc": acc,
+            }
+        )
+    return per_class

qa_mdt/audioldm_train/modules/clap/__init__.py ADDED Viewed

File without changes

qa_mdt/audioldm_train/modules/clap/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (148 Bytes). View file

qa_mdt/audioldm_train/modules/clap/open_clip/__init__.py ADDED Viewed

	@@ -0,0 +1,25 @@

+from .factory import (
+    list_models,
+    create_model,
+    create_model_and_transforms,
+    add_model_config,
+)
+from .loss import ClipLoss, gather_features, LPLoss, lp_gather_features, LPMetrics
+from .model import (
+    CLAP,
+    CLAPTextCfg,
+    CLAPVisionCfg,
+    CLAPAudioCfp,
+    convert_weights_to_fp16,
+    trace_model,
+)
+from .openai import load_openai_model, list_openai_models
+from .pretrained import (
+    list_pretrained,
+    list_pretrained_tag_models,
+    list_pretrained_model_tags,
+    get_pretrained_url,
+    download_pretrained,
+)
+from .tokenizer import SimpleTokenizer, tokenize
+from .transform import image_transform

qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (954 Bytes). View file

qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/__init__.cpython-38.pyc ADDED Viewed

Binary file (1.01 kB). View file