Spaces:

AIGC-Audio
/

AudioLCM

Running on Zero

App Files Files Community

liuhuadai committed on Jun 4

Commit

6efc863

•

1 Parent(s): a00f5de

Upload 340 files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +2 -0
app.py +44 -0
audiocaps_test_16000_struct.tsv +0 -0
configs/audiolcm.yaml +130 -0
configs/autoencoder1d.yaml +74 -0
configs/teacher.yaml +121 -0
infer.sh +4 -0
infer_api.sh +4 -0
ldm/__pycache__/lr_scheduler.cpython-37.pyc +0 -0
ldm/__pycache__/lr_scheduler.cpython-38.pyc +0 -0
ldm/__pycache__/util.cpython-310.pyc +0 -0
ldm/__pycache__/util.cpython-37.pyc +0 -0
ldm/__pycache__/util.cpython-38.pyc +0 -0
ldm/data/__pycache__/joinaudiodataset_624.cpython-38.pyc +0 -0
ldm/data/__pycache__/joinaudiodataset_anylen.cpython-37.pyc +0 -0
ldm/data/__pycache__/joinaudiodataset_anylen.cpython-38.pyc +0 -0
ldm/data/__pycache__/joinaudiodataset_struct.cpython-38.pyc +0 -0
ldm/data/__pycache__/joinaudiodataset_struct_anylen.cpython-38.pyc +0 -0
ldm/data/__pycache__/joinaudiodataset_struct_sample_anylen.cpython-37.pyc +0 -0
ldm/data/__pycache__/joinaudiodataset_struct_sample_anylen.cpython-38.pyc +0 -0
ldm/data/__pycache__/tsvdataset.cpython-38.pyc +0 -0
ldm/data/joinaudiodataset_624.py +93 -0
ldm/data/joinaudiodataset_anylen.py +331 -0
ldm/data/joinaudiodataset_struct.py +95 -0
ldm/data/joinaudiodataset_struct_anylen.py +336 -0
ldm/data/joinaudiodataset_struct_sample.py +103 -0
ldm/data/joinaudiodataset_struct_sample_anylen.py +230 -0
ldm/data/preprocess/NAT_mel.py +131 -0
ldm/data/preprocess/__pycache__/NAT_mel.cpython-38.pyc +0 -0
ldm/data/preprocess/__pycache__/NAT_mel.cpython-39.pyc +0 -0
ldm/data/preprocess/add_duration.py +45 -0
ldm/data/preprocess/mel_spec.py +201 -0
ldm/data/test.py +224 -0
ldm/data/tsv_dirs/full_data/V1_new/audiocaps_train_16000.tsv +0 -0
ldm/data/tsv_dirs/full_data/V2/MACS.tsv +0 -0
ldm/data/tsv_dirs/full_data/V2/WavText5K.tsv +0 -0
ldm/data/tsv_dirs/full_data/V2/adobe.tsv +0 -0
ldm/data/tsv_dirs/full_data/V2/audiostock.tsv +0 -0
ldm/data/tsv_dirs/full_data/V2/epidemic_sound.tsv +3 -0
ldm/data/tsv_dirs/full_data/caps_struct/audiocaps_train_16000_struct2.tsv +0 -0
ldm/data/tsv_dirs/full_data/clotho.tsv +0 -0
ldm/data/tsvdataset.py +67 -0
ldm/lr_scheduler.py +98 -0
ldm/models/__pycache__/autoencoder.cpython-37.pyc +0 -0
ldm/models/__pycache__/autoencoder.cpython-38.pyc +0 -0
ldm/models/__pycache__/autoencoder.cpython-39.pyc +0 -0
ldm/models/__pycache__/autoencoder1d.cpython-37.pyc +0 -0
ldm/models/__pycache__/autoencoder1d.cpython-38.pyc +0 -0
ldm/models/__pycache__/autoencoder_multi.cpython-38.pyc +0 -0
ldm/models/autoencoder.py +504 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ldm/data/tsv_dirs/full_data/V2/epidemic_sound.tsv filter=lfs diff=lfs merge=lfs -text
+vocoder/BigVGAN/LibriTTS/train-full.txt filter=lfs diff=lfs merge=lfs -text

app.py ADDED Viewed

	@@ -0,0 +1,44 @@

+import gradio
+def infer(prompt):
+    """Synthesise one audio clip for ``prompt`` and write it under ``results/test``.
+
+    Loads ``configs/audiolcm.yaml`` and the hard-coded checkpoint, wraps the model
+    in an ``LCMSampler``, builds the BigVGAN vocoder and asks ``GenSamples`` for a
+    single test sample whose wav name is derived from the caption text.
+
+    Args:
+        prompt: either the caption string, or a dict with an ``'ori_caption'``
+            entry (the form ``my_inference_function`` passes in). It is handed
+            to ``gen_test_sample`` unchanged.
+
+    Returns:
+        None; the result is only written to disk.
+        NOTE(review): the caller binds the result to ``file_path`` -- presumably
+        the path of the generated wav should be returned; confirm against
+        ``GenSamples.gen_test_sample``.
+    """
+    # NOTE(review): OmegaConf, torch, os, load_model_from_config, LCMSampler,
+    # VocoderBigVGAN and GenSamples are not imported in the visible part of this
+    # file -- confirm they are brought into scope elsewhere.
+    # The config, model and vocoder are rebuilt on every call.
+    config = OmegaConf.load("configs/audiolcm.yaml")
+    model = load_model_from_config(config,
+                                   "../logs/2024-04-21T14-50-11_text2music-audioset-nonoverlap/epoch=000184.ckpt")
+    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+    model = model.to(device)
+    sampler = LCMSampler(model)
+    os.makedirs("results/test", exist_ok=True)
+    vocoder = VocoderBigVGAN("../vocoder/logs/bigvnat16k93.5w", device)
+    generator = GenSamples(sampler, model, "results/test", vocoder, save_mel=False, save_wav=True,
+                           original_inference_steps=config.model.params.num_ddim_timesteps)
+    # The caption may arrive as a dict, which has no ``strip``; pull the text out
+    # before building the file name.
+    caption = prompt['ori_caption'] if isinstance(prompt, dict) else prompt
+    wav_name = caption.strip().replace(" ", "-")
+    with torch.no_grad(), model.ema_scope():
+        generator.gen_test_sample(prompt, wav_name=wav_name)
+    print("Your samples are ready and waiting for you here: \nresults/test \nEnjoy.")
+def my_inference_function(prompt_oir):
+    """Gradio callback: run ``infer`` on the submitted text and return an audio path.
+
+    Args:
+        prompt_oir: the text entered in the Gradio text box.
+
+    Returns:
+        The fixed string ``"test.wav"``.
+    """
+    # The same text is used for both the original and the structured caption.
+    prompt = {'ori_caption':prompt_oir,'struct_caption':prompt_oir}
+    # NOTE(review): `infer` has no return statement in this file, so `file_path`
+    # is None and is never used afterwards.
+    file_path = infer(prompt)
+    # NOTE(review): the returned name does not depend on the prompt -- confirm
+    # that the generated wav really ends up at "test.wav".
+    return "test.wav"
+# Text box in, audio player out; every submission is routed to my_inference_function.
+gradio_interface = gradio.Interface(fn=my_inference_function, inputs="text", outputs="audio")
+# Start the web UI as soon as the module is executed.
+gradio_interface.launch()

audiocaps_test_16000_struct.tsv ADDED Viewed

The diff for this file is too large to render. See raw diff

configs/audiolcm.yaml ADDED Viewed

	@@ -0,0 +1,130 @@

+model:
+  base_learning_rate: 3.0e-06
+  target: ldm.models.diffusion.lcm_audio.LCM_audio
+  params:
+    linear_start: 0.00085
+    linear_end: 0.012
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: image
+    cond_stage_key: caption
+    mel_dim: 20
+    mel_length: 312
+    channels: 0
+    cond_stage_trainable: False
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_by_std: true
+    use_lcm: True
+    num_ddim_timesteps: 50
+    w_min: 4
+    w_max: 12
+    ckpt_path: ../ckpt/maa2.ckpt
+    use_ema: false
+    scheduler_config:
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps:
+        - 10000
+        cycle_lengths:
+        - 10000000000000
+        f_start:
+        - 1.0e-06
+        f_max:
+        - 1.0
+        f_min:
+        - 1.0
+    unet_config:
+      target: ldm.modules.diffusionmodules.concatDiT.ConcatDiT2MLP
+      params:
+        in_channels: 20
+        context_dim: 1024
+        hidden_size: 576
+        num_heads: 8
+        depth: 4
+        max_len: 1000
+    first_stage_config:
+      target: ldm.models.autoencoder1d.AutoencoderKL
+      params:
+        embed_dim: 20
+        monitor: val/rec_loss
+        ckpt_path: ./model/AutoencoderKL/epoch=000032.ckpt
+        ddconfig:
+          double_z: true
+          in_channels: 80
+          out_ch: 80
+          z_channels: 20
+          kernel_size: 5
+          ch: 384
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          num_res_blocks: 2
+          attn_layers:
+          - 3
+          down_layers:
+          - 0
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLAPFLANEmbedder
+      params:
+        weights_path: ./model/FrozenCLAPFLANEmbedder/CLAP_weights_2022.pth
+lightning:
+  callbacks:
+    image_logger:
+      target: main.AudioLogger
+      params:
+        sample_rate: 16000
+        for_specs: true
+        increase_log_steps: false
+        batch_frequency: 5000
+        max_images: 8
+        melvmin: -5
+        melvmax: 1.5
+        vocoder_cfg:
+          target: vocoder.bigvgan.models.VocoderBigVGAN
+          params:
+            ckpt_vocoder: ./vocoder/logs/bigvnat16k93.5w
+  trainer:
+    benchmark: True
+    gradient_clip_val: 1.0
+    replace_sampler_ddp: false
+    max_epochs: 100
+  modelcheckpoint:
+    params:
+      monitor: epoch
+      mode: max
+      # every_n_train_steps: 2000
+      save_top_k: 100
+      every_n_epochs: 3
+data:
+  target: main.SpectrogramDataModuleFromConfig
+  params:
+    batch_size: 8
+    num_workers: 32
+    spec_dir_path: 'ldm/data/tsv_dirs/full_data/caps_struct'
+    mel_num: 80
+    train:
+      target: ldm.data.joinaudiodataset_struct_anylen.JoinSpecsTrain
+      params:
+        specs_dataset_cfg:
+    validation:
+      target: ldm.data.joinaudiodataset_struct_anylen.JoinSpecsValidation
+      params:
+        specs_dataset_cfg:
+test_dataset:
+  target: ldm.data.tsvdataset.TSVDatasetStruct
+  params:
+    tsv_path: audiocaps_test_16000_struct.tsv
+    spec_crop_len: 624

configs/autoencoder1d.yaml ADDED Viewed

	@@ -0,0 +1,74 @@

+model:
+  base_learning_rate: 4.5e-06
+  target: ldm.models.autoencoder1d.AutoencoderKL
+  params:
+    embed_dim: 20
+    monitor: val/rec_loss
+    ddconfig:
+      double_z: true
+      in_channels: 80
+      out_ch: 80
+      z_channels: 20
+      kernel_size: 5
+      ch: 384
+      ch_mult:
+      - 1
+      - 2
+      - 4
+      num_res_blocks: 2
+      attn_layers:
+      - 3
+      down_layers:
+      - 0
+      dropout: 0.0
+    lossconfig:
+      target: ldm.modules.losses_audio.contperceptual.LPAPSWithDiscriminator
+      params:
+        disc_start: 80001
+        perceptual_weight: 0.0
+        kl_weight: 1.0e-06
+        disc_weight: 0.5
+        disc_in_channels: 1
+        disc_loss: mse
+        disc_factor: 2
+        disc_conditional: false
+        r1_reg_weight: 3
+lightning:
+  callbacks:
+    image_logger:
+      target: main.AudioLogger
+      params:
+        for_specs: true
+        increase_log_steps: false
+        batch_frequency: 5000
+        max_images: 8
+        rescale: false
+        melvmin: -5
+        melvmax: 1.5
+        vocoder_cfg:
+          target: vocoder.bigvgan.models.VocoderBigVGAN
+          params:
+            ckpt_vocoder: vocoder/logs/bigvnat16k93.5w
+  trainer:
+    sync_batchnorm: false # not working with r1_regularization
+    strategy: ddp
+data:
+  target: main.SpectrogramDataModuleFromConfig
+  params:
+    batch_size: 4
+    num_workers: 16
+    spec_dir_path: ldm/data/tsv_dirs/full_data/V1_new
+    mel_num: 80
+    spec_len: 624
+    spec_crop_len: 624
+    train:
+      target: ldm.data.joinaudiodataset_624.JoinSpecsTrain
+      params:
+        specs_dataset_cfg: null
+    validation:
+      target: ldm.data.joinaudiodataset_624.JoinSpecsValidation
+      params:
+        specs_dataset_cfg: null

configs/teacher.yaml ADDED Viewed

	@@ -0,0 +1,121 @@

+model:
+  base_learning_rate: 3.0e-06
+  target: ldm.models.diffusion.ddpm_audio.LatentDiffusion_audio
+  params:
+    linear_start: 0.00085
+    linear_end: 0.012
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: image
+    cond_stage_key: caption
+    mel_dim: 20
+    mel_length: 312
+    channels: 0
+    cond_stage_trainable: True
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_by_std: true
+    use_ema: false
+    scheduler_config:
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps:
+        - 10000
+        cycle_lengths:
+        - 10000000000000
+        f_start:
+        - 1.0e-06
+        f_max:
+        - 1.0
+        f_min:
+        - 1.0
+    unet_config:
+      target: ldm.modules.diffusionmodules.concatDiT.ConcatDiT2MLP
+      params:
+        in_channels: 20
+        context_dim: 1024
+        hidden_size: 576
+        num_heads: 8
+        depth: 4
+        max_len: 1000
+    first_stage_config:
+      target: ldm.models.autoencoder1d.AutoencoderKL
+      params:
+        embed_dim: 20
+        monitor: val/rec_loss
+        ckpt_path: logs/trainae/ckpt/epoch=000032.ckpt
+        ddconfig:
+          double_z: true
+          in_channels: 80
+          out_ch: 80
+          z_channels: 20
+          kernel_size: 5
+          ch: 384
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          num_res_blocks: 2
+          attn_layers:
+          - 3
+          down_layers:
+          - 0
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLAPFLANEmbedder
+      params:
+        weights_path: useful_ckpts/CLAP/CLAP_weights_2022.pth
+lightning:
+  callbacks:
+    image_logger:
+      target: main.AudioLogger
+      params:
+        sample_rate: 16000
+        for_specs: true
+        increase_log_steps: false
+        batch_frequency: 5000
+        max_images: 8
+        melvmin: -5
+        melvmax: 1.5
+        vocoder_cfg:
+          target: vocoder.bigvgan.models.VocoderBigVGAN
+          params:
+            ckpt_vocoder: vocoder/logs/bigvnat16k93.5w
+  trainer:
+    benchmark: True
+    gradient_clip_val: 1.0
+    replace_sampler_ddp: false
+  modelcheckpoint:
+    params:
+      monitor: epoch
+      mode: max
+      save_top_k: 10
+      every_n_epochs: 5
+data:
+  target: main.SpectrogramDataModuleFromConfig
+  params:
+    batch_size: 4
+    num_workers: 32
+    main_spec_dir_path: 'ldm/data/tsv_dirs/full_data/caps_struct'
+    other_spec_dir_path: 'ldm/data/tsv_dirs/full_data/V2'
+    mel_num: 80
+    train:
+      target: ldm.data.joinaudiodataset_struct_sample_anylen.JoinSpecsTrain
+      params:
+        specs_dataset_cfg:
+    validation:
+      target: ldm.data.joinaudiodataset_struct_sample_anylen.JoinSpecsValidation
+      params:
+        specs_dataset_cfg:
+test_dataset:
+  target: ldm.data.tsvdataset.TSVDatasetStruct
+  params:
+    tsv_path: musiccap.tsv
+    spec_crop_len: 624

infer.sh ADDED Viewed

	@@ -0,0 +1,4 @@

+CUDA_VISIBLE_DEVICES='1' python scripts/txt2audio_for_lcm.py  --n_samples 1 --n_iter 1 --scale 5 --H 20 --W 312 \
+--ddim_steps 2 -b configs/audiolcm.yaml \
+--sample_rate 16000 --vocoder-ckpt  ../vocoder/logs/bigvnat16k93.5w \
+--outdir results/test --test-dataset audiocaps  -r ../logs/2024-04-21T14-50-11_text2music-audioset-nonoverlap/epoch=000184.ckpt

infer_api.sh ADDED Viewed

	@@ -0,0 +1,4 @@

+CUDA_VISIBLE_DEVICES='1' python scripts/txt2audio_for_lcm.py  --n_samples 1 --n_iter 1 --scale 5 --H 20 --W 312 \
+--ddim_steps 2 -b configs/audiolcm.yaml \
+--sample_rate 16000 --vocoder-ckpt  ../vocoder/logs/bigvnat16k93.5w \
+--outdir results/test  -r ../logs/2024-04-21T14-50-11_text2music-audioset-nonoverlap/epoch=000184.ckpt --prompt_txt ./prompt.txt

ldm/__pycache__/lr_scheduler.cpython-37.pyc ADDED Viewed

Binary file (3.66 kB). View file

ldm/__pycache__/lr_scheduler.cpython-38.pyc ADDED Viewed

Binary file (3.61 kB). View file

ldm/__pycache__/util.cpython-310.pyc ADDED Viewed

Binary file (8.36 kB). View file

ldm/__pycache__/util.cpython-37.pyc ADDED Viewed

Binary file (5.1 kB). View file

ldm/__pycache__/util.cpython-38.pyc ADDED Viewed

Binary file (8.3 kB). View file

ldm/data/__pycache__/joinaudiodataset_624.cpython-38.pyc ADDED Viewed

Binary file (3.62 kB). View file

ldm/data/__pycache__/joinaudiodataset_anylen.cpython-37.pyc ADDED Viewed

Binary file (12.4 kB). View file

ldm/data/__pycache__/joinaudiodataset_anylen.cpython-38.pyc ADDED Viewed

Binary file (12.1 kB). View file

ldm/data/__pycache__/joinaudiodataset_struct.cpython-38.pyc ADDED Viewed

Binary file (3.69 kB). View file

ldm/data/__pycache__/joinaudiodataset_struct_anylen.cpython-38.pyc ADDED Viewed

Binary file (12.5 kB). View file

ldm/data/__pycache__/joinaudiodataset_struct_sample_anylen.cpython-37.pyc ADDED Viewed

Binary file (8.29 kB). View file

ldm/data/__pycache__/joinaudiodataset_struct_sample_anylen.cpython-38.pyc ADDED Viewed

Binary file (8.09 kB). View file

ldm/data/__pycache__/tsvdataset.cpython-38.pyc ADDED Viewed

Binary file (2.66 kB). View file

ldm/data/joinaudiodataset_624.py ADDED Viewed

	@@ -0,0 +1,93 @@

+import sys
+import numpy as np
+import torch
+import logging
+import pandas as pd
+import glob
+logger = logging.getLogger(f'main.{__name__}')
+sys.path.insert(0, '.')  # nopep8
+class JoinManifestSpecs(torch.utils.data.Dataset):
+    """Dataset of fixed-length mel spectrograms listed in TSV manifests.
+
+    Every ``*.tsv`` found recursively under the comma-separated directories of
+    ``spec_dir_path`` is concatenated into one table. The first 100 rows form the
+    validation split, the remaining rows the training split, and the test split
+    uses every row.
+    """
+    def __init__(self, split, spec_dir_path, mel_num=None, spec_crop_len=None,drop=0,**kwargs):
+        """Read the manifests and select the rows of ``split``.
+
+        Args:
+            split: 'train', 'valid'/'val' or 'test'.
+            spec_dir_path: comma-separated directories searched for manifests.
+            mel_num: number of mel bins of the fallback spectrogram.
+            spec_crop_len: number of frames of every returned spectrogram.
+            drop: probability of replacing the caption with an empty string.
+            **kwargs: accepted and ignored.
+
+        Raises:
+            ValueError: if ``split`` is not one of the names above.
+        """
+        super().__init__()
+        self.split = split
+        self.batch_max_length = spec_crop_len
+        self.batch_min_length = 50
+        self.mel_num = mel_num
+        self.drop = drop
+        manifest_files = []
+        for dir_path in spec_dir_path.split(','):
+            manifest_files += glob.glob(f'{dir_path}/**/*.tsv',recursive=True)
+        df = pd.concat([pd.read_csv(manifest,sep='\t') for manifest in manifest_files],ignore_index=True)
+        if split == 'train':
+            dataset = df.iloc[100:]
+        elif split in ('valid', 'val'):
+            dataset = df.iloc[:100]
+        elif split == 'test':
+            dataset = self.add_name_num(df)
+        else:
+            raise ValueError(f'Unknown split {split}')
+        # reset_index() builds a new frame, so the slice of `df` is never
+        # modified in place; the old row labels are kept in an 'index' column.
+        self.dataset = dataset.reset_index()
+        print('dataset len:', len(self.dataset))
+    def add_name_num(self,df):
+        """each file may have different caption, we add num to filename to identify each audio-caption pair
+
+        The n-th occurrence (counting from 0) of a name becomes ``f'{name}_{n}'``.
+        ``df`` is updated in place and also returned.
+        """
+        # cumcount() numbers the rows of every name group in order of appearance.
+        occurrence = df.groupby('name').cumcount().astype(str)
+        df['name'] = df['name'] + '_' + occurrence
+        return df
+    def __getitem__(self, idx):
+        """Return the ``idx``-th sample as a dict.
+
+        Keys: 'image' (spectrogram cut to ``batch_max_length`` frames),
+        'caption' (empty string with probability ``drop``) and, for the test
+        split only, 'f_name'.
+        """
+        data = self.dataset.iloc[idx]
+        item = {}
+        try:
+            spec = np.load(data['mel_path']) # mel spec [80, 624]
+            if spec.shape[1] == 0:
+                # An empty spectrogram cannot be tiled; treat it like a broken file.
+                raise ValueError('empty spectrogram')
+        except Exception:
+            # Best effort: an unreadable file is reported and replaced by zeros.
+            mel_path = data['mel_path']
+            print(f'corrupted:{mel_path}')
+            spec = np.zeros((self.mel_num,self.batch_max_length)).astype(np.float32)
+        if spec.shape[1] < self.batch_max_length:
+            # Short clips are repeated along the last axis until they are long enough.
+            spec = np.tile(spec,reps=(self.batch_max_length//spec.shape[1])+1)
+        item['image'] = spec[:,:self.batch_max_length]
+        # Random caption dropping.
+        if np.random.uniform(0,1) > self.drop:
+            item["caption"] = data['caption']
+        else:
+            item["caption"] = ""
+        if self.split == 'test':
+            item['f_name'] = data['name']
+        return item
+    def __len__(self):
+        """Number of rows in the selected split."""
+        return len(self.dataset)
+class JoinSpecsTrain(JoinManifestSpecs):
+    """Training split; ``specs_dataset_cfg`` is a mapping of JoinManifestSpecs keyword arguments."""
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('train', **specs_dataset_cfg)
+class JoinSpecsValidation(JoinManifestSpecs):
+    """Validation split; ``specs_dataset_cfg`` is a mapping of JoinManifestSpecs keyword arguments."""
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('valid', **specs_dataset_cfg)
+class JoinSpecsTest(JoinManifestSpecs):
+    """Test split; ``specs_dataset_cfg`` is a mapping of JoinManifestSpecs keyword arguments."""
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('test', **specs_dataset_cfg)

ldm/data/joinaudiodataset_anylen.py ADDED Viewed

	@@ -0,0 +1,331 @@

+import os
+import sys
+import math
+import numpy as np
+import torch
+from torch.utils.data.sampler import Sampler
+from torch.utils.data.distributed import DistributedSampler
+import torch.distributed
+from typing import TypeVar, Optional, Iterator,List
+import logging
+import pandas as pd
+import glob
+import torch.distributed as dist
+logger = logging.getLogger(f'main.{__name__}')
+sys.path.insert(0, '.')  # nopep8
+class JoinManifestSpecs(torch.utils.data.Dataset):
+    """Dataset of variable-length mel spectrograms listed in TSV manifests.
+
+    The ``*.tsv`` files directly inside the comma-separated directories of
+    ``spec_dir_path`` are concatenated into one table. The first 100 rows form
+    the validation split, the remaining rows the training split, and the test
+    split uses every row. Items keep their own length; ``collater`` brings a
+    batch to a common length by padding or tiling.
+    """
+    def __init__(self, split, spec_dir_path, mel_num=80,spec_crop_len=1248,mode='pad',pad_value=-5,drop=0,**kwargs):
+        """Read the manifests and select the rows of ``split``.
+
+        Args:
+            split: 'train', 'valid'/'val' or 'test'.
+            spec_dir_path: comma-separated directories searched for manifests.
+            mel_num: number of mel bins of the fallback spectrogram.
+            spec_crop_len: maximum batch length passed to the collate function.
+            mode: 'pad' or 'tile', the way ``collater`` equalises lengths.
+            pad_value: fill value for padding and for the fallback spectrogram.
+            drop: probability of replacing the caption with an empty string.
+            **kwargs: accepted and ignored.
+
+        Raises:
+            ValueError: if ``split`` is not one of the names above.
+        """
+        super().__init__()
+        self.split = split
+        self.max_batch_len = spec_crop_len
+        self.min_batch_len = 64
+        self.mel_num = mel_num
+        self.min_factor = 4
+        self.drop = drop
+        self.pad_value = pad_value
+        assert mode in ['pad','tile']
+        self.collate_mode = mode
+        manifest_files = []
+        for dir_path in spec_dir_path.split(','):
+            manifest_files += glob.glob(f'{dir_path}/*.tsv')
+        df = pd.concat([pd.read_csv(manifest,sep='\t') for manifest in manifest_files],ignore_index=True)
+        if split == 'train':
+            dataset = df.iloc[100:]
+        elif split in ('valid', 'val'):
+            dataset = df.iloc[:100]
+        elif split == 'test':
+            dataset = self.add_name_num(df)
+        else:
+            raise ValueError(f'Unknown split {split}')
+        # reset_index() builds a new frame, so the slice of `df` is never
+        # modified in place; the old row labels are kept in an 'index' column.
+        self.dataset = dataset.reset_index()
+        print('dataset len:', len(self.dataset))
+    def add_name_num(self,df):
+        """each file may have different caption, we add num to filename to identify each audio-caption pair
+
+        The n-th occurrence (counting from 0) of a name becomes ``f'{name}_{n}'``.
+        ``df`` is updated in place and also returned.
+        """
+        # cumcount() numbers the rows of every name group in order of appearance.
+        occurrence = df.groupby('name').cumcount().astype(str)
+        df['name'] = df['name'] + '_' + occurrence
+        return df
+    def ordered_indices(self):
+        """Return the row labels of the dataset sorted by the 'duration' column."""
+        by_duration = self.dataset[['duration']].sort_values(by='duration')
+        return list(by_duration.index)
+    def __getitem__(self, idx):
+        """Return the ``idx``-th sample as a dict.
+
+        Keys: 'image' (the spectrogram as loaded, not cropped), 'caption'
+        (empty string with probability ``drop``) and, for the test split only,
+        'f_name'.
+        """
+        item = {}
+        data = self.dataset.iloc[idx]
+        try:
+            spec = np.load(data['mel_path']) # mel spec [80, 624]
+        except Exception:
+            # Best effort: an unreadable file is reported and replaced by a
+            # constant spectrogram of the minimum batch length.
+            mel_path = data['mel_path']
+            print(f'corrupted:{mel_path}')
+            spec = np.ones((self.mel_num,self.min_batch_len)).astype(np.float32)*self.pad_value
+        item['image'] = spec
+        # Random caption dropping.
+        if np.random.uniform(0,1) > self.drop:
+            item["caption"] = data['caption']
+        else:
+            item["caption"] = ""
+        if self.split == 'test':
+            item['f_name'] = data['name']
+        return item
+    def collater(self,inputs):
+        """Merge a list of items into one batch dict.
+
+        Every key maps to the list of per-item values, except 'image', which is
+        collated into a single tensor according to ``collate_mode``.
+        """
+        to_dict = {}
+        for sample in inputs:
+            for k,v in sample.items():
+                to_dict.setdefault(k, []).append(v)
+        if self.collate_mode == 'pad':
+            to_dict['image'] = collate_1d_or_2d(to_dict['image'],pad_idx=self.pad_value,min_len = self.min_batch_len,max_len=self.max_batch_len,min_factor=self.min_factor)
+        elif self.collate_mode == 'tile':
+            to_dict['image'] = collate_1d_or_2d_tile(to_dict['image'],min_len = self.min_batch_len,max_len=self.max_batch_len,min_factor=self.min_factor)
+        else:
+            raise NotImplementedError
+        return to_dict
+    def __len__(self):
+        """Number of rows in the selected split."""
+        return len(self.dataset)
+class JoinSpecsTrain(JoinManifestSpecs):
+    """Training split; ``specs_dataset_cfg`` is a mapping of JoinManifestSpecs keyword arguments."""
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('train', **specs_dataset_cfg)
+class JoinSpecsValidation(JoinManifestSpecs):
+    """Validation split; ``specs_dataset_cfg`` is a mapping of JoinManifestSpecs keyword arguments."""
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('valid', **specs_dataset_cfg)
+class JoinSpecsTest(JoinManifestSpecs):
+    """Test split; ``specs_dataset_cfg`` is a mapping of JoinManifestSpecs keyword arguments."""
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('test', **specs_dataset_cfg)
+class JoinSpecsDebug(JoinManifestSpecs):
+    """Small debugging dataset: the validation split cut down to its first 37 rows."""
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('valid', **specs_dataset_cfg)
+        # Keep only the first 37 rows of the validation split.
+        self.dataset = self.dataset.iloc[:37]
+class DDPIndexBatchSampler(Sampler):
+    """Batch sampler that shards consecutive index batches across DDP ranks.
+
+    Groups the indices of similar-length audio into the same batch to avoid
+    excessive padding: ``indices`` are cut into consecutive batches of
+    ``batch_size``, so the grouping is as good as the ordering of ``indices``.
+    Rank ``r`` keeps batches ``r, r + num_replicas, ...``.
+    """
+    def __init__(self, indices ,batch_size, num_replicas: Optional[int] = None,
+                 rank: Optional[int] = None, shuffle: bool = True,
+                 seed: int = 0, drop_last: bool = False) -> None:
+        """Build the batches of this rank.
+
+        Args:
+            indices: dataset indices in the order they should be batched.
+            batch_size: number of indices per batch.
+            num_replicas: number of ranks; defaults to the world size, or 1 when
+                torch.distributed is not initialised.
+            rank: rank of this process; defaults to the current rank, or 0 when
+                torch.distributed is not initialised.
+            shuffle: shuffle the order of the batches (never their content).
+            seed: base seed of the per-epoch shuffle done by ``set_epoch``.
+            drop_last: drop a trailing incomplete batch and any batches that do
+                not divide evenly among the ranks.
+
+        Raises:
+            ValueError: if ``rank`` is outside ``[0, num_replicas - 1]``.
+        """
+        if num_replicas is None:
+            if not dist.is_initialized():
+                print("Not in distributed mode")
+                num_replicas = 1
+            else:
+                num_replicas = dist.get_world_size()
+        if rank is None:
+            rank = dist.get_rank() if dist.is_initialized() else 0
+        if rank >= num_replicas or rank < 0:
+            raise ValueError(
+                "Invalid rank {}, rank should be in the interval"
+                " [0, {}]".format(rank, num_replicas - 1))
+        self.indices = indices
+        self.num_replicas = num_replicas
+        self.rank = rank
+        self.epoch = 0
+        self.seed = seed
+        self.shuffle = shuffle
+        self.drop_last = drop_last
+        self.batch_size = batch_size
+        self.batches = self.build_batches()
+        print(f"rank: {self.rank}, batches_num {len(self.batches)}")
+        # If the number of batches is evenly divisible by replicas, then there
+        # is no need to drop any data, since the batches will be split equally.
+        if self.drop_last and len(self.batches) % self.num_replicas != 0:
+            self.batches = self.batches[:len(self.batches)//self.num_replicas*self.num_replicas]
+        if len(self.batches) > self.num_replicas:
+            self.batches = self.batches[self.rank::self.num_replicas]
+        else: # may happen in sanity checking
+            # Every rank gets the first batch; slicing keeps an empty list empty.
+            self.batches = self.batches[:1]
+        print(f"after split batches_num {len(self.batches)}")
+        if self.shuffle:
+            self.batches = self._permuted(np.random)
+    def _permuted(self, rng):
+        """Return the batches in an order drawn from ``rng``.
+
+        Positions are permuted instead of the batches themselves, so batches of
+        different length are fine and every batch stays a plain list.
+        """
+        order = rng.permutation(len(self.batches))
+        return [self.batches[i] for i in order]
+    def build_batches(self):
+        """Cut ``self.indices`` into consecutive lists of ``batch_size`` indices.
+
+        A trailing shorter batch is kept unless ``drop_last`` is set.
+        """
+        batches,batch = [],[]
+        for index in self.indices:
+            batch.append(index)
+            if len(batch) == self.batch_size:
+                batches.append(batch)
+                batch = []
+        if not self.drop_last and len(batch) > 0:
+            batches.append(batch)
+        return batches
+    def __iter__(self) -> Iterator[List[int]]:
+        """Yield the batches of this rank, one list of indices at a time."""
+        for batch in self.batches:
+            yield batch
+    def __len__(self) -> int:
+        """Number of batches of this rank."""
+        return len(self.batches)
+    def set_epoch(self, epoch: int) -> None:
+        r"""
+        Sets the epoch for this sampler. When :attr:`shuffle=True`, the batch order
+        is reshuffled with a generator seeded by ``seed + epoch``. Otherwise, the
+        next iteration of this sampler will yield the same ordering.
+        Args:
+            epoch (int): Epoch number.
+        """
+        self.epoch = epoch
+        if self.shuffle:
+            # A private generator leaves the global NumPy random state untouched.
+            self.batches = self._permuted(np.random.RandomState(self.seed + self.epoch))
+def collate_1d_or_2d(values, pad_idx=0, left_pad=False, shift_right=False,min_len = None, max_len=None,min_factor=None, shift_id=1):
+    """Pad-collate ``values``, choosing the 1d or 2d routine from the rank of the first element."""
+    first_is_1d = len(values[0].shape) == 1
+    if not first_is_1d:
+        # ``shift_id`` only exists for the 1d routine.
+        return collate_2d(values, pad_idx, left_pad, shift_right,min_len,max_len,min_factor)
+    return collate_1d(values, pad_idx, left_pad, shift_right,min_len, max_len,min_factor, shift_id)
+def collate_1d(values, pad_idx=0, left_pad=False, shift_right=False,min_len=None, max_len=None,min_factor=None, shift_id=1):
+    """Convert a list of 1d tensors into a padded 2d tensor.
+
+    The common length is the longest input, capped by ``max_len``, raised to at
+    least ``min_len`` and rounded up to a multiple of ``min_factor``. Inputs
+    longer than that are truncated, shorter ones are filled with ``pad_idx``.
+
+    Args:
+        values: list of 1d tensors.
+        pad_idx: fill value.
+        left_pad: put the padding in front of the data instead of behind it.
+        shift_right: shift each sequence one step right and start it with ``shift_id``.
+        min_len, max_len, min_factor: length constraints; falsy means "not set".
+        shift_id: first element of every shifted sequence.
+
+    Returns:
+        Tensor of shape (len(values), size) with the dtype of ``values[0]``.
+    """
+    size = max(v.size(0) for v in values)
+    if max_len:
+        size = min(size,max_len)
+    if min_len:
+        size = max(size,min_len)
+    if min_factor and (size % min_factor!=0):# size must be the multiple of min_factor
+        size += (min_factor - size % min_factor)
+    res = values[0].new(len(values), size).fill_(pad_idx)
+    for i, v in enumerate(values):
+        # max_len may make `size` shorter than this sequence, so cut it first.
+        v = v[:size]
+        n = v.size(0)
+        if n == 0:
+            continue  # nothing to copy, the row stays all padding
+        dst = res[i][size - n:] if left_pad else res[i][:n]
+        if shift_right:
+            dst[1:] = v[:-1]
+            dst[0] = shift_id
+        else:
+            dst.copy_(v)
+    return res
+def collate_2d(values, pad_idx=0, left_pad=False, shift_right=False, min_len=None,max_len=None,min_factor=None):
+    """Collate 2d for melspec,Convert a list of 2d tensors into a padded 3d tensor,pad in mel_length dimension.
+        values[0] shape: (melbins,mel_length)
+
+    The common length is the longest input, capped by ``max_len``, raised to at
+    least ``min_len`` and rounded up to a multiple of ``min_factor``. Inputs
+    longer than that are truncated, shorter ones are filled with ``pad_idx``.
+
+    Args:
+        values: list of 2d arrays/tensors, or one stacked ndarray.
+        pad_idx: fill value.
+        left_pad: put the padding in front of the data instead of behind it.
+        shift_right: copy with a one-row shift.
+            NOTE(review): the shift is applied along the first (mel-bin) axis,
+            not along time -- confirm this is intended.
+        min_len, max_len, min_factor: length constraints; falsy means "not set".
+
+    Returns:
+        float32 tensor of shape (len(values), values[0].shape[0], size).
+    """
+    size = max(v.shape[1] for v in values) # if max_len is None else max_len
+    if max_len:
+        size = min(size,max_len)
+    if min_len:
+        size = max(size,min_len)
+    if min_factor and (size % min_factor!=0):# size must be the multiple of min_factor
+        size += (min_factor - size % min_factor)
+    if isinstance(values,np.ndarray):
+        values = torch.FloatTensor(values)
+    if isinstance(values,list):
+        values = [torch.FloatTensor(v) for v in values]
+    res = torch.full((len(values), values[0].shape[0],size), pad_idx, dtype=torch.float32)
+    for i, v in enumerate(values):
+        # Truncate first, then size the destination from the truncated length,
+        # so left padding also works when max_len cuts the spectrogram.
+        v = v[:,:size]
+        n = v.shape[1]
+        dst = res[i][:,size - n:] if left_pad else res[i][:,:n]
+        assert dst.numel() == v.numel(), f"dst shape:{dst.shape} src shape:{v.shape}"
+        if shift_right:
+            dst[1:] = v[:-1]
+        else:
+            dst.copy_(v)
+    return res
+def collate_1d_or_2d_tile(values, shift_right=False,min_len = None, max_len=None,min_factor=None, shift_id=1):
+    """Tile-collate ``values``, choosing the 1d or 2d routine from the rank of the first element."""
+    first_is_1d = len(values[0].shape) == 1
+    if not first_is_1d:
+        # ``shift_id`` only exists for the 1d routine.
+        return collate_2d_tile(values, shift_right,min_len,max_len,min_factor)
+    return collate_1d_tile(values, shift_right,min_len, max_len,min_factor, shift_id)
+def collate_1d_tile(values, shift_right=False,min_len=None, max_len=None,min_factor=None,shift_id=1):
+    """Convert a list of 1d tensors into a 2d tensor by repeating short sequences.
+
+    The common length is the longest input, capped by ``max_len``, raised to at
+    least ``min_len`` and rounded up to a multiple of ``min_factor``. Every
+    sequence is repeated end to end and cut to that length.
+
+    Args:
+        values: list of non-empty 1d tensors.
+        shift_right: shift each row one step right and start it with ``shift_id``.
+        min_len, max_len, min_factor: length constraints; falsy means "not set".
+        shift_id: first element of every shifted row.
+
+    Returns:
+        Tensor of shape (len(values), size) with the dtype of ``values[0]``.
+    """
+    size = max(v.size(0) for v in values)
+    if max_len:
+        size = min(size,max_len)
+    if min_len:
+        size = max(size,min_len)
+    if min_factor and (size%min_factor!=0):# size must be the multiple of min_factor
+        size += (min_factor - size % min_factor)
+    # Uninitialised on purpose: every row is fully overwritten below.
+    res = values[0].new(len(values), size)
+    for i, v in enumerate(values):
+        n_repeat = math.ceil((size + 1) / v.shape[0])
+        # Tile along the only axis; dims=(1, n) would add a leading axis and the
+        # result could no longer be cut to `size`.
+        tiled = torch.tile(v,dims=(n_repeat,))[:size]
+        dst = res[i]
+        assert dst.numel() == tiled.numel(), f"dst shape:{dst.shape} src shape:{tiled.shape}"
+        if shift_right:
+            dst[1:] = tiled[:-1]
+            dst[0] = shift_id
+        else:
+            dst.copy_(tiled)
+    return res
+def collate_2d_tile(values, shift_right=False, min_len=None,max_len=None,min_factor=None):
+    """Collate mel spectrograms into one 3d float tensor, tiling along mel_length.
+
+    The target length is the longest input, capped by ``max_len``, raised to at
+    least ``min_len`` and rounded up to a multiple of ``min_factor``; each
+    spectrogram is repeated along its second axis and cut to that length.
+    """
+    target_len = max(v.shape[1] for v in values)
+    if max_len:
+        target_len = min(target_len,max_len)
+    if min_len:
+        target_len = max(target_len,min_len)
+    if min_factor:
+        remainder = target_len % min_factor
+        if remainder != 0:
+            # round up to the next multiple of min_factor
+            target_len += min_factor - remainder
+    if isinstance(values,np.ndarray):
+        values = torch.FloatTensor(values)
+    if isinstance(values,list):
+        values = [torch.FloatTensor(v) for v in values]
+    out = torch.zeros(len(values), values[0].shape[0],target_len).to(dtype=torch.float32)
+    for i, spec in enumerate(values):
+        repeats = math.ceil((target_len + 1) / spec.shape[1])
+        tiled = torch.tile(spec,dims=(1,repeats))[:,:target_len]
+        row = out[i]
+        assert row.numel() == tiled.numel()
+        if shift_right:
+            row[1:] = tiled[:-1]
+        else:
+            row.copy_(tiled)
+    return out

ldm/data/joinaudiodataset_struct.py ADDED Viewed

	@@ -0,0 +1,95 @@

+import sys
+import numpy as np
+import torch
+import logging
+import pandas as pd
+import glob
+logger = logging.getLogger(f'main.{__name__}')
+sys.path.insert(0, '.')  # nopep8
+class JoinManifestSpecs(torch.utils.data.Dataset):
+    def __init__(self, split, spec_dir_path, mel_num=None, spec_crop_len=None,drop=0,**kwargs):
+        super().__init__()
+        self.split = split
+        self.batch_max_length = spec_crop_len
+        self.batch_min_length = 50
+        self.drop = drop
+        self.mel_num = mel_num
+        manifest_files = []
+        for dir_path in spec_dir_path.split(','):
+            manifest_files += glob.glob(f'{dir_path}/*.tsv')
+        df_list = [pd.read_csv(manifest,sep='\t') for manifest in manifest_files]
+        df = pd.concat(df_list,ignore_index=True)
+        if split == 'train':
+            self.dataset = df.iloc[100:]
+        elif split == 'valid' or split == 'val':
+            self.dataset = df.iloc[:100]
+        elif split == 'test':
+            df = self.add_name_num(df)
+            self.dataset = df
+        else:
+            raise ValueError(f'Unknown split {split}')
+        self.dataset.reset_index(inplace=True)
+        print('dataset len:', len(self.dataset))
+    def add_name_num(self,df):
+        """each file may have different caption, we add num to filename to identify each audio-caption pair"""
+        name_count_dict = {}
+        change = []
+        for t in df.itertuples():
+            name = getattr(t,'name')
+            if name in name_count_dict:
+                name_count_dict[name] += 1
+            else:
+                name_count_dict[name] = 0
+            change.append((t[0],name_count_dict[name]))
+        for t in change:
+            df.loc[t[0],'name'] = df.loc[t[0],'name'] + f'_{t[1]}'
+        return df
+    def __getitem__(self, idx):
+        data = self.dataset.iloc[idx]
+        item = {}
+        try:
+            spec = np.load(data['mel_path']) # mel spec [80, 624]
+        except:
+            mel_path = data['mel_path']
+            print(f'corrupted:{mel_path}')
+            spec = np.zeros((self.mel_num,self.batch_max_length)).astype(np.float32)
+        if spec.shape[1] <= self.batch_max_length:
+            spec = np.pad(spec, ((0, 0), (0, self.batch_max_length - spec.shape[1]))) # [80, 624]
+        item['image'] = spec[:self.mel_num,:self.batch_max_length]
+        p = np.random.uniform(0,1)
+        if p > self.drop:
+            item["caption"] = {"ori_caption":data['ori_cap'],"struct_caption":data['caption']}
+        else:
+            item["caption"] = {"ori_caption":"","struct_caption":""}
+        if self.split == 'test':
+            item['f_name'] = data['name']
+        return item
+    def __len__(self):
+        return len(self.dataset)
+class JoinSpecsTrain(JoinManifestSpecs):
+    """JoinManifestSpecs bound to the 'train' split; the config mapping is expanded into keyword arguments."""
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('train', **specs_dataset_cfg)
+class JoinSpecsValidation(JoinManifestSpecs):
+    """JoinManifestSpecs bound to the 'valid' split; the config mapping is expanded into keyword arguments."""
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('valid', **specs_dataset_cfg)
+class JoinSpecsTest(JoinManifestSpecs):
+    """JoinManifestSpecs bound to the 'test' split; the config mapping is expanded into keyword arguments."""
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('test', **specs_dataset_cfg)

ldm/data/joinaudiodataset_struct_anylen.py ADDED Viewed

	@@ -0,0 +1,336 @@

+import os
+import sys
+import math
+import numpy as np
+import torch
+from torch.utils.data.sampler import Sampler
+from torch.utils.data.distributed import DistributedSampler
+import torch.distributed
+from typing import TypeVar, Optional, Iterator,List
+import logging
+import pandas as pd
+import glob
+import torch.distributed as dist
+logger = logging.getLogger(f'main.{__name__}')
+sys.path.insert(0, '.')  # nopep8
+class JoinManifestSpecs(torch.utils.data.Dataset):
+    def __init__(self, split, spec_dir_path, mel_num=80,spec_crop_len=1248,mode='pad',pad_value=-5,drop=0,**kwargs):
+        super().__init__()
+        self.split = split
+        self.max_batch_len = spec_crop_len
+        self.min_batch_len = 64
+        self.mel_num = mel_num
+        self.min_factor = 4
+        self.drop = drop
+        self.pad_value = pad_value
+        assert mode in ['pad','tile']
+        self.collate_mode = mode
+        # print(f"################# self.collate_mode {self.collate_mode} ##################")
+        manifest_files = []
+        for dir_path in spec_dir_path.split(','):
+            manifest_files += glob.glob(f'{dir_path}/*.tsv')
+        df_list = [pd.read_csv(manifest,sep='\t') for manifest in manifest_files]
+        df = pd.concat(df_list,ignore_index=True)
+        if split == 'train':
+            self.dataset = df.iloc[100:]
+        elif split == 'valid' or split == 'val':
+            self.dataset = df.iloc[:100]
+        elif split == 'test':
+            df = self.add_name_num(df)
+            self.dataset = df
+        else:
+            raise ValueError(f'Unknown split {split}')
+        self.dataset.reset_index(inplace=True)
+        print('dataset len:', len(self.dataset))
+    def add_name_num(self,df):
+        """each file may have different caption, we add num to filename to identify each audio-caption pair"""
+        name_count_dict = {}
+        change = []
+        for t in df.itertuples():
+            name = getattr(t,'name')
+            if name in name_count_dict:
+                name_count_dict[name] += 1
+            else:
+                name_count_dict[name] = 0
+            change.append((t[0],name_count_dict[name]))
+        for t in change:
+            df.loc[t[0],'name'] = df.loc[t[0],'name'] + f'_{t[1]}'
+        return df
+    def ordered_indices(self):
+        index2dur = self.dataset[['duration']]
+        index2dur = index2dur.sort_values(by='duration')
+        return list(index2dur.index)
+    def __getitem__(self, idx):
+        item = {}
+        data = self.dataset.iloc[idx]
+        try:
+            spec = np.load(data['mel_path']) # mel spec [80, 624]
+        except:
+            mel_path = data['mel_path']
+            print(f'corrupted:{mel_path}')
+            spec = np.ones((self.mel_num,self.min_batch_len)).astype(np.float32)*self.pad_value
+        item['image'] = spec
+        p = np.random.uniform(0,1)
+        if p > self.drop:
+            ori_caption = data['caption']
+            struct_caption = f'<{ori_caption}& all>'
+        else:
+            ori_caption = ""
+            struct_caption = ""
+        item["caption"] = {"ori_caption":ori_caption,"struct_caption":struct_caption}
+        if self.split == 'test':
+            item['f_name'] = data['name']
+        # item['f_name'] = data['mel_path']
+        return item
+    def collater(self,inputs):
+        to_dict = {}
+        for l in inputs:
+            for k,v in l.items():
+                if k in to_dict:
+                    to_dict[k].append(v)
+                else:
+                    to_dict[k] = [v]
+        if self.collate_mode == 'pad':
+            to_dict['image'] = collate_1d_or_2d(to_dict['image'],pad_idx=self.pad_value,min_len = self.min_batch_len,max_len=self.max_batch_len,min_factor=self.min_factor)
+        elif self.collate_mode == 'tile':
+            to_dict['image'] = collate_1d_or_2d_tile(to_dict['image'],min_len = self.min_batch_len,max_len=self.max_batch_len,min_factor=self.min_factor)
+        else:
+            raise NotImplementedError
+        to_dict['caption'] = {'ori_caption':[c['ori_caption'] for c in to_dict['caption']],
+                              'struct_caption':[c['struct_caption'] for c in to_dict['caption']]}
+        return to_dict
+    def __len__(self):
+        return len(self.dataset)
+class JoinSpecsTrain(JoinManifestSpecs):
+    """JoinManifestSpecs bound to the 'train' split; the config mapping is expanded into keyword arguments."""
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('train', **specs_dataset_cfg)
+class JoinSpecsValidation(JoinManifestSpecs):
+    """JoinManifestSpecs bound to the 'valid' split; the config mapping is expanded into keyword arguments."""
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('valid', **specs_dataset_cfg)
+class JoinSpecsTest(JoinManifestSpecs):
+    """JoinManifestSpecs bound to the 'test' split; the config mapping is expanded into keyword arguments."""
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('test', **specs_dataset_cfg)
+class JoinSpecsDebug(JoinManifestSpecs):
+    """Debug variant: the 'valid' split truncated to its first 37 rows."""
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('valid', **specs_dataset_cfg)
+        # keep only a handful of rows so a debug run finishes quickly
+        self.dataset = self.dataset.iloc[:37]
+class DDPIndexBatchSampler(Sampler):# 让长度相似的音频的indices合到��个batch中以避免过长的pad
+    def __init__(self, indices ,batch_size, num_replicas: Optional[int] = None,
+                 rank: Optional[int] = None, shuffle: bool = True,
+                 seed: int = 0, drop_last: bool = False) -> None:
+        if num_replicas is None:
+            if not dist.is_initialized():
+                # raise RuntimeError("Requires distributed package to be available")
+                print("Not in distributed mode")
+                num_replicas = 1
+            else:
+                num_replicas = dist.get_world_size()
+        if rank is None:
+            if not dist.is_initialized():
+                # raise RuntimeError("Requires distributed package to be available")
+                rank = 0
+            else:
+                rank = dist.get_rank()
+        if rank >= num_replicas or rank < 0:
+            raise ValueError(
+                "Invalid rank {}, rank should be in the interval"
+                " [0, {}]".format(rank, num_replicas - 1))
+        self.indices = indices
+        self.num_replicas = num_replicas
+        self.rank = rank
+        self.epoch = 0
+        self.drop_last = drop_last
+        self.batch_size = batch_size
+        self.batches = self.build_batches()
+        print(f"rank: {self.rank}, batches_num {len(self.batches)}")
+        # If the dataset length is evenly divisible by replicas, then there
+        # is no need to drop any data, since the dataset will be split equally.
+        if self.drop_last and len(self.batches) % self.num_replicas != 0:
+            self.batches = self.batches[:len(self.batches)//self.num_replicas*self.num_replicas]
+        if len(self.batches) > self.num_replicas:
+            self.batches = self.batches[self.rank::self.num_replicas]
+        else: # may happen in sanity checking
+            self.batches = [self.batches[0]]
+        print(f"after split batches_num {len(self.batches)}")
+        self.shuffle = shuffle
+        if self.shuffle:
+            self.batches = np.random.permutation(self.batches)
+        self.seed = seed
+    def set_epoch(self,epoch):
+        self.epoch = epoch
+        if self.shuffle:
+            np.random.seed(self.seed+self.epoch)
+            self.batches = np.random.permutation(self.batches)
+    def build_batches(self):
+        batches,batch = [],[]
+        for index in self.indices:
+            batch.append(index)
+            if len(batch) == self.batch_size:
+                batches.append(batch)
+                batch = []
+        if not self.drop_last and len(batch) > 0:
+            batches.append(batch)
+        return batches
+    def __iter__(self) -> Iterator[List[int]]:
+        for batch in self.batches:
+            yield batch
+    def __len__(self) -> int:
+        return len(self.batches)
+    def set_epoch(self, epoch: int) -> None:
+        r"""
+        Sets the epoch for this sampler. When :attr:`shuffle=True`, this ensures all replicas
+        use a different random ordering for each epoch. Otherwise, the next iteration of this
+        sampler will yield the same ordering.
+        Args:
+            epoch (int): Epoch number.
+        """
+        self.epoch = epoch
+def collate_1d_or_2d(values, pad_idx=0, left_pad=False, shift_right=False,min_len = None, max_len=None,min_factor=None, shift_id=1):
+    if len(values[0].shape) == 1:
+        return collate_1d(values, pad_idx, left_pad, shift_right,min_len, max_len,min_factor, shift_id)
+    else:
+        return collate_2d(values, pad_idx, left_pad, shift_right,min_len,max_len,min_factor)
+def collate_1d(values, pad_idx=0, left_pad=False, shift_right=False,min_len=None, max_len=None,min_factor=None, shift_id=1):
+    """Convert a list of 1d tensors into a padded 2d tensor."""
+    size = max(v.size(0) for v in values)
+    if max_len:
+        size = min(size,max_len)
+    if min_len:
+        size = max(size,min_len)
+    if min_factor and (size % min_factor!=0):# size must be the multiple of min_factor
+        size += (min_factor - size % min_factor)
+    res = values[0].new(len(values), size).fill_(pad_idx)
+    def copy_tensor(src, dst):
+        assert dst.numel() == src.numel(), f"dst shape:{dst.shape} src shape:{src.shape}"
+        if shift_right:
+            dst[1:] = src[:-1]
+            dst[0] = shift_id
+        else:
+            dst.copy_(src)
+    for i, v in enumerate(values):
+        copy_tensor(v, res[i][size - len(v):] if left_pad else res[i][:len(v)])
+    return res
+def collate_2d(values, pad_idx=0, left_pad=False, shift_right=False, min_len=None,max_len=None,min_factor=None):
+    """Collate 2d for melspec,Convert a list of 2d tensors into a padded 3d tensor,pad in mel_length dimension.
+        values[0] shape: (melbins,mel_length)
+    """
+    size = max(v.shape[1] for v in values) # if max_len is None else max_len
+    if max_len:
+        size = min(size,max_len)
+    if min_len:
+        size = max(size,min_len)
+    if min_factor and (size % min_factor!=0):# size must be the multiple of min_factor
+        size += (min_factor - size % min_factor)
+    if isinstance(values,np.ndarray):
+        values = torch.FloatTensor(values)
+    if isinstance(values,list):
+        values = [torch.FloatTensor(v) for v in values]
+    res = torch.ones(len(values), values[0].shape[0],size).to(dtype=torch.float32)*pad_idx
+    def copy_tensor(src, dst):
+        assert dst.numel() == src.numel(), f"dst shape:{dst.shape} src shape:{src.shape}"
+        if shift_right:
+            dst[1:] = src[:-1]
+        else:
+            dst.copy_(src)
+    for i, v in enumerate(values):
+        copy_tensor(v[:,:size], res[i][:,size - v.shape[1]:] if left_pad else res[i][:,:v.shape[1]])
+    return res
+def collate_1d_or_2d_tile(values, shift_right=False,min_len = None, max_len=None,min_factor=None, shift_id=1):
+    if len(values[0].shape) == 1:
+        return collate_1d_tile(values, shift_right,min_len, max_len,min_factor, shift_id)
+    else:
+        return collate_2d_tile(values, shift_right,min_len,max_len,min_factor)
+def collate_1d_tile(values, shift_right=False,min_len=None, max_len=None,min_factor=None,shift_id=1):
+    """Convert a list of 1d tensors into a padded 2d tensor."""
+    size = max(v.size(0) for v in values)
+    if max_len:
+        size = min(size,max_len)
+    if min_len:
+        size = max(size,min_len)
+    if min_factor and (size%min_factor!=0):# size must be the multiple of min_factor
+        size += (min_factor - size % min_factor)
+    res = values[0].new(len(values), size)
+    def copy_tensor(src, dst):
+        assert dst.numel() == src.numel(), f"dst shape:{dst.shape} src shape:{src.shape}"
+        if shift_right:
+            dst[1:] = src[:-1]
+            dst[0] = shift_id
+        else:
+            dst.copy_(src)
+    for i, v in enumerate(values):
+        n_repeat = math.ceil((size + 1) / v.shape[0])
+        v = torch.tile(v,dims=(1,n_repeat))[:size]
+        copy_tensor(v, res[i])
+    return res
+def collate_2d_tile(values, shift_right=False, min_len=None,max_len=None,min_factor=None):
+    """Collate 2d for melspec,Convert a list of 2d tensors into a padded 3d tensor,pad in mel_length dimension. """
+    size = max(v.shape[1] for v in values) # if max_len is None else max_len
+    if max_len:
+        size = min(size,max_len)
+    if min_len:
+        size = max(size,min_len)
+    if min_factor and (size % min_factor!=0):# size must be the multiple of min_factor
+        size += (min_factor - size % min_factor)
+    if isinstance(values,np.ndarray):
+        values = torch.FloatTensor(values)
+    if isinstance(values,list):
+        values = [torch.FloatTensor(v) for v in values]
+    res = torch.zeros(len(values), values[0].shape[0],size).to(dtype=torch.float32)
+    def copy_tensor(src, dst):
+        assert dst.numel() == src.numel()
+        if shift_right:
+            dst[1:] = src[:-1]
+        else:
+            dst.copy_(src)
+    for i, v in enumerate(values):
+        n_repeat = math.ceil((size + 1) / v.shape[1])
+        v = torch.tile(v,dims=(1,n_repeat))[:,:size]
+        copy_tensor(v, res[i])
+    return res

ldm/data/joinaudiodataset_struct_sample.py ADDED Viewed

	@@ -0,0 +1,103 @@

+import sys
+import numpy as np
+import torch
+import logging
+import pandas as pd
+import glob
+logger = logging.getLogger(f'main.{__name__}')
+sys.path.insert(0, '.')  # nopep8
+class JoinManifestSpecs(torch.utils.data.Dataset):
+    def __init__(self, split, main_spec_dir_path,other_spec_dir_path, mel_num=None, spec_crop_len=None,pad_value=-5,**kwargs):
+        super().__init__()
+        self.main_prob = 0.5
+        self.split = split
+        self.batch_max_length = spec_crop_len
+        self.batch_min_length = 50
+        self.mel_num = mel_num
+        self.pad_value = pad_value
+        manifest_files = []
+        for dir_path in main_spec_dir_path.split(','):
+            manifest_files += glob.glob(f'{dir_path}/*.tsv')
+        df_list = [pd.read_csv(manifest,sep='\t') for manifest in manifest_files]
+        self.df_main = pd.concat(df_list,ignore_index=True)
+        manifest_files = []
+        for dir_path in other_spec_dir_path.split(','):
+            manifest_files += glob.glob(f'{dir_path}/*.tsv')
+        df_list = [pd.read_csv(manifest,sep='\t') for manifest in manifest_files]
+        self.df_other = pd.concat(df_list,ignore_index=True)
+        if split == 'train':
+            self.dataset = self.df_main.iloc[100:]
+        elif split == 'valid' or split == 'val':
+            self.dataset = self.df_main.iloc[:100]
+        elif split == 'test':
+            self.df_main = self.add_name_num(self.df_main)
+            self.dataset = self.df_main
+        else:
+            raise ValueError(f'Unknown split {split}')
+        self.dataset.reset_index(inplace=True)
+        print('dataset len:', len(self.dataset))
+    def add_name_num(self,df):
+        """each file may have different caption, we add num to filename to identify each audio-caption pair"""
+        name_count_dict = {}
+        change = []
+        for t in df.itertuples():
+            name = getattr(t,'name')
+            if name in name_count_dict:
+                name_count_dict[name] += 1
+            else:
+                name_count_dict[name] = 0
+            change.append((t[0],name_count_dict[name]))
+        for t in change:
+            df.loc[t[0],'name'] = df.loc[t[0],'name'] + f'_{t[1]}'
+        return df
+    def __getitem__(self, idx):
+        if np.random.uniform(0,1) < self.main_prob:
+            data = self.dataset.iloc[idx]
+            ori_caption = data['ori_cap']
+            struct_caption = data['caption']
+        else:
+            randidx = np.random.randint(0,len(self.df_other))
+            data = self.df_other.iloc[randidx]
+            ori_caption = data['caption']
+            struct_caption = f'<{ori_caption}, all>'
+        item = {}
+        try:
+            spec = np.load(data['mel_path']) # mel spec [80, 624]
+        except:
+            mel_path = data['mel_path']
+            print(f'corrupted:{mel_path}')
+            spec = np.ones((self.mel_num,self.batch_max_length)).astype(np.float32)*self.pad_value
+        if spec.shape[1] <= self.batch_max_length:
+            spec = np.pad(spec, ((0, 0), (0, self.batch_max_length - spec.shape[1])),mode='constant',constant_values = (self.pad_value,self.pad_value)) # [80, 624]
+        item['image'] = spec[:self.mel_num,:self.batch_max_length]
+        item["caption"] = {"ori_caption":ori_caption,"struct_caption":struct_caption}
+        if self.split == 'test':
+            item['f_name'] = data['name']
+        return item
+    def __len__(self):
+        return len(self.dataset)
+class JoinSpecsTrain(JoinManifestSpecs):
+    """JoinManifestSpecs bound to the 'train' split; the config mapping is expanded into keyword arguments."""
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('train', **specs_dataset_cfg)
+class JoinSpecsValidation(JoinManifestSpecs):
+    """JoinManifestSpecs bound to the 'valid' split; the config mapping is expanded into keyword arguments."""
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('valid', **specs_dataset_cfg)
+class JoinSpecsTest(JoinManifestSpecs):
+    """JoinManifestSpecs bound to the 'test' split; the config mapping is expanded into keyword arguments."""
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('test', **specs_dataset_cfg)

ldm/data/joinaudiodataset_struct_sample_anylen.py ADDED Viewed

	@@ -0,0 +1,230 @@

+import sys
+import numpy as np
+import torch
+from typing import TypeVar, Optional, Iterator
+import logging
+import pandas as pd
+from ldm.data.joinaudiodataset_anylen import *
+import glob
+logger = logging.getLogger(f'main.{__name__}')
+sys.path.insert(0, '.')  # nopep8
+class JoinManifestSpecs(torch.utils.data.Dataset):
+    def __init__(self, split, main_spec_dir_path,other_spec_dir_path, mel_num=80,mode='pad', spec_crop_len=1248,pad_value=-5,drop=0,**kwargs):
+        super().__init__()
+        self.split = split
+        self.max_batch_len = spec_crop_len
+        self.min_batch_len = 64
+        self.min_factor = 4
+        self.mel_num = mel_num
+        self.drop = drop
+        self.pad_value = pad_value
+        assert mode in ['pad','tile']
+        self.collate_mode = mode
+        manifest_files = []
+        for dir_path in main_spec_dir_path.split(','):
+            manifest_files += glob.glob(f'{dir_path}/*.tsv')
+        df_list = [pd.read_csv(manifest,sep='\t') for manifest in manifest_files]
+        self.df_main = pd.concat(df_list,ignore_index=True)
+        manifest_files = []
+        for dir_path in other_spec_dir_path.split(','):
+            manifest_files += glob.glob(f'{dir_path}/*.tsv')
+        df_list = [pd.read_csv(manifest,sep='\t') for manifest in manifest_files]
+        # import ipdb
+        # ipdb.set_trace()
+        self.df_other = pd.concat(df_list,ignore_index=True)
+        self.df_other.reset_index(inplace=True)
+        if split == 'train':
+            self.dataset = self.df_main.iloc[100:]
+        elif split == 'valid' or split == 'val':
+            self.dataset = self.df_main.iloc[:100]
+        elif split == 'test':
+            self.df_main = self.add_name_num(self.df_main)
+            self.dataset = self.df_main
+        else:
+            raise ValueError(f'Unknown split {split}')
+        self.dataset.reset_index(inplace=True)
+        print('dataset len:', len(self.dataset),"drop_rate",self.drop)
+    def add_name_num(self,df):
+        """each file may have different caption, we add num to filename to identify each audio-caption pair"""
+        name_count_dict = {}
+        change = []
+        for t in df.itertuples():
+            name = getattr(t,'name')
+            if name in name_count_dict:
+                name_count_dict[name] += 1
+            else:
+                name_count_dict[name] = 0
+            change.append((t[0],name_count_dict[name]))
+        for t in change:
+            df.loc[t[0],'name'] = str(df.loc[t[0],'name']) + f'_{t[1]}'
+        return df
+    def ordered_indices(self):
+        index2dur = self.dataset[['duration']].sort_values(by='duration')
+        index2dur_other = self.df_other[['duration']].sort_values(by='duration')
+        other_indices = list(index2dur_other.index)
+        offset = len(self.dataset)
+        other_indices = [x + offset for x in other_indices]
+        return list(index2dur.index),other_indices
+        # return list(index2dur.index)
+    def collater(self,inputs):
+        to_dict = {}
+        for l in inputs:
+            for k,v in l.items():
+                if k in to_dict:
+                    to_dict[k].append(v)
+                else:
+                    to_dict[k] = [v]
+        if self.collate_mode == 'pad':
+            to_dict['image'] = collate_1d_or_2d(to_dict['image'],pad_idx=self.pad_value,min_len = self.min_batch_len,max_len=self.max_batch_len,min_factor=self.min_factor)
+        elif self.collate_mode == 'tile':
+            to_dict['image'] = collate_1d_or_2d_tile(to_dict['image'],min_len = self.min_batch_len,max_len=self.max_batch_len,min_factor=self.min_factor)
+        else:
+            raise NotImplementedError
+        to_dict['caption'] = {'ori_caption':[c['ori_caption'] for c in to_dict['caption']],
+                              'struct_caption':[c['struct_caption'] for c in to_dict['caption']]}
+        return to_dict
+    def __getitem__(self, idx):
+        if idx < len(self.dataset):
+            data = self.dataset.iloc[idx]
+        # p = np.random.uniform(0,1)
+        # if p > self.drop:
+            ori_caption = data['ori_cap']
+            struct_caption = data['caption']
+        # else:
+        #     ori_caption = ""
+        #     struct_caption = ""
+        else:
+            data = self.df_other.iloc[idx-len(self.dataset)]
+            # p = np.random.uniform(0,1)
+            # if p > self.drop:
+            ori_caption = data['caption']
+            struct_caption = f'<{ori_caption}& all>'
+            # else:
+            #     ori_caption = ""
+            #     struct_caption = ""
+        item = {}
+        try:
+            spec = np.load(data['mel_path']) # mel spec [80, T]
+            if spec.shape[1] > self.max_batch_len:
+                spec = spec[:,:self.max_batch_len]
+        except:
+            mel_path = data['mel_path']
+            print(f'corrupted:{mel_path}')
+            spec = np.ones((self.mel_num,self.min_batch_len)).astype(np.float32)*self.pad_value
+        item['image'] = spec
+        item["caption"] = {"ori_caption":ori_caption,"struct_caption":struct_caption}
+        if self.split == 'test':
+            item['f_name'] = data['name']
+        return item
+    def __len__(self):
+        return len(self.dataset) + len(self.df_other)
+        # return len(self.dataset)
+class JoinSpecsTrain(JoinManifestSpecs):
+    """JoinManifestSpecs bound to the 'train' split; the config mapping is expanded into keyword arguments."""
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('train', **specs_dataset_cfg)
+class JoinSpecsValidation(JoinManifestSpecs):
+    """JoinManifestSpecs bound to the 'valid' split; the config mapping is expanded into keyword arguments."""
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('valid', **specs_dataset_cfg)
+class JoinSpecsTest(JoinManifestSpecs):
+    """JoinManifestSpecs bound to the 'test' split; the config mapping is expanded into keyword arguments."""
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('test', **specs_dataset_cfg)
+class DDPIndexBatchSampler(Sampler):# 让长度相似的音频的indices合到一个batch中以避免过长的pad
+    def __init__(self, main_indices,other_indices,batch_size, num_replicas: Optional[int] = None,
+    # def __init__(self, main_indices,batch_size, num_replicas: Optional[int] = None,
+                 rank: Optional[int] = None, shuffle: bool = True,
+                 seed: int = 0, drop_last: bool = False) -> None:
+        if num_replicas is None:
+            if not dist.is_initialized():
+                # raise RuntimeError("Requires distributed package to be available")
+                print("Not in distributed mode")
+                num_replicas = 1
+            else:
+                num_replicas = dist.get_world_size()
+        if rank is None:
+            if not dist.is_initialized():
+                # raise RuntimeError("Requires distributed package to be available")
+                rank = 0
+            else:
+                rank = dist.get_rank()
+        if rank >= num_replicas or rank < 0:
+            raise ValueError(
+                "Invalid rank {}, rank should be in the interval"
+                " [0, {}]".format(rank, num_replicas - 1))
+        self.main_indices = main_indices
+        self.other_indices = other_indices
+        self.max_index = max(self.other_indices)
+        self.num_replicas = num_replicas
+        self.rank = rank
+        self.epoch = 0
+        self.drop_last = drop_last
+        self.batch_size = batch_size
+        self.shuffle = shuffle
+        self.batches = self.build_batches()
+        self.seed = seed
+    def set_epoch(self,epoch):
+        # print("!!!!!!!!!!!set epoch is called!!!!!!!!!!!!!!")
+        self.epoch = epoch
+        if self.shuffle:
+            np.random.seed(self.seed+self.epoch)
+            self.batches = self.build_batches()
+    def build_batches(self):
+        batches,batch = [],[]
+        for index in self.main_indices:
+            batch.append(index)
+            if len(batch) == self.batch_size:
+                batches.append(batch)
+                batch = []
+        if not self.drop_last and len(batch) > 0:
+            batches.append(batch)
+        selected_others = np.random.choice(len(self.other_indices),len(batches),replace=False)
+        for index in selected_others:
+            if index + self.batch_size > len(self.other_indices):
+                index = len(self.other_indices) - self.batch_size
+            batch = [self.other_indices[index + i] for i in range(self.batch_size)]
+            batches.append(batch)
+        self.batches = batches
+        if self.shuffle:
+            self.batches = np.random.permutation(self.batches)
+        if self.rank == 0:
+            print(f"rank: {self.rank}, batches_num {len(self.batches)}")
+        if self.drop_last and len(self.batches) % self.num_replicas != 0:
+            self.batches = self.batches[:len(self.batches)//self.num_replicas*self.num_replicas]
+        if len(self.batches) >= self.num_replicas:
+            self.batches = self.batches[self.rank::self.num_replicas]
+        else: # may happen in sanity checking
+            self.batches = [self.batches[0]]
+        if self.rank == 0:
+            print(f"after split batches_num {len(self.batches)}")
+        return self.batches
+    def __iter__(self) -> Iterator[List[int]]:
+        print(f"len(self.batches):{len(self.batches)}")
+        for batch in self.batches:
+            yield batch
+    def __len__(self) -> int:
+        return len(self.batches)

ldm/data/preprocess/NAT_mel.py ADDED Viewed

	@@ -0,0 +1,131 @@

+import numpy as np
+import torch
+import torch.utils.data
+from librosa.filters import mel as librosa_mel_fn
+from scipy.io.wavfile import read
+import torch
+import torch.nn as nn
+MAX_WAV_VALUE = 32768.0
+def load_wav(full_path):
+    sampling_rate, data = read(full_path)
+    return data, sampling_rate
+def dynamic_range_compression(x, C=1, clip_val=1e-5):
+    return np.log10(np.clip(x, a_min=clip_val, a_max=None) * C)
+def dynamic_range_decompression(x, C=1):
+    return np.exp(x) / C
+def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
+    return torch.log10(torch.clamp(x, min=clip_val) * C)
+def dynamic_range_decompression_torch(x, C=1):
+    return torch.exp(x) / C
+def spectral_normalize_torch(magnitudes):
+    output = dynamic_range_compression_torch(magnitudes)
+    return output
+def spectral_de_normalize_torch(magnitudes):
+    output = dynamic_range_decompression_torch(magnitudes)
+    return output
+class MelNet(nn.Module):
+    def __init__(self,hparams,device='cpu') -> None:
+        super().__init__()
+        self.n_fft = hparams['fft_size']
+        self.num_mels = hparams['audio_num_mel_bins']
+        self.sampling_rate = hparams['audio_sample_rate']
+        self.hop_size = hparams['hop_size']
+        self.win_size = hparams['win_size']
+        self.fmin = hparams['fmin']
+        self.fmax = hparams['fmax']
+        self.device = device
+        mel = librosa_mel_fn(self.sampling_rate, self.n_fft, self.num_mels, self.fmin, self.fmax)
+        self.mel_basis = torch.from_numpy(mel).float().to(self.device)
+        self.hann_window = torch.hann_window(self.win_size).to(self.device)
+    def to(self,device,**kwagrs):
+        super().to(device=device,**kwagrs)
+        self.mel_basis = self.mel_basis.to(device)
+        self.hann_window = self.hann_window.to(device)
+        self.device = device
+    def forward(self,y,center=False, complex=False):
+        if isinstance(y,np.ndarray):
+            y = torch.FloatTensor(y)
+            if len(y.shape) == 1:
+                y = y.unsqueeze(0)
+        y = y.clamp(min=-1., max=1.).to(self.device)
+        y = torch.nn.functional.pad(y.unsqueeze(1), [int((self.n_fft - self.hop_size) / 2), int((self.n_fft - self.hop_size) / 2)],
+                                    mode='reflect')
+        y = y.squeeze(1)
+        spec = torch.stft(y, self.n_fft, hop_length=self.hop_size, win_length=self.win_size, window=self.hann_window,
+                        center=center, pad_mode='reflect', normalized=False, onesided=True,return_complex=complex)
+        if not complex:
+            spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
+            spec = torch.matmul(self.mel_basis, spec)
+            spec = spectral_normalize_torch(spec)
+        else:
+            B, C, T, _ = spec.shape
+            spec = spec.transpose(1, 2)  # [B, T, n_fft, 2]
+        return spec
+## below can be used in one gpu, but not ddp
+mel_basis = {}
+hann_window = {}
+def mel_spectrogram(y, hparams, center=False, complex=False): # y should be a tensor with shape (b,wav_len)
+    # hop_size: 512  # For 22050Hz, 275 ~= 12.5 ms (0.0125 * sample_rate)
+    # win_size: 2048  # For 22050Hz, 1100 ~= 50 ms (If None, win_size: fft_size) (0.05 * sample_rate)
+    # fmin: 55  # Set this to 55 if your speaker is male! if female, 95 should help taking off noise. (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525])
+    # fmax: 10000  # To be increased/reduced depending on data.
+    # fft_size: 2048  # Extra window size is filled with 0 paddings to match this parameter
+    # n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax,
+    n_fft = hparams['fft_size']
+    num_mels = hparams['audio_num_mel_bins']
+    sampling_rate = hparams['audio_sample_rate']
+    hop_size = hparams['hop_size']
+    win_size = hparams['win_size']
+    fmin = hparams['fmin']
+    fmax = hparams['fmax']
+    if isinstance(y,np.ndarray):
+        y = torch.FloatTensor(y)
+        if len(y.shape) == 1:
+            y = y.unsqueeze(0)
+    y = y.clamp(min=-1., max=1.)
+    global mel_basis, hann_window
+    if fmax not in mel_basis:
+        mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
+        mel_basis[str(fmax) + '_' + str(y.device)] = torch.from_numpy(mel).float().to(y.device)
+        hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)
+    y = torch.nn.functional.pad(y.unsqueeze(1), [int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)],
+                                mode='reflect')
+    y = y.squeeze(1)
+    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)],
+                      center=center, pad_mode='reflect', normalized=False, onesided=True,return_complex=complex)
+    if not complex:
+        spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
+        spec = torch.matmul(mel_basis[str(fmax) + '_' + str(y.device)], spec)
+        spec = spectral_normalize_torch(spec)
+    else:
+        B, C, T, _ = spec.shape
+        spec = spec.transpose(1, 2)  # [B, T, n_fft, 2]
+    return spec

ldm/data/preprocess/__pycache__/NAT_mel.cpython-38.pyc ADDED Viewed

Binary file (4.25 kB). View file

ldm/data/preprocess/__pycache__/NAT_mel.cpython-39.pyc ADDED Viewed

Binary file (4.23 kB). View file

ldm/data/preprocess/add_duration.py ADDED Viewed

	@@ -0,0 +1,45 @@

+import pandas as pd
+import audioread
+from tqdm import tqdm
+from tqdm.contrib.concurrent import process_map
+def map_duration(tsv_withdur,tsv_toadd):# tsv_withdur 和 tsv_toadd 'name'列相同且tsv_withdur有duration信息，目标是给tsv_toadd的相同行加上duration信息。
+    df1 = pd.read_csv(tsv_withdur,sep='\t')
+    df2 = pd.read_csv(tsv_toadd,sep='\t')
+    df = df2.merge(df1,on=['name'],suffixes=['','_y'])
+    dropset = list(set(df.columns) - set(df1.columns))
+    df = df.drop(dropset,axis=1)
+    df.to_csv(tsv_toadd,sep='\t',index=False)
+    return df
+def add_duration(args):
+    index,audiopath = args
+    try:
+        with audioread.audio_open(audiopath) as f:
+            totalsec = f.duration
+    except:
+        totalsec = -1
+    return (index,totalsec)
+def add_dur2tsv(tsv_path,save_path):
+    df = pd.read_csv(tsv_path,sep='\t')
+    item_list = []
+    for item in tqdm(df.itertuples()):
+        item_list.append((item[0],getattr(item,'audio_path')))
+    r = process_map(add_duration,item_list,max_workers=16,chunksize=32)
+    index2dur = {}
+    for index,dur in r:
+        if dur == -1:
+            bad_wav  = df.loc[index,'audio_path']
+            print(f'bad wav:{bad_wav}')
+        index2dur[index] = dur
+    df['duration'] = df.index.map(index2dur)
+    df.to_csv(save_path,sep='\t',index=False)
+if __name__ == '__main__':
+    # Script entry point: annotate one manifest with durations.
+    # NOTE(review): both paths are hard-coded to a specific machine; edit before running elsewhere.
+    add_dur2tsv('/root/autodl-tmp/liuhuadai/AudioLCM/now.tsv','/root/autodl-tmp/liuhuadai/AudioLCM/now_duration.tsv')
+    # Alternative usage: copy durations from an already-annotated TSV.
+    #map_duration(tsv_withdur='tsv_maker/filter_audioset.tsv',
+    #              tsv_toadd='MAA1 Dataset tsvs/V3/refilter_audioset.tsv')

ldm/data/preprocess/mel_spec.py ADDED Viewed

	@@ -0,0 +1,201 @@

+from ldm.data.preprocess.NAT_mel import MelNet
+import os
+from tqdm import tqdm
+from glob import glob
+import math
+import pandas as pd
+import logging
+import math
+import audioread
+from tqdm.contrib.concurrent import process_map
+import torch
+import torch.nn as nn
+import torchaudio
+import numpy as np
+from torch.distributed import init_process_group
+from torch.utils.data import Dataset,DataLoader,DistributedSampler
+import torch.multiprocessing as mp
+from argparse import Namespace
+from multiprocessing import Pool
+import json
+class tsv_dataset(Dataset):
+    def __init__(self,tsv_path,sr,mode='none',hop_size = None,target_mel_length = None) -> None:
+        super().__init__()
+        if os.path.isdir(tsv_path):
+            files = glob(os.path.join(tsv_path,'*.tsv'))
+            df = pd.concat([pd.read_csv(file,sep='\t') for file in files])
+        else:
+            df = pd.read_csv(tsv_path,sep='\t')
+        self.audio_paths = []
+        self.sr = sr
+        self.mode = mode
+        self.target_mel_length = target_mel_length
+        self.hop_size = hop_size
+        for t in tqdm(df.itertuples()):
+            self.audio_paths.append(getattr(t,'audio_path'))
+    def __len__(self):
+        return len(self.audio_paths)
+    def pad_wav(self,wav):
+        # wav should be in shape(1,wav_len)
+        wav_length = wav.shape[-1]
+        assert wav_length > 100, "wav is too short, %s" % wav_length
+        segment_length = (self.target_mel_length + 1) * self.hop_size  # final mel will crop the last mel, mel = mel[:,:-1]
+        if segment_length is None or wav_length == segment_length:
+            return wav
+        elif wav_length > segment_length:
+            return wav[:,:segment_length]
+        elif wav_length < segment_length:
+            temp_wav = torch.zeros((1, segment_length),dtype=torch.float32)
+            temp_wav[:, :wav_length] = wav
+        return temp_wav
+    def __getitem__(self, index):
+        audio_path = self.audio_paths[index]
+        wav, orisr = torchaudio.load(audio_path)
+        if wav.shape[0] != 1: # stereo to mono  (2,wav_len) -> (1,wav_len)
+            wav = wav.mean(0,keepdim=True)
+        wav = torchaudio.functional.resample(wav, orig_freq=orisr, new_freq=self.sr)
+        if self.mode == 'pad':
+            assert self.target_mel_length is not None
+            wav = self.pad_wav(wav)
+        return audio_path,wav
+def process_audio_by_tsv(rank,args):
+    """Per-process worker: resample every audio file listed in ``args.tsv_path`` and save resampled waveforms and/or mel spectrograms as .npy files.
+
+    Args:
+        rank: process index; also selects the CUDA device (``cuda:{rank}``).
+        args: namespace carrying the audio/mel settings plus ``num_gpus``, ``mode``,
+            ``save_resample``, ``save_mel``, ``batch_max_length`` and ``dist_config``.
+    """
+    if args.num_gpus > 1:
+        init_process_group(backend=args.dist_config['dist_backend'], init_method=args.dist_config['dist_url'],
+                            world_size=args.dist_config['world_size'] * args.num_gpus, rank=rank)
+    sr = args.audio_sample_rate
+    dataset = tsv_dataset(args.tsv_path,sr = sr,mode=args.mode,hop_size=args.hop_size,target_mel_length=args.batch_max_length)
+    # With several GPUs each process gets its own shard of the file list.
+    sampler = DistributedSampler(dataset,shuffle=False) if args.num_gpus > 1 else None
+    # batch_size must be 1 because waveform lengths differ between files
+    loader = DataLoader(dataset, sampler=sampler,batch_size=1, num_workers=16,drop_last=False)
+    device = torch.device('cuda:{:d}'.format(rank))
+    mel_net = MelNet(args.__dict__)
+    mel_net.to(device)
+    # DDP wrapping is intentionally left out: MelNet has no trainable parameters, which raises
+    # "RuntimeError: DistributedDataParallel is not needed when a module doesn't have any parameter that requires a gradient."
+    # if args.num_gpus > 1:
+    #     mel_net = DistributedDataParallel(mel_net, device_ids=[rank]).to(device)
+    # Only rank 0 shows a progress bar.
+    loader = tqdm(loader) if rank == 0 else loader
+    for batch in loader:
+        audio_paths,wavs = batch
+        wavs = wavs.to(device)
+        if args.save_resample:
+            for audio_path,wav in zip(audio_paths,wavs):
+                # Output tree mirrors the input path with the first path component suffixed by the sample rate.
+                # NOTE(review): for an absolute path the first component of split('/') is '', so the
+                # output root becomes a relative directory named '_<sr>' — confirm this is intended.
+                psplits = audio_path.split('/')
+                root,wav_name = psplits[0],psplits[-1]
+                # save resample; wav_name[:-4] strips the last four characters (assumes a 3-letter extension)
+                resample_root,resample_name = root+f'_{sr}',wav_name[:-4]+'_audio.npy'
+                resample_dir_name = os.path.join(resample_root,*psplits[1:-1])
+                resample_path = os.path.join(resample_dir_name,resample_name)
+                os.makedirs(resample_dir_name,exist_ok=True)
+                np.save(resample_path,wav.cpu().numpy().squeeze(0))
+        if args.save_mel:
+            mode = args.mode
+            batch_max_length = args.batch_max_length
+            for audio_path,wav in zip(audio_paths,wavs):
+                psplits = audio_path.split('/')
+                root,wav_name = psplits[0],psplits[-1]
+                mel_root,mel_name = root+f'_mel{mode}{sr}nfft{args.fft_size}',wav_name[:-4]+'_mel.npy'
+                mel_dir_name = os.path.join(mel_root,*psplits[1:-1])
+                mel_path = os.path.join(mel_dir_name,mel_name)
+                # Existing outputs are kept, so an interrupted run can be resumed.
+                if not os.path.exists(mel_path):
+                    mel_spec = mel_net(wav).cpu().numpy().squeeze(0) # (mel_bins,mel_len)
+                    if mel_spec.shape[1] <= batch_max_length:
+                        if mode == 'tile': # pad is done in dataset as pad wav
+                            # Repeat the clip along time until it exceeds batch_max_length frames.
+                            n_repeat = math.ceil((batch_max_length + 1) / mel_spec.shape[1])
+                            mel_spec = np.tile(mel_spec,reps=(1,n_repeat))
+                        elif mode == 'none' or mode == 'pad':
+                            pass
+                        else:
+                            raise ValueError(f'mode:{mode} is not supported')
+                    # Final crop to at most batch_max_length frames.
+                    mel_spec = mel_spec[:,:batch_max_length]
+                    os.makedirs(mel_dir_name,exist_ok=True)
+                    np.save(mel_path,mel_spec)
+def split_list(i_list,num):
+    each_num = math.ceil(i_list / num)
+    result = []
+    for i in range(num):
+        s = each_num * i
+        e = (each_num * (i+1))
+        result.append(i_list[s:e])
+    return result
+def drop_bad_wav(item):
+    index,path = item
+    try:
+        with audioread.audio_open(path) as f:
+            totalsec = f.duration
+            if totalsec < 0.1:
+                return index # index
+    except:
+        print(f"corrupted wav:{path}")
+        return index
+    return False
+def drop_bad_wavs(tsv_path):# 'audioset.csv'
+    df = pd.read_csv(tsv_path,sep='\t')
+    item_list = []
+    for item in tqdm(df.itertuples()):
+        item_list.append((item[0],getattr(item,'audio_path')))
+    r = process_map(drop_bad_wav,item_list,max_workers=16,chunksize=16)
+    bad_indices = list(filter(lambda x:x!= False,r))
+    print(bad_indices)
+    with open('bad_wavs.json','w') as f:
+        x = [item_list[i] for i in bad_indices]
+        json.dump(x,f)
+    df = df.drop(bad_indices,axis=0)
+    df.to_csv(tsv_path,sep='\t',index=False)
+if __name__ == '__main__':
+    # Script entry point: clean the manifest(s), then compute mel spectrograms for every listed file.
+    logging.basicConfig(filename='example.log',  level=logging.INFO,
+        format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
+    tsv_path = './musiccap.tsv'
+    # Step 1: drop unreadable / too-short audio; this rewrites the TSV file(s) in place.
+    if os.path.isdir(tsv_path):
+        files = glob(os.path.join(tsv_path,'*.tsv'))
+        for file in files:
+            drop_bad_wavs(file)
+    else:
+        drop_bad_wavs(tsv_path)
+    num_gpus = 1
+    # Step 2: audio / mel settings handed to process_audio_by_tsv (and, via __dict__, to MelNet).
+    args = {
+        'audio_sample_rate': 16000,
+        'audio_num_mel_bins':80,
+        'fft_size': 1024,# 4000:512 ,16000:1024,
+        'win_size': 1024,
+        'hop_size': 256,
+        'fmin': 0,
+        'fmax': 8000,
+        'batch_max_length': 1560, # 4000:312 (nfft = 512,hoplen=128,mellen = 313), 16000:624 , 22050:848 #
+        'tsv_path': tsv_path,
+        'num_gpus': num_gpus,
+        'mode': 'none',
+        'save_resample':False,
+        'save_mel' :True
+    }
+    args = Namespace(**args)
+    # Only consulted when num_gpus > 1.
+    args.dist_config = {
+        "dist_backend": "nccl",
+        "dist_url": "tcp://localhost:54189",
+        "world_size": 1
+    }
+    # One worker process per GPU; a single GPU runs in the current process.
+    if args.num_gpus>1:
+        mp.spawn(process_audio_by_tsv,nprocs=args.num_gpus,args=(args,))
+    else:
+        process_audio_by_tsv(0,args=args)
+    print("done")

ldm/data/test.py ADDED Viewed

	@@ -0,0 +1,224 @@

+import sys
+import numpy as np
+import torch
+from typing import TypeVar, Optional, Iterator
+import logging
+import pandas as pd
+from ldm.data.joinaudiodataset_anylen import *
+import glob
+logger = logging.getLogger(f'main.{__name__}')
+sys.path.insert(0, '.')  # nopep8
+class JoinManifestSpecs(torch.utils.data.Dataset):
+    def __init__(self, split, main_spec_dir_path,other_spec_dir_path, mel_num=80,mode='pad', spec_crop_len=1248,pad_value=-5,drop=0,**kwargs):
+        super().__init__()
+        self.split = split
+        self.max_batch_len = spec_crop_len
+        self.min_batch_len = 64
+        self.min_factor = 4
+        self.mel_num = mel_num
+        self.drop = drop
+        self.pad_value = pad_value
+        assert mode in ['pad','tile']
+        self.collate_mode = mode
+        manifest_files = []
+        for dir_path in main_spec_dir_path.split(','):
+            manifest_files += glob.glob(f'{dir_path}/*.tsv')
+        df_list = [pd.read_csv(manifest,sep='\t') for manifest in manifest_files]
+        self.df_main = pd.concat(df_list,ignore_index=True)
+        manifest_files = []
+        for dir_path in other_spec_dir_path.split(','):
+            manifest_files += glob.glob(f'{dir_path}/*.tsv')
+        df_list = [pd.read_csv(manifest,sep='\t') for manifest in manifest_files]
+        self.df_other = pd.concat(df_list,ignore_index=True)
+        self.df_other.reset_index(inplace=True)
+        if split == 'train':
+            self.dataset = self.df_main.iloc[100:]
+        elif split == 'valid' or split == 'val':
+            self.dataset = self.df_main.iloc[:100]
+        elif split == 'test':
+            self.df_main = self.add_name_num(self.df_main)
+            self.dataset = self.df_main
+        else:
+            raise ValueError(f'Unknown split {split}')
+        self.dataset.reset_index(inplace=True)
+        print('dataset len:', len(self.dataset),"drop_rate",self.drop)
+    def add_name_num(self,df):
+        """each file may have different caption, we add num to filename to identify each audio-caption pair"""
+        name_count_dict = {}
+        change = []
+        for t in df.itertuples():
+            name = getattr(t,'name')
+            if name in name_count_dict:
+                name_count_dict[name] += 1
+            else:
+                name_count_dict[name] = 0
+            change.append((t[0],name_count_dict[name]))
+        for t in change:
+            df.loc[t[0],'name'] = str(df.loc[t[0],'name']) + f'_{t[1]}'
+        return df
+    def ordered_indices(self):
+        index2dur = self.dataset[['duration']].sort_values(by='duration')
+        index2dur_other = self.df_other[['duration']].sort_values(by='duration')
+        other_indices = list(index2dur_other.index)
+        offset = len(self.dataset)
+        other_indices = [x + offset for x in other_indices]
+        return list(index2dur.index),other_indices
+    def collater(self,inputs):
+        to_dict = {}
+        for l in inputs:
+            for k,v in l.items():
+                if k in to_dict:
+                    to_dict[k].append(v)
+                else:
+                    to_dict[k] = [v]
+        if self.collate_mode == 'pad':
+            to_dict['image'] = collate_1d_or_2d(to_dict['image'],pad_idx=self.pad_value,min_len = self.min_batch_len,max_len=self.max_batch_len,min_factor=self.min_factor)
+        elif self.collate_mode == 'tile':
+            to_dict['image'] = collate_1d_or_2d_tile(to_dict['image'],min_len = self.min_batch_len,max_len=self.max_batch_len,min_factor=self.min_factor)
+        else:
+            raise NotImplementedError
+        to_dict['caption'] = {'ori_caption':[c['ori_caption'] for c in to_dict['caption']],
+                              'struct_caption':[c['struct_caption'] for c in to_dict['caption']]}
+        return to_dict
+    def __getitem__(self, idx):
+        if idx < len(self.dataset):
+            data = self.dataset.iloc[idx]
+            p = np.random.uniform(0,1)
+            if p > self.drop:
+                ori_caption = data['ori_cap']
+                struct_caption = data['caption']
+            else:
+                ori_caption = ""
+                struct_caption = ""
+        else:
+            data = self.df_other.iloc[idx-len(self.dataset)]
+            p = np.random.uniform(0,1)
+            if p > self.drop:
+                ori_caption = data['caption']
+                struct_caption = f'<{ori_caption}& all>'
+            else:
+                ori_caption = ""
+                struct_caption = ""
+        item = {}
+        try:
+            spec = np.load(data['mel_path']) # mel spec [80, T]
+            if spec.shape[1] > self.max_batch_len:
+                spec = spec[:,:self.max_batch_len]
+        except:
+            mel_path = data['mel_path']
+            print(f'corrupted:{mel_path}')
+            spec = np.ones((self.mel_num,self.min_batch_len)).astype(np.float32)*self.pad_value
+        item['image'] = spec
+        item["caption"] = {"ori_caption":ori_caption,"struct_caption":struct_caption}
+        if self.split == 'test':
+            item['f_name'] = data['name']
+        return item
+    def __len__(self):
+        return len(self.dataset) + len(self.df_other)
+class JoinSpecsTrain(JoinManifestSpecs):
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('train', **specs_dataset_cfg)
+class JoinSpecsValidation(JoinManifestSpecs):
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('valid', **specs_dataset_cfg)
+class JoinSpecsTest(JoinManifestSpecs):
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('test', **specs_dataset_cfg)
+class DDPIndexBatchSampler(Sampler):# 让长度相似的音频的indices合到一个batch中以避免过长的pad
+    def __init__(self, main_indices,other_indices,batch_size, num_replicas: Optional[int] = None,
+                 rank: Optional[int] = None, shuffle: bool = True,
+                 seed: int = 0, drop_last: bool = False) -> None:
+        if num_replicas is None:
+            if not dist.is_initialized():
+                # raise RuntimeError("Requires distributed package to be available")
+                print("Not in distributed mode")
+                num_replicas = 1
+            else:
+                num_replicas = dist.get_world_size()
+        if rank is None:
+            if not dist.is_initialized():
+                # raise RuntimeError("Requires distributed package to be available")
+                rank = 0
+            else:
+                rank = dist.get_rank()
+        if rank >= num_replicas or rank < 0:
+            raise ValueError(
+                "Invalid rank {}, rank should be in the interval"
+                " [0, {}]".format(rank, num_replicas - 1))
+        self.main_indices = main_indices
+        self.other_indices = other_indices
+        self.max_index = max(self.other_indices)
+        self.num_replicas = num_replicas
+        self.rank = rank
+        self.epoch = 0
+        self.drop_last = drop_last
+        self.batch_size = batch_size
+        self.shuffle = shuffle
+        self.batches = self.build_batches()
+        self.seed = seed
+    def set_epoch(self,epoch):
+        # print("!!!!!!!!!!!set epoch is called!!!!!!!!!!!!!!")
+        self.epoch = epoch
+        if self.shuffle:
+            np.random.seed(self.seed+self.epoch)
+            self.batches = self.build_batches()
+    def build_batches(self):
+        batches,batch = [],[]
+        for index in self.main_indices:
+            batch.append(index)
+            if len(batch) == self.batch_size:
+                batches.append(batch)
+                batch = []
+        if not self.drop_last and len(batch) > 0:
+            batches.append(batch)
+        selected_others = np.random.choice(len(self.other_indices),len(batches),replace=False)
+        for index in selected_others:
+            if index + self.batch_size > len(self.other_indices):
+                index = len(self.other_indices) - self.batch_size
+            batch = [self.other_indices[index + i] for i in range(self.batch_size)]
+            batches.append(batch)
+        self.batches = batches
+        if self.shuffle:
+            self.batches = np.random.permutation(self.batches)
+        if self.rank == 0:
+            print(f"rank: {self.rank}, batches_num {len(self.batches)}")
+        if self.drop_last and len(self.batches) % self.num_replicas != 0:
+            self.batches = self.batches[:len(self.batches)//self.num_replicas*self.num_replicas]
+        if len(self.batches) >= self.num_replicas:
+            self.batches = self.batches[self.rank::self.num_replicas]
+        else: # may happen in sanity checking
+            self.batches = [self.batches[0]]
+        if self.rank == 0:
+            print(f"after split batches_num {len(self.batches)}")
+        return self.batches
+    def __iter__(self) -> Iterator[List[int]]:
+        print(f"len(self.batches):{len(self.batches)}")
+        for batch in self.batches:
+            yield batch
+    def __len__(self) -> int:
+        return len(self.batches)

ldm/data/tsv_dirs/full_data/V1_new/audiocaps_train_16000.tsv ADDED Viewed

The diff for this file is too large to render. See raw diff

ldm/data/tsv_dirs/full_data/V2/MACS.tsv ADDED Viewed

The diff for this file is too large to render. See raw diff

ldm/data/tsv_dirs/full_data/V2/WavText5K.tsv ADDED Viewed

The diff for this file is too large to render. See raw diff

ldm/data/tsv_dirs/full_data/V2/adobe.tsv ADDED Viewed

The diff for this file is too large to render. See raw diff

ldm/data/tsv_dirs/full_data/V2/audiostock.tsv ADDED Viewed

The diff for this file is too large to render. See raw diff

ldm/data/tsv_dirs/full_data/V2/epidemic_sound.tsv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dc67e42c9defa98edfc2c6b23c731fafa4a22307fddfd1fb95ccfc00d0168951
+size 15062608

ldm/data/tsv_dirs/full_data/caps_struct/audiocaps_train_16000_struct2.tsv ADDED Viewed

The diff for this file is too large to render. See raw diff

ldm/data/tsv_dirs/full_data/clotho.tsv ADDED Viewed

The diff for this file is too large to render. See raw diff

ldm/data/tsvdataset.py ADDED Viewed

	@@ -0,0 +1,67 @@

+from glob import glob
+from torch.utils.data import Dataset
+import numpy as np
+import pandas as pd
+class TSVDataset(Dataset):
+    def __init__(self, tsv_path, spec_crop_len=None):
+        super().__init__()
+        self.batch_max_length = spec_crop_len
+        self.batch_min_length = 50
+        df = pd.read_csv(tsv_path,sep='\t')
+        df = self.add_name_num(df)
+        self.dataset = df
+        print('dataset len:', len(self.dataset))
+    def add_name_num(self,df):
+        """each file may have different caption, we add num to filename to identify each audio-caption pair"""
+        name_count_dict = {}
+        change = []
+        for t in df.itertuples():
+            name = getattr(t,'name')
+            if name in name_count_dict:
+                name_count_dict[name] += 1
+            else:
+                name_count_dict[name] = 0
+            change.append((t[0],name_count_dict[name]))
+        for t in change:
+            df.loc[t[0],'name'] = df.loc[t[0],'name'] + f'_{t[1]}'
+        return df
+    def __getitem__(self, idx):
+        data = self.dataset.iloc[idx]
+        item = {}
+        spec = np.load(data['mel_path']) # mel spec [80, 624]
+        if spec.shape[1] <= self.batch_max_length:
+            spec = np.pad(spec, ((0, 0), (0, self.batch_max_length - spec.shape[1]))) # [80, 624]
+        item['image'] = spec
+        item["caption"] = data['caption']
+        item["f_name"] = data['name']
+        return item
+    def __len__(self):
+        return len(self.dataset)
+class TSVDatasetStruct(TSVDataset):
+    def __getitem__(self, idx):
+        data = self.dataset.iloc[idx]
+        item = {}
+        spec = np.load(data['mel_path']) # mel spec [80, 624]
+        if spec.shape[1] <= self.batch_max_length:
+            spec = np.pad(spec, ((0, 0), (0, self.batch_max_length - spec.shape[1]))) # [80, 624]
+        item['image'] = spec[:,:self.batch_max_length]
+        item["caption"] = {'ori_caption':data['ori_cap'],'struct_caption':data['caption']}
+        item["f_name"] = data['name']
+        return item
+class TSVDatasetTestFake(TSVDataset):
+    def __init__(self, specs_dataset_cfg):
+        super().__init__(phase='test', **specs_dataset_cfg)
+        self.dataset = [self.dataset[0]]

ldm/lr_scheduler.py ADDED Viewed

	@@ -0,0 +1,98 @@

+import numpy as np
+class LambdaWarmUpCosineScheduler:
+    """
+    note: use with a base_lr of 1.0
+    """
+    def __init__(self, warm_up_steps, lr_min, lr_max, lr_start, max_decay_steps, verbosity_interval=0):
+        self.lr_warm_up_steps = warm_up_steps
+        self.lr_start = lr_start
+        self.lr_min = lr_min
+        self.lr_max = lr_max
+        self.lr_max_decay_steps = max_decay_steps
+        self.last_lr = 0.
+        self.verbosity_interval = verbosity_interval
+    def schedule(self, n, **kwargs):
+        if self.verbosity_interval > 0:
+            if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_lr}")
+        if n < self.lr_warm_up_steps:
+            lr = (self.lr_max - self.lr_start) / self.lr_warm_up_steps * n + self.lr_start
+            self.last_lr = lr
+            return lr
+        else:
+            t = (n - self.lr_warm_up_steps) / (self.lr_max_decay_steps - self.lr_warm_up_steps)
+            t = min(t, 1.0)
+            lr = self.lr_min + 0.5 * (self.lr_max - self.lr_min) * (
+                    1 + np.cos(t * np.pi))
+            self.last_lr = lr
+            return lr
+    def __call__(self, n, **kwargs):
+        return self.schedule(n,**kwargs)
+class LambdaWarmUpCosineScheduler2:
+    """
+    supports repeated iterations, configurable via lists
+    note: use with a base_lr of 1.0.
+    """
+    def __init__(self, warm_up_steps, f_min, f_max, f_start, cycle_lengths, verbosity_interval=0):
+        assert len(warm_up_steps) == len(f_min) == len(f_max) == len(f_start) == len(cycle_lengths)
+        self.lr_warm_up_steps = warm_up_steps
+        self.f_start = f_start
+        self.f_min = f_min
+        self.f_max = f_max
+        self.cycle_lengths = cycle_lengths
+        self.cum_cycles = np.cumsum([0] + list(self.cycle_lengths))
+        self.last_f = 0.
+        self.verbosity_interval = verbosity_interval
+    def find_in_interval(self, n):
+        interval = 0
+        for cl in self.cum_cycles[1:]:
+            if n <= cl:
+                return interval
+            interval += 1
+    def schedule(self, n, **kwargs):
+        cycle = self.find_in_interval(n)
+        n = n - self.cum_cycles[cycle]
+        if self.verbosity_interval > 0:
+            if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_f}, "
+                                                       f"current cycle {cycle}")
+        if n < self.lr_warm_up_steps[cycle]:
+            f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle]
+            self.last_f = f
+            return f
+        else:
+            t = (n - self.lr_warm_up_steps[cycle]) / (self.cycle_lengths[cycle] - self.lr_warm_up_steps[cycle])
+            t = min(t, 1.0)
+            f = self.f_min[cycle] + 0.5 * (self.f_max[cycle] - self.f_min[cycle]) * (
+                    1 + np.cos(t * np.pi))
+            self.last_f = f
+            return f
+    def __call__(self, n, **kwargs):
+        return self.schedule(n, **kwargs)
+class LambdaLinearScheduler(LambdaWarmUpCosineScheduler2):
+    def schedule(self, n, **kwargs):
+        cycle = self.find_in_interval(n)
+        n = n - self.cum_cycles[cycle]
+        if self.verbosity_interval > 0:
+            if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_f}, "
+                                                       f"current cycle {cycle}")
+        if n < self.lr_warm_up_steps[cycle]:
+            f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle]
+            self.last_f = f
+            return f
+        else:
+            f = self.f_min[cycle] + (self.f_max[cycle] - self.f_min[cycle]) * (self.cycle_lengths[cycle] - n) / (self.cycle_lengths[cycle])
+            self.last_f = f
+            return f

ldm/models/__pycache__/autoencoder.cpython-37.pyc ADDED Viewed

Binary file (15.6 kB). View file

ldm/models/__pycache__/autoencoder.cpython-38.pyc ADDED Viewed

Binary file (15.5 kB). View file

ldm/models/__pycache__/autoencoder.cpython-39.pyc ADDED Viewed

Binary file (14.9 kB). View file

ldm/models/__pycache__/autoencoder1d.cpython-37.pyc ADDED Viewed

Binary file (13.5 kB). View file

ldm/models/__pycache__/autoencoder1d.cpython-38.pyc ADDED Viewed

Binary file (13.4 kB). View file

ldm/models/__pycache__/autoencoder_multi.cpython-38.pyc ADDED Viewed

Binary file (14.8 kB). View file

ldm/models/autoencoder.py ADDED Viewed

	@@ -0,0 +1,504 @@

+import os
+import torch
+import pytorch_lightning as pl
+import torch.nn.functional as F
+from contextlib import contextmanager
+from taming.modules.vqvae.quantize import VectorQuantizer2 as VectorQuantizer
+from packaging import version
+import numpy as np
+from ldm.modules.diffusionmodules.model import Encoder, Decoder
+from ldm.modules.distributions.distributions import DiagonalGaussianDistribution
+from torch.optim.lr_scheduler import LambdaLR
+from ldm.util import instantiate_from_config
+from icecream import ic
+class VQModel(pl.LightningModule):
+    """Vector-quantised autoencoder trained with an autoencoder and a discriminator optimizer.
+    Data path: encoder -> quant_conv -> VectorQuantizer -> post_quant_conv -> decoder.
+    When `use_ema` is set, an exponential-moving-average copy of the weights is kept in `model_ema`.
+    NOTE(review): `self.learning_rate` is read in `configure_optimizers` but never assigned in
+    this class; presumably the training script sets it before fitting -- confirm against the caller.
+    """
+    def __init__(self,
+                 ddconfig,
+                 lossconfig,
+                 n_embed,
+                 embed_dim,
+                 ckpt_path=None,
+                 ignore_keys=[],
+                 image_key="image",
+                 colorize_nlabels=None,
+                 monitor=None,
+                 batch_resize_range=None,
+                 scheduler_config=None,
+                 lr_g_factor=1.0,
+                 remap=None,
+                 sane_index_shape=False, # tell vector quantizer to return indices as bhw
+                 use_ema=False
+                 ):
+        """Build the encoder/decoder, quantizer and loss, then optionally restore a checkpoint.
+        Args:
+            ddconfig: keyword arguments for both `Encoder` and `Decoder`; must contain "z_channels".
+            lossconfig: config handed to `instantiate_from_config` to build the loss module.
+            n_embed: forwarded to `VectorQuantizer` (first positional argument).
+            embed_dim: forwarded to `VectorQuantizer`; also the output channels of `quant_conv`.
+            ckpt_path: if not None, weights are loaded from this checkpoint at the end of setup.
+            ignore_keys: state-dict key prefixes dropped before loading `ckpt_path` (never mutated here).
+            image_key: key used by the step methods to fetch the input tensor from a batch.
+            colorize_nlabels: if not None, registers a random (3, colorize_nlabels, 1, 1) buffer "colorize".
+            monitor: stored as `self.monitor` when not None.
+            batch_resize_range: optional (lower, upper) pair enabling per-batch resizing in `get_input`.
+            scheduler_config: optional config for the LR schedule used in `configure_optimizers`.
+            lr_g_factor: multiplier applied to the learning rate of the autoencoder optimizer.
+            remap, sane_index_shape: forwarded to `VectorQuantizer`.
+            use_ema: keep EMA weights via `LitEma`.
+        """
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.n_embed = n_embed
+        self.image_key = image_key
+        self.encoder = Encoder(**ddconfig)
+        self.decoder = Decoder(**ddconfig)
+        self.loss = instantiate_from_config(lossconfig)
+        self.quantize = VectorQuantizer(n_embed, embed_dim, beta=0.25,
+                                        remap=remap,
+                                        sane_index_shape=sane_index_shape)
+        # 1x1 convolutions mapping between the encoder/decoder channel count and the codebook dimension
+        self.quant_conv = torch.nn.Conv2d(ddconfig["z_channels"], embed_dim, 1)
+        self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
+        if colorize_nlabels is not None:
+            assert type(colorize_nlabels)==int
+            self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
+        if monitor is not None:
+            self.monitor = monitor
+        self.batch_resize_range = batch_resize_range
+        if self.batch_resize_range is not None:
+            print(f"{self.__class__.__name__}: Using per-batch resizing in range {batch_resize_range}.")
+        self.use_ema = use_ema
+        if self.use_ema:
+            # created after all sub-modules exist so that LitEma sees them
+            self.model_ema = LitEma(self)
+            print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")
+        if ckpt_path is not None:
+            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
+        self.scheduler_config = scheduler_config
+        self.lr_g_factor = lr_g_factor
+    @contextmanager
+    def ema_scope(self, context=None):
+        """Context manager that swaps in the EMA weights and restores the training weights on exit.
+        A no-op when `use_ema` is False. `context`, if given, is a label used in the printed messages.
+        """
+        if self.use_ema:
+            self.model_ema.store(self.parameters())
+            self.model_ema.copy_to(self)
+            if context is not None:
+                print(f"{context}: Switched to EMA weights")
+        try:
+            yield None
+        finally:
+            # restore even if the body raised
+            if self.use_ema:
+                self.model_ema.restore(self.parameters())
+                if context is not None:
+                    print(f"{context}: Restored training weights")
+    def init_from_ckpt(self, path, ignore_keys=list()):
+        """Load the "state_dict" entry of the checkpoint at `path` non-strictly.
+        Keys starting with any prefix in `ignore_keys` are removed first. Missing and unexpected
+        keys are reported on stdout.
+        """
+        # NOTE: torch.load unpickles the file; only load checkpoints from trusted sources.
+        sd = torch.load(path, map_location="cpu")["state_dict"]
+        for k in list(sd.keys()):
+            # any() stops at the first matching prefix, so a key matched by several prefixes is
+            # deleted exactly once (deleting it once per prefix raised KeyError)
+            if any(k.startswith(ik) for ik in ignore_keys):
+                print("Deleting key {} from state_dict.".format(k))
+                del sd[k]
+        missing, unexpected = self.load_state_dict(sd, strict=False)
+        print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
+        if len(missing) > 0:
+            print(f"Missing Keys: {missing}")
+        # reported independently of `missing`; previously hidden when nothing was missing
+        if len(unexpected) > 0:
+            print(f"Unexpected Keys: {unexpected}")
+    def on_train_batch_end(self, *args, **kwargs):
+        """Update the EMA weights after every training batch (when EMA is enabled)."""
+        if self.use_ema:
+            self.model_ema(self)
+    def encode(self, x):
+        """Encode `x` and quantise it; returns the quantizer's (quant, emb_loss, info) triple."""
+        h = self.encoder(x)
+        h = self.quant_conv(h)
+        quant, emb_loss, info = self.quantize(h)
+        return quant, emb_loss, info
+    def encode_to_prequant(self, x):
+        """Encode `x` up to (and including) `quant_conv`, without quantising."""
+        h = self.encoder(x)
+        h = self.quant_conv(h)
+        return h
+    def decode(self, quant):
+        """Decode an (already quantised) latent back to input space."""
+        quant = self.post_quant_conv(quant)
+        dec = self.decoder(quant)
+        return dec
+    def decode_code(self, code_b):
+        """Look up codebook entries for the indices `code_b` and decode them."""
+        quant_b = self.quantize.embed_code(code_b)
+        dec = self.decode(quant_b)
+        return dec
+    def forward(self, input, return_pred_indices=False):
+        """Autoencode `input`.
+        Returns (dec, diff), or (dec, diff, ind) when `return_pred_indices` is True, where `diff`
+        is the quantizer loss and `ind` is the third element of the quantizer's info tuple.
+        """
+        quant, diff, (_,_,ind) = self.encode(input)
+        dec = self.decode(quant)
+        if return_pred_indices:
+            return dec, diff, ind
+        return dec, diff
+    def get_input(self, batch, k):
+        """Fetch `batch[k]` as a contiguous float tensor with the last axis moved to position 1.
+        A 3-D tensor first gets a trailing singleton axis. When `batch_resize_range` is set the
+        tensor is additionally resized to a per-batch size.
+        """
+        x = batch[k]
+        if len(x.shape) == 3:
+            x = x[..., None]
+        x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format).float()
+        if self.batch_resize_range is not None:
+            lower_size = self.batch_resize_range[0]
+            upper_size = self.batch_resize_range[1]
+            if self.global_step <= 4:
+                # do the first few batches with max size to avoid later oom
+                new_resize = upper_size
+            else:
+                # random size in [lower_size, upper_size] in steps of 16
+                new_resize = np.random.choice(np.arange(lower_size, upper_size+16, 16))
+            if new_resize != x.shape[2]:
+                x = F.interpolate(x, size=new_resize, mode="bicubic")
+            x = x.detach()
+        return x
+    def training_step(self, batch, batch_idx, optimizer_idx):
+        """Return the autoencoder loss (optimizer_idx 0) or the discriminator loss (optimizer_idx 1).
+        Any other `optimizer_idx` falls through and returns None.
+        """
+        # https://github.com/pytorch/pytorch/issues/37142
+        # try not to fool the heuristics
+        x = self.get_input(batch, self.image_key)
+        xrec, qloss, ind = self(x, return_pred_indices=True)
+        if optimizer_idx == 0:
+            # autoencode
+            aeloss, log_dict_ae = self.loss(qloss, x, xrec, optimizer_idx, self.global_step,
+                                            last_layer=self.get_last_layer(), split="train",
+                                            predicted_indices=ind)
+            self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=True)
+            return aeloss
+        if optimizer_idx == 1:
+            # discriminator
+            discloss, log_dict_disc = self.loss(qloss, x, xrec, optimizer_idx, self.global_step,
+                                            last_layer=self.get_last_layer(), split="train")
+            self.log_dict(log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=True)
+            return discloss
+    def validation_step(self, batch, batch_idx):
+        """Validate once with the current weights and once inside `ema_scope` (suffix "_ema").
+        When EMA is disabled `ema_scope` is a no-op, so the second pass reuses the same weights.
+        Returns the result of the first (non-EMA) pass.
+        """
+        log_dict = self._validation_step(batch, batch_idx)
+        with self.ema_scope():
+            # called for its logging side effects only
+            self._validation_step(batch, batch_idx, suffix="_ema")
+        return log_dict
+    def _validation_step(self, batch, batch_idx, suffix=""):
+        """Compute and log both losses for one validation batch under split name "val"+suffix."""
+        x = self.get_input(batch, self.image_key)
+        xrec, qloss, ind = self(x, return_pred_indices=True)
+        aeloss, log_dict_ae = self.loss(qloss, x, xrec, 0,
+                                        self.global_step,
+                                        last_layer=self.get_last_layer(),
+                                        split="val"+suffix,
+                                        predicted_indices=ind
+                                        )
+        discloss, log_dict_disc = self.loss(qloss, x, xrec, 1,
+                                            self.global_step,
+                                            last_layer=self.get_last_layer(),
+                                            split="val"+suffix,
+                                            predicted_indices=ind
+                                            )
+        rec_loss = log_dict_ae[f"val{suffix}/rec_loss"]
+        self.log(f"val{suffix}/rec_loss", rec_loss,
+                   prog_bar=True, logger=True, on_step=False, on_epoch=True, sync_dist=True)
+        self.log(f"val{suffix}/aeloss", aeloss,
+                   prog_bar=True, logger=True, on_step=False, on_epoch=True, sync_dist=True)
+        if version.parse(pl.__version__) >= version.parse('1.4.0'):
+            # rec_loss was already logged above; drop it so log_dict does not log the key again
+            del log_dict_ae[f"val{suffix}/rec_loss"]
+        self.log_dict(log_dict_ae)
+        self.log_dict(log_dict_disc)
+        # NOTE(review): this returns the bound method `self.log_dict`, not a dict of metrics.
+        # Kept as-is because callers may rely on the current return value; confirm the intent.
+        return self.log_dict
+    def test_step(self, batch, batch_idx):
+        """Reconstruct a test batch and save each reconstruction as a .npy file.
+        Files go to <trainer.log_dir>/output_imgs_<ckpt basename>/fake_class/ and are named
+        "<prefix>_sample_<num>.npy", where prefix/num are split at the last "_" of batch['f_name'].
+        """
+        x = self.get_input(batch, self.image_key)
+        xrec, qloss, ind = self(x, return_pred_indices=True)
+        reconstructions = (xrec + 1)/2 # to mel scale
+        test_ckpt_path = os.path.basename(self.trainer.tested_ckpt_path)
+        savedir = os.path.join(self.trainer.log_dir,f'output_imgs_{test_ckpt_path}','fake_class')
+        # exist_ok avoids the check-then-create race when several processes test concurrently
+        os.makedirs(savedir, exist_ok=True)
+        file_names = batch['f_name']
+        # print(f"reconstructions.shape:{reconstructions.shape}",file_names)
+        reconstructions = reconstructions.cpu().numpy().squeeze(1) # squeeze channel dim
+        for b in range(reconstructions.shape[0]):
+            vname_num_split_index = file_names[b].rfind('_')# file_names[b]:video_name+'_'+num
+            v_n,num = file_names[b][:vname_num_split_index],file_names[b][vname_num_split_index+1:]
+            save_img_path = os.path.join(savedir,f'{v_n}_sample_{num}.npy')
+            np.save(save_img_path,reconstructions[b])
+        return None
+    def configure_optimizers(self):
+        """Create Adam optimizers for the autoencoder (index 0) and the discriminator (index 1).
+        The autoencoder learning rate is `lr_g_factor * learning_rate`. When `scheduler_config` is
+        set, both optimizers get a per-step LambdaLR driven by the instantiated schedule.
+        """
+        lr_d = self.learning_rate
+        lr_g = self.lr_g_factor*self.learning_rate
+        print("lr_d", lr_d)
+        print("lr_g", lr_g)
+        opt_ae = torch.optim.Adam(list(self.encoder.parameters())+
+                                  list(self.decoder.parameters())+
+                                  list(self.quantize.parameters())+
+                                  list(self.quant_conv.parameters())+
+                                  list(self.post_quant_conv.parameters()),
+                                  lr=lr_g, betas=(0.5, 0.9))
+        opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(),
+                                    lr=lr_d, betas=(0.5, 0.9))
+        if self.scheduler_config is not None:
+            scheduler = instantiate_from_config(self.scheduler_config)
+            print("Setting up LambdaLR scheduler...")
+            scheduler = [
+                {
+                    'scheduler': LambdaLR(opt_ae, lr_lambda=scheduler.schedule),
+                    'interval': 'step',
+                    'frequency': 1
+                },
+                {
+                    'scheduler': LambdaLR(opt_disc, lr_lambda=scheduler.schedule),
+                    'interval': 'step',
+                    'frequency': 1
+                },
+            ]
+            return [opt_ae, opt_disc], scheduler
+        return [opt_ae, opt_disc], []
+    def get_last_layer(self):
+        """Return the weight of the decoder's output convolution (passed to the loss as `last_layer`)."""
+        return self.decoder.conv_out.weight
+    def log_images(self, batch, only_inputs=False, plot_ema=False, **kwargs):
+        """Return a dict of tensors for logging: "inputs", "reconstructions" and, when `plot_ema`
+        is True, "reconstructions_ema". Inputs with more than 3 channels are colorized via `to_rgb`.
+        """
+        log = dict()
+        x = self.get_input(batch, self.image_key)
+        x = x.to(self.device)
+        if only_inputs:
+            log["inputs"] = x
+            return log
+        # Keep the raw tensor and the colorize decision: `x` is overwritten by its RGB projection
+        # below, and the EMA pass must still see the original multi-channel input.
+        model_input = x
+        needs_colorize = x.shape[1] > 3
+        xrec, _ = self(model_input)
+        if needs_colorize:
+            # colorize with random projection
+            assert xrec.shape[1] > 3
+            x = self.to_rgb(x)
+            xrec = self.to_rgb(xrec)
+        log["inputs"] = x
+        log["reconstructions"] = xrec
+        if plot_ema:
+            with self.ema_scope():
+                xrec_ema, _ = self(model_input)
+                if needs_colorize:
+                    xrec_ema = self.to_rgb(xrec_ema)
+                log["reconstructions_ema"] = xrec_ema
+        return log
+    def to_rgb(self, x):
+        """Project a multi-channel tensor to 3 channels with the "colorize" buffer and rescale to [-1, 1].
+        Only valid when `image_key` is "segmentation". The buffer is created lazily (random) if absent.
+        NOTE: a constant `x` makes max == min and the rescale divides by zero.
+        """
+        assert self.image_key == "segmentation"
+        if not hasattr(self, "colorize"):
+            self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
+        x = F.conv2d(x, weight=self.colorize)
+        x = 2.*(x-x.min())/(x.max()-x.min()) - 1.
+        return x
+class VQModelInterface(VQModel):
+    """VQModel variant that moves quantisation from the encode side to the decode side.
+    `encode` returns the pre-quantisation latent; `decode` quantises it (unless told not to)
+    before running the decoder.
+    """
+    def __init__(self, embed_dim, *args, **kwargs):
+        super().__init__(*args, embed_dim=embed_dim, **kwargs)
+        self.embed_dim = embed_dim
+    def encode(self, x):
+        """Run the encoder and `quant_conv` only; no quantisation happens here."""
+        return self.quant_conv(self.encoder(x))
+    def decode(self, h, force_not_quantize=False):
+        """Decode latent `h`, passing it through the quantisation layer first by default.
+        With `force_not_quantize=True` the latent is fed to `post_quant_conv` unchanged.
+        """
+        if force_not_quantize:
+            latent = h
+        else:
+            # only the quantised tensor is used; the loss and info entries are discarded
+            latent, _emb_loss, _info = self.quantize(h)
+        return self.decoder(self.post_quant_conv(latent))
+class AutoencoderKL(pl.LightningModule):
+    """Autoencoder whose latent is a diagonal Gaussian posterior.
+    Data path: encoder -> quant_conv -> DiagonalGaussianDistribution -> sample/mode ->
+    post_quant_conv -> decoder. Trained with an autoencoder and a discriminator optimizer.
+    NOTE(review): `self.learning_rate` is read in `configure_optimizers` but never assigned in
+    this class; presumably the training script sets it before fitting -- confirm against the caller.
+    """
+    def __init__(self,
+                 ddconfig,
+                 lossconfig,
+                 embed_dim,
+                 ckpt_path=None,
+                 ignore_keys=[],
+                 image_key="image",
+                 colorize_nlabels=None,
+                 monitor=None,
+                 ):
+        """Build the encoder/decoder and loss, then optionally restore a checkpoint.
+        Args:
+            ddconfig: keyword arguments for both `Encoder` and `Decoder`; must contain
+                "z_channels" and a truthy "double_z".
+            lossconfig: config handed to `instantiate_from_config` to build the loss module.
+            embed_dim: latent channel count; `quant_conv` outputs 2*embed_dim channels of moments.
+            ckpt_path: if not None, weights are loaded from this checkpoint at the end of setup.
+            ignore_keys: state-dict key prefixes dropped before loading `ckpt_path` (never mutated here).
+            image_key: key used by the step methods to fetch the input tensor from a batch.
+            colorize_nlabels: if not None, registers a random (3, colorize_nlabels, 1, 1) buffer "colorize".
+            monitor: stored as `self.monitor` when not None.
+        """
+        super().__init__()
+        # always False here; the 1-D reshaping branches in encode/decode only run if this is changed
+        self.to_1d = False
+        print(f"to_1d is {self.to_1d} in AUTOENCODER")
+        self.image_key = image_key
+        self.encoder = Encoder(**ddconfig)
+        self.decoder = Decoder(**ddconfig)
+        self.loss = instantiate_from_config(lossconfig)
+        assert ddconfig["double_z"]
+        self.quant_conv = torch.nn.Conv2d(2*ddconfig["z_channels"], 2*embed_dim, 1)
+        self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
+        self.embed_dim = embed_dim
+        if colorize_nlabels is not None:
+            assert type(colorize_nlabels)==int
+            self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
+        if monitor is not None:
+            self.monitor = monitor
+        if ckpt_path is not None:
+            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
+        # self.automatic_optimization = False # hjw for debug
+    def init_from_ckpt(self, path, ignore_keys=list()):
+        """Load the "state_dict" entry of the checkpoint at `path` non-strictly.
+        Keys starting with any prefix in `ignore_keys` are removed first.
+        """
+        # NOTE: torch.load unpickles the file; only load checkpoints from trusted sources.
+        sd = torch.load(path, map_location="cpu")["state_dict"]
+        for k in list(sd.keys()):
+            # any() stops at the first matching prefix, so a key matched by several prefixes is
+            # deleted exactly once (deleting it once per prefix raised KeyError)
+            if any(k.startswith(ik) for ik in ignore_keys):
+                print("Deleting key {} from state_dict.".format(k))
+                del sd[k]
+        self.load_state_dict(sd, strict=False)
+        print(f"Restored from {path}")
+    def encode(self, x):
+        """Encode `x` and return the posterior built from the `quant_conv` moments.
+        With `to_1d` set, a 3-D input gets a channel axis inserted at position 1 and the moments
+        are flattened from (b, c, h, w) to (b, c*h, w).
+        """
+        if self.to_1d and len(x.shape)==3:
+            x = x.unsqueeze(1)
+        h = self.encoder(x)
+        moments = self.quant_conv(h)
+        if self.to_1d:
+            # distinct name for the height so the encoder output `h` is not shadowed
+            b,c,height,w = moments.shape
+            moments = moments.reshape(b,c*height,w)
+        posterior = DiagonalGaussianDistribution(moments)
+        return posterior
+    def decode(self, z):
+        """Decode latent `z`; with `to_1d` set, `z` is first reshaped from (b, c*h, w) to (b, c, h, w)."""
+        if self.to_1d:
+            b,c_h,w = z.shape
+            c = self.post_quant_conv.in_channels
+            z = z.reshape(b,c,-1,w)
+        z = self.post_quant_conv(z)
+        dec = self.decoder(z)
+        return dec
+    def forward(self, input, sample_posterior=True):
+        """Autoencode `input`; returns (reconstruction, posterior).
+        The latent is a posterior sample by default, or the posterior mode when
+        `sample_posterior` is False.
+        """
+        posterior = self.encode(input)
+        if sample_posterior:
+            z = posterior.sample()
+        else:
+            z = posterior.mode()
+        dec = self.decode(z)
+        return dec, posterior
+    def get_input(self, batch, k):
+        """Fetch `batch[k]` as a contiguous float tensor with the last axis moved to position 1.
+        A 3-D tensor first gets a trailing singleton axis.
+        """
+        x = batch[k]
+        if len(x.shape) == 3:
+            x = x[..., None]
+        x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format).float()
+        return x
+    def training_step(self, batch, batch_idx, optimizer_idx):
+        """Return the autoencoder loss (optimizer_idx 0) or the discriminator loss (optimizer_idx 1).
+        Any other `optimizer_idx` falls through and returns None.
+        """
+        inputs = self.get_input(batch, self.image_key)
+        reconstructions, posterior = self(inputs)
+        if optimizer_idx == 0:
+            # train encoder+decoder+logvar
+            aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step,
+                                            last_layer=self.get_last_layer(), split="train")
+            self.log("aeloss", aeloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
+            self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=False)
+            # print(optimizer_idx,log_dict_ae)
+            return aeloss
+        if optimizer_idx == 1:
+            # train the discriminator
+            discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step,
+                                                last_layer=self.get_last_layer(), split="train")
+            self.log("discloss", discloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
+            self.log_dict(log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=False)
+            # print(optimizer_idx,log_dict_disc)
+            return discloss
+    def validation_step(self, batch, batch_idx):
+        """Compute and log both losses for one validation batch under split name "val"."""
+        inputs = self.get_input(batch, self.image_key)
+        reconstructions, posterior = self(inputs)
+        aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, 0, self.global_step,
+                                        last_layer=self.get_last_layer(), split="val")
+        discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, 1, self.global_step,
+                                            last_layer=self.get_last_layer(), split="val")
+        self.log("val/rec_loss", log_dict_ae["val/rec_loss"])
+        self.log_dict(log_dict_ae)
+        self.log_dict(log_dict_disc)
+        # NOTE(review): this returns the bound method `self.log_dict`, not a dict of metrics.
+        # Kept as-is because callers may rely on the current return value; confirm the intent.
+        return self.log_dict
+    def test_step(self, batch, batch_idx):
+        """Reconstruct a test batch, log its MSE and save each reconstruction as a .npy file.
+        Files go to <trainer.log_dir>/output_imgs_<ckpt basename>/fake_class/ and are named
+        "<prefix>.npy", where prefix is batch['f_name'] cut at its last "_". Items sharing a
+        prefix therefore overwrite each other.
+        """
+        inputs = self.get_input(batch, self.image_key)# inputs shape:(b,mel_len,T)
+        reconstructions, posterior = self(inputs)# reconstructions:(b,mel_len,T)
+        mse_loss = torch.nn.functional.mse_loss(reconstructions,inputs)
+        self.log('test/mse_loss',mse_loss)
+        test_ckpt_path = os.path.basename(self.trainer.tested_ckpt_path)
+        savedir = os.path.join(self.trainer.log_dir,f'output_imgs_{test_ckpt_path}','fake_class')
+        if batch_idx == 0:
+            print(f"save_path is: {savedir}")
+        # exist_ok avoids the check-then-create race when several processes test concurrently
+        os.makedirs(savedir, exist_ok=True)
+        file_names = batch['f_name']
+        # print(f"reconstructions.shape:{reconstructions.shape}",file_names)
+        # reconstructions = (reconstructions + 1)/2 # to mel scale
+        reconstructions = reconstructions.cpu().numpy().squeeze(1) # squeeze channel dim
+        for b in range(reconstructions.shape[0]):
+            vname_num_split_index = file_names[b].rfind('_')# file_names[b]:video_name+'_'+num
+            v_n = file_names[b][:vname_num_split_index]
+            save_img_path = os.path.join(savedir, f'{v_n}.npy') # f'{v_n}_sample_{num}.npy'   f'{v_n}.npy'
+            np.save(save_img_path,reconstructions[b])
+        return None
+    def configure_optimizers(self):
+        """Create Adam optimizers for the autoencoder (index 0) and the discriminator (index 1).
+        Both use `self.learning_rate`; no LR schedulers are returned.
+        """
+        lr = self.learning_rate
+        opt_ae = torch.optim.Adam(list(self.encoder.parameters())+
+                                  list(self.decoder.parameters())+
+                                  list(self.quant_conv.parameters())+
+                                  list(self.post_quant_conv.parameters()),
+                                  lr=lr, betas=(0.5, 0.9))
+        opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(),
+                                    lr=lr, betas=(0.5, 0.9))
+        return [opt_ae, opt_disc], []
+    def get_last_layer(self):
+        """Return the weight of the decoder's output convolution (passed to the loss as `last_layer`)."""
+        return self.decoder.conv_out.weight
+    @torch.no_grad()
+    def log_images(self, batch, only_inputs=False,save_dir = 'mel_result_ae13_26_debug/fake_class', **kwargs): # called from on_validation_batch_end in main.py
+        """Return a dict of tensors for logging.
+        Always contains "inputs"; unless `only_inputs` is True it also contains "reconstructions"
+        and "samples" (a decode of Gaussian noise shaped like a posterior sample). Inputs with
+        more than 3 channels are colorized via `to_rgb`.
+        `save_dir` is accepted for interface compatibility but is not used by this method.
+        """
+        log = dict()
+        x = self.get_input(batch, self.image_key)
+        x = x.to(self.device)
+        if not only_inputs:
+            xrec, posterior = self(x)
+            if x.shape[1] > 3:
+                # colorize with random projection
+                assert xrec.shape[1] > 3
+                x = self.to_rgb(x)
+                xrec = self.to_rgb(xrec)
+            log["samples"] = self.decode(torch.randn_like(posterior.sample()))
+            log["reconstructions"] = xrec
+        log["inputs"] = x
+        return log
+    def to_rgb(self, x):
+        """Project a multi-channel tensor to 3 channels with the "colorize" buffer and rescale to [-1, 1].
+        Only valid when `image_key` is "segmentation". The buffer is created lazily (random) if absent.
+        NOTE: a constant `x` makes max == min and the rescale divides by zero.
+        """
+        assert self.image_key == "segmentation"
+        if not hasattr(self, "colorize"):
+            self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
+        x = F.conv2d(x, weight=self.colorize)
+        x = 2.*(x-x.min())/(x.max()-x.min()) - 1.
+        return x
+class IdentityFirstStage(torch.nn.Module):
+    """First-stage stand-in whose encode, decode and forward return their input unchanged.
+    Extra positional and keyword arguments are accepted and ignored by every method.
+    """
+    def __init__(self, *args, vq_interface=False, **kwargs):
+        """Args:
+            vq_interface: when True, `quantize` returns a (x, None, [None, None, None]) triple
+                instead of the bare input.
+        """
+        # nn.Module must be initialised before any attribute is assigned on the instance
+        super().__init__()
+        self.vq_interface = vq_interface  # TODO: Should be true by default but check to not break older stuff
+    def encode(self, x, *args, **kwargs):
+        """Return `x` unchanged."""
+        return x
+    def decode(self, x, *args, **kwargs):
+        """Return `x` unchanged."""
+        return x
+    def quantize(self, x, *args, **kwargs):
+        """Return `x`, wrapped as (x, None, [None, None, None]) when `vq_interface` is set."""
+        if self.vq_interface:
+            return x, None, [None, None, None]
+        return x
+    def forward(self, x, *args, **kwargs):
+        """Return `x` unchanged."""
+        return x