diff --git a/app.py b/app.py index b524136d6a529d525229dc3f47f772bf2249f3ec..3199c6ca9d3b4bcc470e8f58c035b36129f0259f 100644 --- a/app.py +++ b/app.py @@ -6,11 +6,11 @@ import sys # we will clone the repo and install the dependencies # NOTE: Still fixing bugs, not release, do not try :) ! -os.system('pip install -r qa_mdt/requirements.txt') -os.system('pip install xformers==0.0.26.post1') -os.system('pip install torchlibrosa==0.0.9 librosa==0.9.2') -os.system('pip install -q pytorch_lightning==2.1.3 torchlibrosa==0.0.9 librosa==0.9.2 ftfy==6.1.1 braceexpand') -os.system('pip install torch==2.3.0+cu121 torchvision==0.18.0+cu121 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cu121') +# os.system('pip install -r qa_mdt/requirements.txt') +# os.system('pip install xformers==0.0.26.post1') +# os.system('pip install torchlibrosa==0.0.9 librosa==0.9.2') +# os.system('pip install -q pytorch_lightning==2.1.3 torchlibrosa==0.0.9 librosa==0.9.2 ftfy==6.1.1 braceexpand') +# os.system('pip install torch==2.3.0+cu121 torchvision==0.18.0+cu121 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cu121') # only then import the necessary modules from qa_mdt from qa_mdt.pipeline import MOSDiffusionPipeline @@ -59,7 +59,7 @@ iface = gr.Interface( ["A modern synthesizer creating futuristic soundscapes."], ["Acoustic ballad with heartfelt lyrics and soft piano."] ], - cache_examples=True + cache_examples="lazy" ) # Launch the Gradio app diff --git a/log/latent_diffusion/qa_mdt/mos_as_token/qa_mdt.yaml b/log/latent_diffusion/qa_mdt/mos_as_token/qa_mdt.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e171f8b32bd15d825812c8884612dccaeb67dc27 --- /dev/null +++ b/log/latent_diffusion/qa_mdt/mos_as_token/qa_mdt.yaml @@ -0,0 +1,169 @@ +log_directory: "./log/latent_diffusion" +project: "audioldm" +precision: "high" + +# TODO: change this with your project path +base_root: "./qa_mdt" + +# TODO: change this with your pretrained path +# TODO: 
pretrained path is also needed in "base_root/offset_pretrained_checkpoints.json" +pretrained: + clap_music: "./qa_mdt/checkpoints/clap_music" + flan_t5: "./qa_mdt/checkpoints/flant5" + hifi-gan: "./qa_mdt/checkpoints/hifi-gan/checkpoints" + roberta-base: "./qa_mdt/checkpoints/robertabase" + +# TODO: lmdb dataset that stores pMOS of the training dataset +# while in inference, we don't need it !!! +# while in inference, we don't need it !!! +# while in inference, we don't need it !!! +mos_path: "" + +train_path: + train_lmdb_path: [] # path list of training lmdb folders + +val_path: + val_lmdb_path: [] # path list of validation lmdb folders + val_key_path: [] # path list of validation lmdb key files + +variables: + sampling_rate: &sampling_rate 16000 + mel_bins: &mel_bins 64 + latent_embed_dim: &latent_embed_dim 8 + latent_t_size: &latent_t_size 256 # TODO might need to change + latent_f_size: &latent_f_size 16 # TODO might need to change + in_channels: &unet_in_channels 8 # TODO might need to change + optimize_ddpm_parameter: &optimize_ddpm_parameter true + optimize_gpt: &optimize_gpt true + warmup_steps: &warmup_steps 2000 + +# we rewrite the dataset so it may not be needed +data: + train: ["audiocaps"] + val: "audiocaps" + test: "audiocaps" + class_label_indices: "audioset_eval_subset" + dataloader_add_ons: ["waveform_rs_48k"] + +step: + validation_every_n_epochs: 10000 + save_checkpoint_every_n_steps: 1000 + # limit_val_batches: 2 + max_steps: 8000000 + save_top_k: 1000 + +preprocessing: + audio: + sampling_rate: *sampling_rate + max_wav_value: 32768.0 + duration: 10.24 + stft: + filter_length: 1024 + hop_length: 160 + win_length: 1024 + mel: + n_mel_channels: *mel_bins + mel_fmin: 0 + mel_fmax: 8000 + +augmentation: + mixup: 0.0 + +model: + target: qa_mdt.audioldm_train.modules.latent_diffusion.ddpm.LatentDiffusion + params: + # Autoencoder + first_stage_config: + base_learning_rate: 8.0e-06 + target: 
qa_mdt.audioldm_train.modules.latent_encoder.autoencoder.AutoencoderKL + params: + # TODO: change it with your VAE checkpoint + reload_from_ckpt: "./qa_mdt/checkpoints/hifi-gan/checkpoints/vae_mel_16k_64bins.ckpt" + sampling_rate: *sampling_rate + batchsize: 1 + monitor: val/rec_loss + image_key: fbank + subband: 1 + embed_dim: *latent_embed_dim + time_shuffle: 1 + lossconfig: + target: qa_mdt.audioldm_train.losses.LPIPSWithDiscriminator + params: + disc_start: 50001 + kl_weight: 1000.0 + disc_weight: 0.5 + disc_in_channels: 1 + ddconfig: + double_z: true + mel_bins: *mel_bins + z_channels: 8 + resolution: 256 + downsample_time: false + in_channels: 1 + out_ch: 1 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + + # Other parameters + base_learning_rate: 8.0e-5 + warmup_steps: *warmup_steps + optimize_ddpm_parameter: *optimize_ddpm_parameter + sampling_rate: *sampling_rate + batchsize: 16 + linear_start: 0.0015 + linear_end: 0.0195 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + unconditional_prob_cfg: 0.1 + parameterization: eps # [eps, x0, v] + first_stage_key: fbank + latent_t_size: *latent_t_size + latent_f_size: *latent_f_size + channels: *latent_embed_dim + monitor: val/loss_simple_ema + scale_by_std: true + + unet_config: + # TODO: choose your class, Default: MDT_MOS_AS_TOKEN + # (Noted: the 2D-Rope, SwiGLU and the MDT are in two classes, when training with all of them, they should be changed and merged) + target: qa_mdt.audioldm_train.modules.diffusionmodules.PixArt.PixArt_MDT_MOS_AS_TOKEN + params: + input_size : [256, 16] + # patch_size: [16,4] + patch_size : [4, 1] + overlap_size: [0, 0] + in_channels : 8 + hidden_size : 1152 + depth : 28 + num_heads : 16 + mlp_ratio : 4.0 + class_dropout_prob : 0.1 + pred_sigma : True + drop_path : 0. 
+ window_size : 0 + window_block_indexes : None + use_rel_pos : False + cond_dim : 1024 + lewei_scale : 1.0 + overlap: [0, 0] + use_cfg: true + mask_ratio: 0.30 + decode_layer: 8 + + cond_stage_config: + crossattn_flan_t5: + cond_stage_key: text + conditioning_key: crossattn + target: qa_mdt.audioldm_train.conditional_models.FlanT5HiddenState + + evaluation_params: + unconditional_guidance_scale: 3.5 + ddim_sampling_steps: 200 + n_candidates_per_samples: 3 \ No newline at end of file diff --git a/qa_mdt/__pycache__/pipeline.cpython-310.pyc b/qa_mdt/__pycache__/pipeline.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4f000c0620da2533a4f444e219a452b0adc54a8a Binary files /dev/null and b/qa_mdt/__pycache__/pipeline.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/__pycache__/__init__.cpython-310.pyc b/qa_mdt/audioldm_train/__pycache__/__init__.cpython-310.pyc index 83a1376ed1c87467ff668a47d1a04fa480bc504c..318f3b103bc3aef8b3844c8bb565405195143650 100644 Binary files a/qa_mdt/audioldm_train/__pycache__/__init__.cpython-310.pyc and b/qa_mdt/audioldm_train/__pycache__/__init__.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/__pycache__/conditional_models.cpython-310.pyc b/qa_mdt/audioldm_train/__pycache__/conditional_models.cpython-310.pyc index b49a44e1f5ed5bc0a1913d67cd1503d552af772b..57c6edc7baaad46292b520d4dd50151950bfeccb 100644 Binary files a/qa_mdt/audioldm_train/__pycache__/conditional_models.cpython-310.pyc and b/qa_mdt/audioldm_train/__pycache__/conditional_models.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/__pycache__/dataset_plugin.cpython-310.pyc b/qa_mdt/audioldm_train/__pycache__/dataset_plugin.cpython-310.pyc index 0392cbcbeb0d0cc18c4a50806f0b9aea86f0a645..3af94967936cf266b80fc17ce816b5ea8f7bae15 100644 Binary files a/qa_mdt/audioldm_train/__pycache__/dataset_plugin.cpython-310.pyc and b/qa_mdt/audioldm_train/__pycache__/dataset_plugin.cpython-310.pyc differ diff --git 
a/qa_mdt/audioldm_train/losses/__pycache__/__init__.cpython-310.pyc b/qa_mdt/audioldm_train/losses/__pycache__/__init__.cpython-310.pyc index 644ee32348f2ff385534d32e3fbdce8567658525..32b6e3b441fd3904024c3a4ca0e129949fd8dbab 100644 Binary files a/qa_mdt/audioldm_train/losses/__pycache__/__init__.cpython-310.pyc and b/qa_mdt/audioldm_train/losses/__pycache__/__init__.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/losses/__pycache__/contperceptual.cpython-310.pyc b/qa_mdt/audioldm_train/losses/__pycache__/contperceptual.cpython-310.pyc index 488c4e43068cc043b95b3e9be2aa06160310b05b..9bed087d766a64815dfcc7fb9ddddbe7d25cf472 100644 Binary files a/qa_mdt/audioldm_train/losses/__pycache__/contperceptual.cpython-310.pyc and b/qa_mdt/audioldm_train/losses/__pycache__/contperceptual.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/__pycache__/__init__.cpython-310.pyc b/qa_mdt/audioldm_train/modules/__pycache__/__init__.cpython-310.pyc index 42439fc436a7a6b66505b9040baacbd084bff473..d5e32dc82ee573e2f073678654adae844e912b3a 100644 Binary files a/qa_mdt/audioldm_train/modules/__pycache__/__init__.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/__pycache__/__init__.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/audiomae/__pycache__/AudioMAE.cpython-310.pyc b/qa_mdt/audioldm_train/modules/audiomae/__pycache__/AudioMAE.cpython-310.pyc index d9b0aaee3a9af7c4d3da073f7c2ba2d835537356..aefe084e238417d2d86aeb14aa0fce9bb4684280 100644 Binary files a/qa_mdt/audioldm_train/modules/audiomae/__pycache__/AudioMAE.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/audiomae/__pycache__/AudioMAE.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/audiomae/__pycache__/__init__.cpython-310.pyc b/qa_mdt/audioldm_train/modules/audiomae/__pycache__/__init__.cpython-310.pyc index 6e498fa70caf550841eacaf6df3dd52b1bb6dd8d..9e5b004d40df384d42a9a1998ae0047b250924dd 100644 Binary files 
a/qa_mdt/audioldm_train/modules/audiomae/__pycache__/__init__.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/audiomae/__pycache__/__init__.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/audiomae/__pycache__/models_mae.cpython-310.pyc b/qa_mdt/audioldm_train/modules/audiomae/__pycache__/models_mae.cpython-310.pyc index 156772b039bc7cfa9b9538fc0ca2b2fe9775af2f..2274737dcf2e4fa62239b21baad3732622150869 100644 Binary files a/qa_mdt/audioldm_train/modules/audiomae/__pycache__/models_mae.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/audiomae/__pycache__/models_mae.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/audiomae/__pycache__/models_vit.cpython-310.pyc b/qa_mdt/audioldm_train/modules/audiomae/__pycache__/models_vit.cpython-310.pyc index 0525dde7420ef0fd0ab0485041272fe4d6529f43..d5c794198a8a35519fbc3e6f26dddddad370121d 100644 Binary files a/qa_mdt/audioldm_train/modules/audiomae/__pycache__/models_vit.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/audiomae/__pycache__/models_vit.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/audiomae/sequence_gen/__pycache__/__init__.cpython-310.pyc b/qa_mdt/audioldm_train/modules/audiomae/sequence_gen/__pycache__/__init__.cpython-310.pyc index 11a0e39e7100cb0fb7a1195d08cab66d4c16bf34..069d3b0569312ce1ce75be74152f86cb1bdeb795 100644 Binary files a/qa_mdt/audioldm_train/modules/audiomae/sequence_gen/__pycache__/__init__.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/audiomae/sequence_gen/__pycache__/__init__.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/audiomae/sequence_gen/__pycache__/model.cpython-310.pyc b/qa_mdt/audioldm_train/modules/audiomae/sequence_gen/__pycache__/model.cpython-310.pyc index 7a3adf280f5e9d6295b4417cde18dbde348d9b94..ab5539f7bdba4e8c890a4beb6216b4b18aaea4c7 100644 Binary files a/qa_mdt/audioldm_train/modules/audiomae/sequence_gen/__pycache__/model.cpython-310.pyc and 
b/qa_mdt/audioldm_train/modules/audiomae/sequence_gen/__pycache__/model.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/audiomae/sequence_gen/__pycache__/sequence_input.cpython-310.pyc b/qa_mdt/audioldm_train/modules/audiomae/sequence_gen/__pycache__/sequence_input.cpython-310.pyc index b9642f32a570e9574eaca5b08457fdf5076ac490..34921e4de1c826181496b6d446c0ec4402b9acf2 100644 Binary files a/qa_mdt/audioldm_train/modules/audiomae/sequence_gen/__pycache__/sequence_input.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/audiomae/sequence_gen/__pycache__/sequence_input.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/audiomae/util/__pycache__/patch_embed.cpython-310.pyc b/qa_mdt/audioldm_train/modules/audiomae/util/__pycache__/patch_embed.cpython-310.pyc index a79d1452d19ec36eab0a727736962f6875a1168a..b12ecfda9cdc6a2d655241df219c7243f61a7a63 100644 Binary files a/qa_mdt/audioldm_train/modules/audiomae/util/__pycache__/patch_embed.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/audiomae/util/__pycache__/patch_embed.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/audiomae/util/__pycache__/pos_embed.cpython-310.pyc b/qa_mdt/audioldm_train/modules/audiomae/util/__pycache__/pos_embed.cpython-310.pyc index 3e424e9374743c7521a6a41aa11d69bf3952be9b..63c438b354c8670bc584c24606665e309d4fbeab 100644 Binary files a/qa_mdt/audioldm_train/modules/audiomae/util/__pycache__/pos_embed.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/audiomae/util/__pycache__/pos_embed.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/clap/__pycache__/__init__.cpython-310.pyc b/qa_mdt/audioldm_train/modules/clap/__pycache__/__init__.cpython-310.pyc index 6ce15db9ac6078e2450dfc8e284cc499e5e4fabd..594ba6d190b602fc83b91845b136dafe91d7f75f 100644 Binary files a/qa_mdt/audioldm_train/modules/clap/__pycache__/__init__.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/clap/__pycache__/__init__.cpython-310.pyc differ diff --git 
a/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/__init__.cpython-310.pyc b/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/__init__.cpython-310.pyc index 4af4d2e11e37923a70e8970b27ac4147220aca6b..3b355ecd7b77711043a1dc3e986cd361ac225fa6 100644 Binary files a/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/__init__.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/__init__.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/factory.cpython-310.pyc b/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/factory.cpython-310.pyc index f20f1762fa111b36ed2ee95a29d91e86ff0b63a7..6962979e2b163e9d11a63a6649e1ce8e8cb828f6 100644 Binary files a/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/factory.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/factory.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/feature_fusion.cpython-310.pyc b/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/feature_fusion.cpython-310.pyc index f1f429dfbe292da5246a7b54742dcf0a6daece66..2a9b8c9c22a8456119bd663cf9aa6b31f9737c8a 100644 Binary files a/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/feature_fusion.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/feature_fusion.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/htsat.cpython-310.pyc b/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/htsat.cpython-310.pyc index 998c43e4f049c7dcf43cc0cfa779bf09ffdfe0f2..333927083332cc976a017c19788dedfbe8ce28ee 100644 Binary files a/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/htsat.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/htsat.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/loss.cpython-310.pyc 
b/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/loss.cpython-310.pyc index 4e4335954faa0b94dc6cae28c86ef50177dfee39..20b10c0ab946a676e1fb8aaff43c608dcaf5b1dc 100644 Binary files a/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/loss.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/loss.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/model.cpython-310.pyc b/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/model.cpython-310.pyc index 153564d5c39b30f0c83a6c74e3b9ce1b8ae6f2a2..5140aafb1e546563890e2540cb7fe3209534a89b 100644 Binary files a/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/model.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/model.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/openai.cpython-310.pyc b/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/openai.cpython-310.pyc index 193ade09157cc5c8e846c16433cfcf046a651257..a2676ed03d30c8bb62a3763e9048eb7a126f4d58 100644 Binary files a/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/openai.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/openai.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/pann_model.cpython-310.pyc b/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/pann_model.cpython-310.pyc index a5e8465a30943f3927266c81f7da766efd39d5d8..52df96176ebd267f83089dc2c2884dbc2e01dc37 100644 Binary files a/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/pann_model.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/pann_model.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/pretrained.cpython-310.pyc b/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/pretrained.cpython-310.pyc index 
cf08c9bf6fe0e4483cbefa02628f934c813a3653..1659ac92f080e1c3ee84f4b07ac6cb00f587816e 100644 Binary files a/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/pretrained.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/pretrained.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/timm_model.cpython-310.pyc b/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/timm_model.cpython-310.pyc index 8c06fa3d2153d2dd5f6d07ab9d0d05ff11497fc0..de707acd5432186d69c9b1cc6a674e207a540f80 100644 Binary files a/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/timm_model.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/timm_model.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/tokenizer.cpython-310.pyc b/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/tokenizer.cpython-310.pyc index 31ecca34d6550e57e48bef091a8361e0c6c0346c..b92a4387283921afb9b4bc9667f71d5128fca82a 100644 Binary files a/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/tokenizer.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/tokenizer.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/transform.cpython-310.pyc b/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/transform.cpython-310.pyc index eb29f3b57b17bea42078003fd83d4ecc3ae459fa..1c67b7b2bb89e04275e896f7117bcb1f6058031a 100644 Binary files a/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/transform.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/transform.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/utils.cpython-310.pyc b/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/utils.cpython-310.pyc index a338f120c6d3827f4209da224e9750c9ca4558aa..75a0e9572dd0eaff0139e49bc78ed1ced4c6c9c3 100644 Binary files 
a/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/utils.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/clap/open_clip/__pycache__/utils.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/clap/training/__pycache__/__init__.cpython-310.pyc b/qa_mdt/audioldm_train/modules/clap/training/__pycache__/__init__.cpython-310.pyc index eb3cf8aef7f3ed6d9195011231eaaf46de4ffdaf..dc2fdc418b25aed896f0f7677aec0f51da7c25c4 100644 Binary files a/qa_mdt/audioldm_train/modules/clap/training/__pycache__/__init__.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/clap/training/__pycache__/__init__.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/clap/training/__pycache__/data.cpython-310.pyc b/qa_mdt/audioldm_train/modules/clap/training/__pycache__/data.cpython-310.pyc index 4960916871130811fbed19d913c5132c60a9700f..bb87db53cc1ad9da1aeab5f227bc62fd77025b3c 100644 Binary files a/qa_mdt/audioldm_train/modules/clap/training/__pycache__/data.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/clap/training/__pycache__/data.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/clap/training/__pycache__/params.cpython-310.pyc b/qa_mdt/audioldm_train/modules/clap/training/__pycache__/params.cpython-310.pyc index 808b504ed4842c517aa54b135411c561a6a253c3..1505a22727a7b46829e3f86c268eb90bc5857a00 100644 Binary files a/qa_mdt/audioldm_train/modules/clap/training/__pycache__/params.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/clap/training/__pycache__/params.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/diffusionmodules/__pycache__/PixArt.cpython-310.pyc b/qa_mdt/audioldm_train/modules/diffusionmodules/__pycache__/PixArt.cpython-310.pyc index 87088ed0906ce005870deff18185dc3ce166cdc5..fe68b4eab041a7a2afb2635eda7dff97f25cbb93 100644 Binary files a/qa_mdt/audioldm_train/modules/diffusionmodules/__pycache__/PixArt.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/diffusionmodules/__pycache__/PixArt.cpython-310.pyc 
differ diff --git a/qa_mdt/audioldm_train/modules/diffusionmodules/__pycache__/PixArt_blocks.cpython-310.pyc b/qa_mdt/audioldm_train/modules/diffusionmodules/__pycache__/PixArt_blocks.cpython-310.pyc index 31c5077345eb8e7e6a6be1df6b5ea8b37b68ec54..da65a3530720e79d0ef68f2b6baa7e419caef82e 100644 Binary files a/qa_mdt/audioldm_train/modules/diffusionmodules/__pycache__/PixArt_blocks.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/diffusionmodules/__pycache__/PixArt_blocks.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/diffusionmodules/__pycache__/__init__.cpython-310.pyc b/qa_mdt/audioldm_train/modules/diffusionmodules/__pycache__/__init__.cpython-310.pyc index 7a14fb13a2781f1af4238b7a00f8934b53030f05..b1c97313aab548f4d39d6eaca8d6b91714c0a3b3 100644 Binary files a/qa_mdt/audioldm_train/modules/diffusionmodules/__pycache__/__init__.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/diffusionmodules/__pycache__/__init__.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/diffusionmodules/__pycache__/attention.cpython-310.pyc b/qa_mdt/audioldm_train/modules/diffusionmodules/__pycache__/attention.cpython-310.pyc index 3d3e9143b7b10572ada8457fa2b542e99d45abb8..9108bfcd77e80afb1ad47af3c26301cd458309bb 100644 Binary files a/qa_mdt/audioldm_train/modules/diffusionmodules/__pycache__/attention.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/diffusionmodules/__pycache__/attention.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/diffusionmodules/__pycache__/distributions.cpython-310.pyc b/qa_mdt/audioldm_train/modules/diffusionmodules/__pycache__/distributions.cpython-310.pyc index c850006ea6798f724a9a47652f6b86c0938a1ec9..9795d88c2c9f974a261efcb90322c93e8fb19fc0 100644 Binary files a/qa_mdt/audioldm_train/modules/diffusionmodules/__pycache__/distributions.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/diffusionmodules/__pycache__/distributions.cpython-310.pyc differ diff --git 
a/qa_mdt/audioldm_train/modules/diffusionmodules/__pycache__/ema.cpython-310.pyc b/qa_mdt/audioldm_train/modules/diffusionmodules/__pycache__/ema.cpython-310.pyc index b14ceb8d176b17356f281c43925ba09b87f9a377..5e2d8104015c59f0e1cd1293ab794db61a66bcc6 100644 Binary files a/qa_mdt/audioldm_train/modules/diffusionmodules/__pycache__/ema.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/diffusionmodules/__pycache__/ema.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/diffusionmodules/__pycache__/model.cpython-310.pyc b/qa_mdt/audioldm_train/modules/diffusionmodules/__pycache__/model.cpython-310.pyc index 3e95dcd5f6d13f2e4bd136faa8b4dc0dde2d3751..59dcfbebf2268e051d2b0910cbd128efe1a01024 100644 Binary files a/qa_mdt/audioldm_train/modules/diffusionmodules/__pycache__/model.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/diffusionmodules/__pycache__/model.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/diffusionmodules/__pycache__/utils.cpython-310.pyc b/qa_mdt/audioldm_train/modules/diffusionmodules/__pycache__/utils.cpython-310.pyc index 5a13dc54209148099b09d0a265e05082ced30274..734313d6c84c57d79bbc22cf8a541c28466e35fe 100644 Binary files a/qa_mdt/audioldm_train/modules/diffusionmodules/__pycache__/utils.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/diffusionmodules/__pycache__/utils.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/hifigan/__pycache__/__init__.cpython-310.pyc b/qa_mdt/audioldm_train/modules/hifigan/__pycache__/__init__.cpython-310.pyc index 64715afed537f34222684f5a10d014e268750c93..ba42d96dc6737ae62cdc7121f8b003f8ab3a2a9a 100644 Binary files a/qa_mdt/audioldm_train/modules/hifigan/__pycache__/__init__.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/hifigan/__pycache__/__init__.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/hifigan/__pycache__/models.cpython-310.pyc b/qa_mdt/audioldm_train/modules/hifigan/__pycache__/models.cpython-310.pyc index 
fa5d792aa66126ecbe39338012f577246287a8f2..4eb4e5d6b0f6240e2f0e6ed1f7529e52118a6fe6 100644 Binary files a/qa_mdt/audioldm_train/modules/hifigan/__pycache__/models.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/hifigan/__pycache__/models.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/hifigan/__pycache__/models_hifires.cpython-310.pyc b/qa_mdt/audioldm_train/modules/hifigan/__pycache__/models_hifires.cpython-310.pyc index a68aad26fb8d897a3d76ce07c118d8b8d8327d9a..9c703b96e19be433b9dc98445010f65914b478a1 100644 Binary files a/qa_mdt/audioldm_train/modules/hifigan/__pycache__/models_hifires.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/hifigan/__pycache__/models_hifires.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/latent_diffusion/__pycache__/__init__.cpython-310.pyc b/qa_mdt/audioldm_train/modules/latent_diffusion/__pycache__/__init__.cpython-310.pyc index 48bde932889f0e62a098a9461acf6f7e5bbfdc3b..5bc7611d3c9a0cdae49fc53c98b4f7615da8a8b8 100644 Binary files a/qa_mdt/audioldm_train/modules/latent_diffusion/__pycache__/__init__.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/latent_diffusion/__pycache__/__init__.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/latent_diffusion/__pycache__/ddim.cpython-310.pyc b/qa_mdt/audioldm_train/modules/latent_diffusion/__pycache__/ddim.cpython-310.pyc index d17d44f0e6647c135f0c1a8248ddba0fa56615df..19b4f358e44d614445bd0b6fbd31624957baef57 100644 Binary files a/qa_mdt/audioldm_train/modules/latent_diffusion/__pycache__/ddim.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/latent_diffusion/__pycache__/ddim.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/latent_diffusion/__pycache__/ddpm.cpython-310.pyc b/qa_mdt/audioldm_train/modules/latent_diffusion/__pycache__/ddpm.cpython-310.pyc index e43d5bf13ac9105f20999148b37d71d10e0e7601..4f3191b0864f74d75803a88824c8fe708080fa46 100644 Binary files 
a/qa_mdt/audioldm_train/modules/latent_diffusion/__pycache__/ddpm.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/latent_diffusion/__pycache__/ddpm.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/latent_diffusion/__pycache__/plms.cpython-310.pyc b/qa_mdt/audioldm_train/modules/latent_diffusion/__pycache__/plms.cpython-310.pyc index 2831483b0bca206eb231860bd6802aaf088fbf02..e1a9a3917b50aabb046d7ebff49cdbfc031a5ed5 100644 Binary files a/qa_mdt/audioldm_train/modules/latent_diffusion/__pycache__/plms.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/latent_diffusion/__pycache__/plms.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/latent_encoder/__pycache__/__init__.cpython-310.pyc b/qa_mdt/audioldm_train/modules/latent_encoder/__pycache__/__init__.cpython-310.pyc index 1aee62a07126d07ae85185ea2408543b0fb9bbfb..f9bd87db907df28b5871ba63748f9b5e47c832da 100644 Binary files a/qa_mdt/audioldm_train/modules/latent_encoder/__pycache__/__init__.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/latent_encoder/__pycache__/__init__.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/latent_encoder/__pycache__/autoencoder.cpython-310.pyc b/qa_mdt/audioldm_train/modules/latent_encoder/__pycache__/autoencoder.cpython-310.pyc index c7b983ab6106b8157251358255a73146d9bf2521..a4b9ed02a871f898c9563d30e7e8749eac98af52 100644 Binary files a/qa_mdt/audioldm_train/modules/latent_encoder/__pycache__/autoencoder.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/latent_encoder/__pycache__/autoencoder.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/phoneme_encoder/__pycache__/__init__.cpython-310.pyc b/qa_mdt/audioldm_train/modules/phoneme_encoder/__pycache__/__init__.cpython-310.pyc index 244cdedc3e3f1a652631b0561118ccdeabcca109..e4f233d5695aa218b417ba1604abf1f580e01b11 100644 Binary files a/qa_mdt/audioldm_train/modules/phoneme_encoder/__pycache__/__init__.cpython-310.pyc and 
b/qa_mdt/audioldm_train/modules/phoneme_encoder/__pycache__/__init__.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/phoneme_encoder/__pycache__/attentions.cpython-310.pyc b/qa_mdt/audioldm_train/modules/phoneme_encoder/__pycache__/attentions.cpython-310.pyc index 2bcf522a98278855471e3b058f19b470d1892f39..c65b61331a4f2a7e76bf3da8845a0568c611fbb3 100644 Binary files a/qa_mdt/audioldm_train/modules/phoneme_encoder/__pycache__/attentions.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/phoneme_encoder/__pycache__/attentions.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/phoneme_encoder/__pycache__/commons.cpython-310.pyc b/qa_mdt/audioldm_train/modules/phoneme_encoder/__pycache__/commons.cpython-310.pyc index 359028b5c3c61d5a51af286b971c854648432f44..ea0d20a41ee14c3bbde93b053ab393be17822f64 100644 Binary files a/qa_mdt/audioldm_train/modules/phoneme_encoder/__pycache__/commons.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/phoneme_encoder/__pycache__/commons.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/modules/phoneme_encoder/__pycache__/encoder.cpython-310.pyc b/qa_mdt/audioldm_train/modules/phoneme_encoder/__pycache__/encoder.cpython-310.pyc index 8baf833e872f99cef613d65009e98dccefc51f73..cce4178e194a835f90a3182a27dce7485e787427 100644 Binary files a/qa_mdt/audioldm_train/modules/phoneme_encoder/__pycache__/encoder.cpython-310.pyc and b/qa_mdt/audioldm_train/modules/phoneme_encoder/__pycache__/encoder.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/utilities/__pycache__/__init__.cpython-310.pyc b/qa_mdt/audioldm_train/utilities/__pycache__/__init__.cpython-310.pyc index 7b037b40dfa0c2baa738d45ccb42fd2c90a353ff..9cb3b0705499671cc49013332ce30c3964fffff5 100644 Binary files a/qa_mdt/audioldm_train/utilities/__pycache__/__init__.cpython-310.pyc and b/qa_mdt/audioldm_train/utilities/__pycache__/__init__.cpython-310.pyc differ diff --git 
a/qa_mdt/audioldm_train/utilities/__pycache__/diffusion_util.cpython-310.pyc b/qa_mdt/audioldm_train/utilities/__pycache__/diffusion_util.cpython-310.pyc index 6fd1a80b208236e631c32ae6372ed1b8f7737933..74c2f803a9c543241f875efccc81cb2b7e3c0fec 100644 Binary files a/qa_mdt/audioldm_train/utilities/__pycache__/diffusion_util.cpython-310.pyc and b/qa_mdt/audioldm_train/utilities/__pycache__/diffusion_util.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/utilities/__pycache__/model_util.cpython-310.pyc b/qa_mdt/audioldm_train/utilities/__pycache__/model_util.cpython-310.pyc index 6e5e7e3ba29df1fb0ec21b7acf0d1ef03240cc35..822a4c4d5307a1448e1b2d453333c997982fa9ae 100644 Binary files a/qa_mdt/audioldm_train/utilities/__pycache__/model_util.cpython-310.pyc and b/qa_mdt/audioldm_train/utilities/__pycache__/model_util.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/utilities/__pycache__/tools.cpython-310.pyc b/qa_mdt/audioldm_train/utilities/__pycache__/tools.cpython-310.pyc index 293c4fad07c180ac71ab50dee2a0fec354894c68..2751094d5a54abe2074634d5f545b5640d806fa9 100644 Binary files a/qa_mdt/audioldm_train/utilities/__pycache__/tools.cpython-310.pyc and b/qa_mdt/audioldm_train/utilities/__pycache__/tools.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/utilities/audio/__pycache__/__init__.cpython-310.pyc b/qa_mdt/audioldm_train/utilities/audio/__pycache__/__init__.cpython-310.pyc index f1cbdeae80632982d98b13d7d5d79cdfaeee09f6..3fa2c0382fac2e7c9e3dbe828f993c9c218083bd 100644 Binary files a/qa_mdt/audioldm_train/utilities/audio/__pycache__/__init__.cpython-310.pyc and b/qa_mdt/audioldm_train/utilities/audio/__pycache__/__init__.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/utilities/audio/__pycache__/audio_processing.cpython-310.pyc b/qa_mdt/audioldm_train/utilities/audio/__pycache__/audio_processing.cpython-310.pyc index f61af9afd7fbc21fe548ce0f9a6d69a1537f9125..4bd1d4ba2915c0e9c9810266481c69e6bf64577a 100644 Binary files 
a/qa_mdt/audioldm_train/utilities/audio/__pycache__/audio_processing.cpython-310.pyc and b/qa_mdt/audioldm_train/utilities/audio/__pycache__/audio_processing.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/utilities/audio/__pycache__/stft.cpython-310.pyc b/qa_mdt/audioldm_train/utilities/audio/__pycache__/stft.cpython-310.pyc index ed709bd478dd32a9c78eb81c3f59e34d72db857d..a676b1a9a4c5a46eb9e82d524ff2750765309538 100644 Binary files a/qa_mdt/audioldm_train/utilities/audio/__pycache__/stft.cpython-310.pyc and b/qa_mdt/audioldm_train/utilities/audio/__pycache__/stft.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/utilities/audio/__pycache__/tools.cpython-310.pyc b/qa_mdt/audioldm_train/utilities/audio/__pycache__/tools.cpython-310.pyc index 75107b27d52b64a5ad65db40c89021d3e012ddb5..00ac082af47e99c74451035064859125add2fd5c 100644 Binary files a/qa_mdt/audioldm_train/utilities/audio/__pycache__/tools.cpython-310.pyc and b/qa_mdt/audioldm_train/utilities/audio/__pycache__/tools.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/utilities/data/__pycache__/__init__.cpython-310.pyc b/qa_mdt/audioldm_train/utilities/data/__pycache__/__init__.cpython-310.pyc index f239c760bb9c80e40b84532f1bc55eda931150e3..b61a24306dc6547b206fd48bbfc63ce51dfae3ea 100644 Binary files a/qa_mdt/audioldm_train/utilities/data/__pycache__/__init__.cpython-310.pyc and b/qa_mdt/audioldm_train/utilities/data/__pycache__/__init__.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/utilities/data/__pycache__/dataset.cpython-310.pyc b/qa_mdt/audioldm_train/utilities/data/__pycache__/dataset.cpython-310.pyc index c1f78f9757f6cce6ca5386e9f0bc0d8e992dd51b..05874af9676b6aca686e35459928082f4669a8cd 100644 Binary files a/qa_mdt/audioldm_train/utilities/data/__pycache__/dataset.cpython-310.pyc and b/qa_mdt/audioldm_train/utilities/data/__pycache__/dataset.cpython-310.pyc differ diff --git a/qa_mdt/audioldm_train/utilities/data/__pycache__/dataset_original_mos5.cpython-310.pyc 
b/qa_mdt/audioldm_train/utilities/data/__pycache__/dataset_original_mos5.cpython-310.pyc index d7e0a233cf8c50fa4746f259de49891ec0ef19e0..a68dc791527e3fb0bbfa51309f83c8d671ef8c59 100644 Binary files a/qa_mdt/audioldm_train/utilities/data/__pycache__/dataset_original_mos5.cpython-310.pyc and b/qa_mdt/audioldm_train/utilities/data/__pycache__/dataset_original_mos5.cpython-310.pyc differ diff --git a/qa_mdt/infer/__pycache__/infer_mos5.cpython-310.pyc b/qa_mdt/infer/__pycache__/infer_mos5.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..76bcd58226c974dbeeb7bc5c33c09d59c00a7ac8 Binary files /dev/null and b/qa_mdt/infer/__pycache__/infer_mos5.cpython-310.pyc differ diff --git a/taming/modules/autoencoder/lpips/vgg.pth b/taming/modules/autoencoder/lpips/vgg.pth new file mode 100644 index 0000000000000000000000000000000000000000..f57dcf5cc764d61c8a460365847fb2137ff0a62d --- /dev/null +++ b/taming/modules/autoencoder/lpips/vgg.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a78928a0af1e5f0fcb1f3b9e8f8c3a2a5a3de244d830ad5c1feddc79b8432868 +size 7289