Spaces:
Running
on
L4
Running
on
L4
import importlib | |
from inspect import isfunction | |
import os | |
import soundfile as sf | |
def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Value passed to ``random.seed``, ``np.random.seed``,
            ``torch.manual_seed`` and ``torch.cuda.manual_seed_all``; its
            string form is also stored in ``PYTHONHASHSEED``.
    """
    # Imports stay local to the function, as in the original implementation.
    import os
    import random

    import numpy as np
    import torch

    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device, not only the current one; this call is
    # silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may select different kernels from run to run,
    # so it must be off for the deterministic setting above to be effective.
    torch.backends.cudnn.benchmark = False
def save_wave(waveform, savepath, name="outwav", samplerate=16000):
    """Write every item of a waveform batch to ``<savepath>/<stem>_<index>.wav``.

    Args:
        waveform: Batch indexed as ``waveform[i, 0]``; ``waveform.shape[0]``
            is the number of files written.
            (assumes a (batch, channel, samples) array/tensor - TODO confirm)
        savepath: Directory the files are written into. It is not created
            here.
        name: Either a single name used for every item, or a list/tuple
            holding one name per item. Only the basename is used and a
            trailing ``.wav`` extension is removed before the index suffix
            is appended.
        samplerate: Sample rate forwarded to ``sf.write``. Defaults to 16000,
            the value that was previously hard-coded.
    """
    batch_size = waveform.shape[0]
    # Broadcast a single name to the whole batch.
    if not isinstance(name, (list, tuple)):
        name = [name] * batch_size

    for i in range(batch_size):
        stem = os.path.basename(name[i])
        # Strip only the extension: splitting on the first "." would also
        # cut dotted names such as "take.2.wav" down to "take".
        if stem.endswith(".wav"):
            stem = stem[: -len(".wav")]
        path = os.path.join(savepath, f"{stem}_{i}.wav")
        sf.write(path, waveform[i, 0], samplerate=samplerate)
def exists(x):
    """Return ``True`` for any value other than ``None``, else ``False``."""
    if x is None:
        return False
    return True
def default(val, d):
    """Return ``val`` when it is set, otherwise the fallback ``d``.

    Args:
        val: Candidate value; returned unchanged when ``exists(val)`` is true.
        d: Fallback. When ``inspect.isfunction(d)`` is true it is called with
            no arguments and its result is returned; any other object is
            returned as-is.
    """
    if not exists(val):
        if isfunction(d):
            return d()
        return d
    return val
def count_params(model, verbose=False):
    """Return the summed ``numel()`` of everything in ``model.parameters()``.

    Args:
        model: Object exposing ``parameters()`` whose items provide
            ``numel()``.
        verbose: When true, also print the model's class name together with
            the count expressed in millions.
    """
    total_params = 0
    for param in model.parameters():
        total_params += param.numel()

    if verbose:
        print(f"{model.__class__.__name__} has {total_params * 1.e-6:.2f} M params.")

    return total_params
def get_obj_from_str(string, reload=False):
    """Resolve a dotted path such as ``"package.module.Name"`` to an object.

    Args:
        string: Dotted path; everything before the last ``.`` is imported as
            a module and the final component is looked up on it.
        reload: When true, the module is reloaded with ``importlib.reload``
            before the attribute lookup.

    Returns:
        The attribute named by the last path component.
    """
    module_path, attr_name = string.rsplit(".", 1)

    if reload:
        importlib.reload(importlib.import_module(module_path))

    module_obj = importlib.import_module(module_path, package=None)
    return getattr(module_obj, attr_name)
def instantiate_from_config(config):
    """Build the object described by ``config``.

    Args:
        config: Mapping with a ``"target"`` dotted path and an optional
            ``"params"`` mapping of keyword arguments, or one of the marker
            strings ``"__is_first_stage__"`` / ``"__is_unconditional__"``.

    Returns:
        The result of calling the resolved target with ``**params``, or
        ``None`` for either marker string.

    Raises:
        KeyError: If ``config`` has no ``"target"`` and is not a marker.
    """
    if "target" in config:
        factory = get_obj_from_str(config["target"])
        kwargs = config.get("params", dict())
        return factory(**kwargs)

    if config == "__is_first_stage__" or config == "__is_unconditional__":
        return None

    raise KeyError("Expected key `target` to instantiate.")
def default_audioldm_config(model_name="audioldm-s-full"):
    """Return a freshly built default AudioLDM configuration dictionary.

    Args:
        model_name: Model identifier. A name containing ``"-l-"`` raises the
            UNet ``model_channels`` to 256 and ``num_head_channels`` to 64;
            otherwise a name containing ``"-m-"`` sets ``model_channels`` to
            192 and adds ``"amodel": "HTSAT-base"`` to the conditioning-stage
            params. Any other name keeps the base values.

    Returns:
        Nested dict with the top-level keys ``"wave_file_save_path"``,
        ``"id"``, ``"preprocessing"`` and ``"model"``.
    """
    preprocessing = {
        "audio": {"sampling_rate": 16000, "max_wav_value": 32768},
        "stft": {"filter_length": 1024, "hop_length": 160, "win_length": 1024},
        "mel": {
            "n_mel_channels": 64,
            "mel_fmin": 0,
            "mel_fmax": 8000,
            "freqm": 0,
            "timem": 0,
            "blur": False,
            "mean": -4.63,
            "std": 2.74,
            "target_length": 1024,
        },
    }

    # Kept as named locals because the size-specific overrides below
    # mutate exactly these two mappings.
    unet_params = {
        "image_size": 64,
        "extra_film_condition_dim": 512,
        "extra_film_use_concat": True,
        "in_channels": 8,
        "out_channels": 8,
        "model_channels": 128,
        "attention_resolutions": [8, 4, 2],
        "num_res_blocks": 2,
        "channel_mult": [1, 2, 3, 5],
        "num_head_channels": 32,
        "use_spatial_transformer": True,
    }
    cond_stage_params = {
        "key": "waveform",
        "sampling_rate": 16000,
        "embed_mode": "audio",
        "unconditional_prob": 0.1,
    }

    first_stage_config = {
        "base_learning_rate": 4.5e-05,
        "target": "audioldm.variational_autoencoder.autoencoder.AutoencoderKL",
        "params": {
            "monitor": "val/rec_loss",
            "image_key": "fbank",
            "subband": 1,
            "embed_dim": 8,
            "time_shuffle": 1,
            "ddconfig": {
                "double_z": True,
                "z_channels": 8,
                "resolution": 256,
                "downsample_time": False,
                "in_channels": 1,
                "out_ch": 1,
                "ch": 128,
                "ch_mult": [1, 2, 4],
                "num_res_blocks": 2,
                "attn_resolutions": [],
                "dropout": 0.0,
            },
        },
    }

    model_params = {
        "base_learning_rate": 5e-06,
        "linear_start": 0.0015,
        "linear_end": 0.0195,
        "num_timesteps_cond": 1,
        "log_every_t": 200,
        "timesteps": 1000,
        "first_stage_key": "fbank",
        "cond_stage_key": "waveform",
        "latent_t_size": 256,
        "latent_f_size": 16,
        "channels": 8,
        "cond_stage_trainable": True,
        "conditioning_key": "film",
        "monitor": "val/loss_simple_ema",
        "scale_by_std": True,
        "unet_config": {
            "target": "audioldm.latent_diffusion.openaimodel.UNetModel",
            "params": unet_params,
        },
        "first_stage_config": first_stage_config,
        "cond_stage_config": {
            "target": "audioldm.clap.encoders.CLAPAudioEmbeddingClassifierFreev2",
            "params": cond_stage_params,
        },
    }

    basic_config = {
        "wave_file_save_path": "./output",
        "id": {
            "version": "v1",
            "name": "default",
            "root": "/mnt/fast/nobackup/users/hl01486/projects/general_audio_generation/AudioLDM-python/config/default/latent_diffusion.yaml",
        },
        "preprocessing": preprocessing,
        "model": {
            "device": "cuda",
            "target": "audioldm.pipline.LatentDiffusion",
            "params": model_params,
        },
    }

    # Size-specific overrides selected by a substring of the model name.
    if "-l-" in model_name:
        unet_params["model_channels"] = 256
        unet_params["num_head_channels"] = 64
    elif "-m-" in model_name:
        unet_params["model_channels"] = 192
        # This variant uses a larger HTSAT audio model.
        cond_stage_params["amodel"] = "HTSAT-base"

    return basic_config