Spaces:

amphion
/

PicoAudio

Running on Zero

App Files Files Community

ZeyuXie committed on Jul 16, 2024

Commit

8c1bf05

verified ·

1 Parent(s): 8f85e3b

Upload 167 files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

audioldm/__init__.py +8 -0
audioldm/__main__.py +183 -0
audioldm/__pycache__/__init__.cpython-310.pyc +0 -0
audioldm/__pycache__/__init__.cpython-37.pyc +0 -0
audioldm/__pycache__/__init__.cpython-39.pyc +0 -0
audioldm/__pycache__/ldm.cpython-310.pyc +0 -0
audioldm/__pycache__/ldm.cpython-37.pyc +0 -0
audioldm/__pycache__/ldm.cpython-39.pyc +0 -0
audioldm/__pycache__/pipeline.cpython-310.pyc +0 -0
audioldm/__pycache__/pipeline.cpython-37.pyc +0 -0
audioldm/__pycache__/pipeline.cpython-39.pyc +0 -0
audioldm/__pycache__/utils.cpython-310.pyc +0 -0
audioldm/__pycache__/utils.cpython-37.pyc +0 -0
audioldm/__pycache__/utils.cpython-39.pyc +0 -0
audioldm/audio/__init__.py +2 -0
audioldm/audio/__pycache__/__init__.cpython-310.pyc +0 -0
audioldm/audio/__pycache__/__init__.cpython-37.pyc +0 -0
audioldm/audio/__pycache__/__init__.cpython-39.pyc +0 -0
audioldm/audio/__pycache__/audio_processing.cpython-310.pyc +0 -0
audioldm/audio/__pycache__/audio_processing.cpython-37.pyc +0 -0
audioldm/audio/__pycache__/audio_processing.cpython-39.pyc +0 -0
audioldm/audio/__pycache__/mix.cpython-39.pyc +0 -0
audioldm/audio/__pycache__/stft.cpython-310.pyc +0 -0
audioldm/audio/__pycache__/stft.cpython-37.pyc +0 -0
audioldm/audio/__pycache__/stft.cpython-39.pyc +0 -0
audioldm/audio/__pycache__/tools.cpython-310.pyc +0 -0
audioldm/audio/__pycache__/tools.cpython-37.pyc +0 -0
audioldm/audio/__pycache__/tools.cpython-39.pyc +0 -0
audioldm/audio/__pycache__/torch_tools.cpython-39.pyc +0 -0
audioldm/audio/audio_processing.py +100 -0
audioldm/audio/stft.py +186 -0
audioldm/audio/tools.py +85 -0
audioldm/clap/__init__.py +0 -0
audioldm/clap/__pycache__/__init__.cpython-39.pyc +0 -0
audioldm/clap/__pycache__/encoders.cpython-39.pyc +0 -0
audioldm/clap/encoders.py +170 -0
audioldm/clap/open_clip/__init__.py +25 -0
audioldm/clap/open_clip/__pycache__/__init__.cpython-39.pyc +0 -0
audioldm/clap/open_clip/__pycache__/factory.cpython-39.pyc +0 -0
audioldm/clap/open_clip/__pycache__/feature_fusion.cpython-39.pyc +0 -0
audioldm/clap/open_clip/__pycache__/htsat.cpython-39.pyc +0 -0
audioldm/clap/open_clip/__pycache__/loss.cpython-39.pyc +0 -0
audioldm/clap/open_clip/__pycache__/model.cpython-39.pyc +0 -0
audioldm/clap/open_clip/__pycache__/openai.cpython-39.pyc +0 -0
audioldm/clap/open_clip/__pycache__/pann_model.cpython-39.pyc +0 -0
audioldm/clap/open_clip/__pycache__/pretrained.cpython-39.pyc +0 -0
audioldm/clap/open_clip/__pycache__/timm_model.cpython-39.pyc +0 -0
audioldm/clap/open_clip/__pycache__/tokenizer.cpython-39.pyc +0 -0
audioldm/clap/open_clip/__pycache__/transform.cpython-39.pyc +0 -0
audioldm/clap/open_clip/__pycache__/utils.cpython-39.pyc +0 -0

audioldm/__init__.py ADDED Viewed

	@@ -0,0 +1,8 @@

+from .ldm import LatentDiffusion
+from .utils import seed_everything, save_wave, get_time, get_duration
+from .pipeline import *

audioldm/__main__.py ADDED Viewed

	@@ -0,0 +1,183 @@

+#!/usr/bin/python3
+import os
+from audioldm import text_to_audio, style_transfer, build_model, save_wave, get_time, round_up_duration, get_duration
+import argparse
+CACHE_DIR = os.getenv(
+    "AUDIOLDM_CACHE_DIR",
+    os.path.join(os.path.expanduser("~"), ".cache/audioldm"))
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--mode",
+    type=str,
+    required=False,
+    default="generation",
+    help="generation: text-to-audio generation; transfer: style transfer",
+    choices=["generation", "transfer"]
+)
+parser.add_argument(
+    "-t",
+    "--text",
+    type=str,
+    required=False,
+    default="",
+    help="Text prompt to the model for audio generation",
+)
+parser.add_argument(
+    "-f",
+    "--file_path",
+    type=str,
+    required=False,
+    default=None,
+    help="(--mode transfer): Original audio file for style transfer; Or (--mode generation): the guidance audio file for generating simialr audio",
+)
+parser.add_argument(
+    "--transfer_strength",
+    type=float,
+    required=False,
+    default=0.5,
+    help="A value between 0 and 1. 0 means original audio without transfer, 1 means completely transfer to the audio indicated by text",
+)
+parser.add_argument(
+    "-s",
+    "--save_path",
+    type=str,
+    required=False,
+    help="The path to save model output",
+    default="./output",
+)
+parser.add_argument(
+    "--model_name",
+    type=str,
+    required=False,
+    help="The checkpoint you gonna use",
+    default="audioldm-s-full",
+    choices=["audioldm-s-full", "audioldm-l-full", "audioldm-s-full-v2"]
+)
+parser.add_argument(
+    "-ckpt",
+    "--ckpt_path",
+    type=str,
+    required=False,
+    help="The path to the pretrained .ckpt model",
+    default=None,
+)
+parser.add_argument(
+    "-b",
+    "--batchsize",
+    type=int,
+    required=False,
+    default=1,
+    help="Generate how many samples at the same time",
+)
+parser.add_argument(
+    "--ddim_steps",
+    type=int,
+    required=False,
+    default=200,
+    help="The sampling step for DDIM",
+)
+parser.add_argument(
+    "-gs",
+    "--guidance_scale",
+    type=float,
+    required=False,
+    default=2.5,
+    help="Guidance scale (Large => better quality and relavancy to text; Small => better diversity)",
+)
+parser.add_argument(
+    "-dur",
+    "--duration",
+    type=float,
+    required=False,
+    default=10.0,
+    help="The duration of the samples",
+)
+parser.add_argument(
+    "-n",
+    "--n_candidate_gen_per_text",
+    type=int,
+    required=False,
+    default=3,
+    help="Automatic quality control. This number control the number of candidates (e.g., generate three audios and choose the best to show you). A Larger value usually lead to better quality with heavier computation",
+)
+parser.add_argument(
+    "--seed",
+    type=int,
+    required=False,
+    default=42,
+    help="Change this value (any integer number) will lead to a different generation result.",
+)
+args = parser.parse_args()
+if(args.ckpt_path is not None):
+    print("Warning: ckpt_path has no effect after version 0.0.20.")
+assert args.duration % 2.5 == 0, "Duration must be a multiple of 2.5"
+mode = args.mode
+if(mode == "generation" and args.file_path is not None):
+    mode = "generation_audio_to_audio"
+    if(len(args.text) > 0):
+        print("Warning: You have specified the --file_path. --text will be ignored")
+        args.text = ""
+save_path = os.path.join(args.save_path, mode)
+if(args.file_path is not None):
+    save_path = os.path.join(save_path, os.path.basename(args.file_path.split(".")[0]))
+text = args.text
+random_seed = args.seed
+duration = args.duration
+guidance_scale = args.guidance_scale
+n_candidate_gen_per_text = args.n_candidate_gen_per_text
+os.makedirs(save_path, exist_ok=True)
+audioldm = build_model(model_name=args.model_name)
+if(args.mode == "generation"):
+    waveform = text_to_audio(
+        audioldm,
+        text,
+        args.file_path,
+        random_seed,
+        duration=duration,
+        guidance_scale=guidance_scale,
+        ddim_steps=args.ddim_steps,
+        n_candidate_gen_per_text=n_candidate_gen_per_text,
+        batchsize=args.batchsize,
+    )
+elif(args.mode == "transfer"):
+    assert args.file_path is not None
+    assert os.path.exists(args.file_path), "The original audio file \'%s\' for style transfer does not exist." % args.file_path
+    waveform = style_transfer(
+        audioldm,
+        text,
+        args.file_path,
+        args.transfer_strength,
+        random_seed,
+        duration=duration,
+        guidance_scale=guidance_scale,
+        ddim_steps=args.ddim_steps,
+        batchsize=args.batchsize,
+    )
+    waveform = waveform[:,None,:]
+save_wave(waveform, save_path, name="%s_%s" % (get_time(), text))

audioldm/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (376 Bytes). View file

audioldm/__pycache__/__init__.cpython-37.pyc ADDED Viewed

Binary file (290 Bytes). View file

audioldm/__pycache__/__init__.cpython-39.pyc ADDED Viewed

Binary file (322 Bytes). View file

audioldm/__pycache__/ldm.cpython-310.pyc ADDED Viewed

Binary file (16.1 kB). View file

audioldm/__pycache__/ldm.cpython-37.pyc ADDED Viewed

Binary file (16 kB). View file

audioldm/__pycache__/ldm.cpython-39.pyc ADDED Viewed

Binary file (16 kB). View file

audioldm/__pycache__/pipeline.cpython-310.pyc ADDED Viewed

Binary file (6.69 kB). View file

audioldm/__pycache__/pipeline.cpython-37.pyc ADDED Viewed

Binary file (6.41 kB). View file

audioldm/__pycache__/pipeline.cpython-39.pyc ADDED Viewed

Binary file (6.54 kB). View file

audioldm/__pycache__/utils.cpython-310.pyc ADDED Viewed

Binary file (8.07 kB). View file

audioldm/__pycache__/utils.cpython-37.pyc ADDED Viewed

Binary file (7.65 kB). View file

audioldm/__pycache__/utils.cpython-39.pyc ADDED Viewed

Binary file (7.35 kB). View file

audioldm/audio/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from .tools import wav_to_fbank, read_wav_file
2	+ from .stft import TacotronSTFT

audioldm/audio/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (314 Bytes). View file

audioldm/audio/__pycache__/__init__.cpython-37.pyc ADDED Viewed

Binary file (228 Bytes). View file

audioldm/audio/__pycache__/__init__.cpython-39.pyc ADDED Viewed

Binary file (260 Bytes). View file

audioldm/audio/__pycache__/audio_processing.cpython-310.pyc ADDED Viewed

Binary file (2.84 kB). View file

audioldm/audio/__pycache__/audio_processing.cpython-37.pyc ADDED Viewed

Binary file (2.74 kB). View file

audioldm/audio/__pycache__/audio_processing.cpython-39.pyc ADDED Viewed

Binary file (2.78 kB). View file

audioldm/audio/__pycache__/mix.cpython-39.pyc ADDED Viewed

Binary file (1.7 kB). View file

audioldm/audio/__pycache__/stft.cpython-310.pyc ADDED Viewed

Binary file (5.08 kB). View file

audioldm/audio/__pycache__/stft.cpython-37.pyc ADDED Viewed

Binary file (4.97 kB). View file

audioldm/audio/__pycache__/stft.cpython-39.pyc ADDED Viewed

Binary file (4.99 kB). View file

audioldm/audio/__pycache__/tools.cpython-310.pyc ADDED Viewed

Binary file (2.25 kB). View file

audioldm/audio/__pycache__/tools.cpython-37.pyc ADDED Viewed

Binary file (2.16 kB). View file

audioldm/audio/__pycache__/tools.cpython-39.pyc ADDED Viewed

Binary file (2.19 kB). View file

audioldm/audio/__pycache__/torch_tools.cpython-39.pyc ADDED Viewed

Binary file (3.79 kB). View file

audioldm/audio/audio_processing.py ADDED Viewed

	@@ -0,0 +1,100 @@

+import torch
+import numpy as np
+import librosa.util as librosa_util
+from scipy.signal import get_window
+def window_sumsquare(
+    window,
+    n_frames,
+    hop_length,
+    win_length,
+    n_fft,
+    dtype=np.float32,
+    norm=None,
+):
+    """
+    # from librosa 0.6
+    Compute the sum-square envelope of a window function at a given hop length.
+    This is used to estimate modulation effects induced by windowing
+    observations in short-time fourier transforms.
+    Parameters
+    ----------
+    window : string, tuple, number, callable, or list-like
+        Window specification, as in `get_window`
+    n_frames : int > 0
+        The number of analysis frames
+    hop_length : int > 0
+        The number of samples to advance between frames
+    win_length : [optional]
+        The length of the window function.  By default, this matches `n_fft`.
+    n_fft : int > 0
+        The length of each analysis frame.
+    dtype : np.dtype
+        The data type of the output
+    Returns
+    -------
+    wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
+        The sum-squared envelope of the window function
+    """
+    if win_length is None:
+        win_length = n_fft
+    n = n_fft + hop_length * (n_frames - 1)
+    x = np.zeros(n, dtype=dtype)
+    # Compute the squared window at the desired length
+    win_sq = get_window(window, win_length, fftbins=True)
+    win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2
+    win_sq = librosa_util.pad_center(win_sq, n_fft)
+    # Fill the envelope
+    for i in range(n_frames):
+        sample = i * hop_length
+        x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))]
+    return x
+def griffin_lim(magnitudes, stft_fn, n_iters=30):
+    """
+    PARAMS
+    ------
+    magnitudes: spectrogram magnitudes
+    stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
+    """
+    angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
+    angles = angles.astype(np.float32)
+    angles = torch.autograd.Variable(torch.from_numpy(angles))
+    signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
+    for i in range(n_iters):
+        _, angles = stft_fn.transform(signal)
+        signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
+    return signal
+def dynamic_range_compression(x, normalize_fun=torch.log, C=1, clip_val=1e-5):
+    """
+    PARAMS
+    ------
+    C: compression factor
+    """
+    return normalize_fun(torch.clamp(x, min=clip_val) * C)
+def dynamic_range_decompression(x, C=1):
+    """
+    PARAMS
+    ------
+    C: compression factor used to compress
+    """
+    return torch.exp(x) / C

audioldm/audio/stft.py ADDED Viewed

	@@ -0,0 +1,186 @@

+import torch
+import torch.nn.functional as F
+import numpy as np
+from scipy.signal import get_window
+from librosa.util import pad_center, tiny
+from librosa.filters import mel as librosa_mel_fn
+from audioldm.audio.audio_processing import (
+    dynamic_range_compression,
+    dynamic_range_decompression,
+    window_sumsquare,
+)
+class STFT(torch.nn.Module):
+    """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft"""
+    def __init__(self, filter_length, hop_length, win_length, window="hann"):
+        super(STFT, self).__init__()
+        self.filter_length = filter_length
+        self.hop_length = hop_length
+        self.win_length = win_length
+        self.window = window
+        self.forward_transform = None
+        scale = self.filter_length / self.hop_length
+        fourier_basis = np.fft.fft(np.eye(self.filter_length))
+        cutoff = int((self.filter_length / 2 + 1))
+        fourier_basis = np.vstack(
+            [np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])]
+        )
+        forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
+        inverse_basis = torch.FloatTensor(
+            np.linalg.pinv(scale * fourier_basis).T[:, None, :]
+        )
+        if window is not None:
+            assert filter_length >= win_length
+            # get window and zero center pad it to filter_length
+            fft_window = get_window(window, win_length, fftbins=True)
+            fft_window = pad_center(fft_window, size=filter_length)
+            fft_window = torch.from_numpy(fft_window).float()
+            # window the bases
+            forward_basis *= fft_window
+            inverse_basis *= fft_window
+        self.register_buffer("forward_basis", forward_basis.float())
+        self.register_buffer("inverse_basis", inverse_basis.float())
+    def transform(self, input_data):
+        device = self.forward_basis.device
+        input_data = input_data.to(device)
+        num_batches = input_data.size(0)
+        num_samples = input_data.size(1)
+        self.num_samples = num_samples
+        # similar to librosa, reflect-pad the input
+        input_data = input_data.view(num_batches, 1, num_samples)
+        input_data = F.pad(
+            input_data.unsqueeze(1),
+            (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0),
+            mode="reflect",
+        )
+        input_data = input_data.squeeze(1)
+        forward_transform = F.conv1d(
+            input_data,
+            torch.autograd.Variable(self.forward_basis, requires_grad=False),
+            stride=self.hop_length,
+            padding=0,
+        )#.cpu()
+        cutoff = int((self.filter_length / 2) + 1)
+        real_part = forward_transform[:, :cutoff, :]
+        imag_part = forward_transform[:, cutoff:, :]
+        magnitude = torch.sqrt(real_part**2 + imag_part**2)
+        phase = torch.autograd.Variable(torch.atan2(imag_part.data, real_part.data))
+        return magnitude, phase
+    def inverse(self, magnitude, phase):
+        device = self.forward_basis.device
+        magnitude, phase = magnitude.to(device), phase.to(device)
+        recombine_magnitude_phase = torch.cat(
+            [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1
+        )
+        inverse_transform = F.conv_transpose1d(
+            recombine_magnitude_phase,
+            torch.autograd.Variable(self.inverse_basis, requires_grad=False),
+            stride=self.hop_length,
+            padding=0,
+        )
+        if self.window is not None:
+            window_sum = window_sumsquare(
+                self.window,
+                magnitude.size(-1),
+                hop_length=self.hop_length,
+                win_length=self.win_length,
+                n_fft=self.filter_length,
+                dtype=np.float32,
+            )
+            # remove modulation effects
+            approx_nonzero_indices = torch.from_numpy(
+                np.where(window_sum > tiny(window_sum))[0]
+            )
+            window_sum = torch.autograd.Variable(
+                torch.from_numpy(window_sum), requires_grad=False
+            )
+            window_sum = window_sum
+            inverse_transform[:, :, approx_nonzero_indices] /= window_sum[
+                approx_nonzero_indices
+            ]
+            # scale by hop ratio
+            inverse_transform *= float(self.filter_length) / self.hop_length
+        inverse_transform = inverse_transform[:, :, int(self.filter_length / 2) :]
+        inverse_transform = inverse_transform[:, :, : -int(self.filter_length / 2) :]
+        return inverse_transform
+    def forward(self, input_data):
+        self.magnitude, self.phase = self.transform(input_data)
+        reconstruction = self.inverse(self.magnitude, self.phase)
+        return reconstruction
+class TacotronSTFT(torch.nn.Module):
+    def __init__(
+        self,
+        filter_length,
+        hop_length,
+        win_length,
+        n_mel_channels,
+        sampling_rate,
+        mel_fmin,
+        mel_fmax,
+    ):
+        super(TacotronSTFT, self).__init__()
+        self.n_mel_channels = n_mel_channels
+        self.sampling_rate = sampling_rate
+        self.stft_fn = STFT(filter_length, hop_length, win_length)
+        mel_basis = librosa_mel_fn(
+            sr=sampling_rate, n_fft=filter_length, n_mels=n_mel_channels, fmin=mel_fmin, fmax=mel_fmax
+        )
+        mel_basis = torch.from_numpy(mel_basis).float()
+        self.register_buffer("mel_basis", mel_basis)
+    def spectral_normalize(self, magnitudes, normalize_fun):
+        output = dynamic_range_compression(magnitudes, normalize_fun)
+        return output
+    def spectral_de_normalize(self, magnitudes):
+        output = dynamic_range_decompression(magnitudes)
+        return output
+    def mel_spectrogram(self, y, normalize_fun=torch.log):
+        """Computes mel-spectrograms from a batch of waves
+        PARAMS
+        ------
+        y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]
+        RETURNS
+        -------
+        mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
+        """
+        assert torch.min(y.data) >= -1, torch.min(y.data)
+        assert torch.max(y.data) <= 1, torch.max(y.data)
+        magnitudes, phases = self.stft_fn.transform(y)
+        magnitudes = magnitudes.data
+        mel_output = torch.matmul(self.mel_basis, magnitudes)
+        mel_output = self.spectral_normalize(mel_output, normalize_fun)
+        energy = torch.norm(magnitudes, dim=1)
+        log_magnitudes = self.spectral_normalize(magnitudes, normalize_fun)
+        return mel_output, log_magnitudes, energy

audioldm/audio/tools.py ADDED Viewed

	@@ -0,0 +1,85 @@

+import torch
+import numpy as np
+import torchaudio
+def get_mel_from_wav(audio, _stft):
+    audio = torch.clip(torch.FloatTensor(audio).unsqueeze(0), -1, 1)
+    audio = torch.autograd.Variable(audio, requires_grad=False)
+    melspec, log_magnitudes_stft, energy = _stft.mel_spectrogram(audio)
+    melspec = torch.squeeze(melspec, 0).numpy().astype(np.float32)
+    log_magnitudes_stft = (
+        torch.squeeze(log_magnitudes_stft, 0).numpy().astype(np.float32)
+    )
+    energy = torch.squeeze(energy, 0).numpy().astype(np.float32)
+    return melspec, log_magnitudes_stft, energy
+def _pad_spec(fbank, target_length=1024):
+    n_frames = fbank.shape[0]
+    p = target_length - n_frames
+    # cut and pad
+    if p > 0:
+        m = torch.nn.ZeroPad2d((0, 0, 0, p))
+        fbank = m(fbank)
+    elif p < 0:
+        fbank = fbank[0:target_length, :]
+    if fbank.size(-1) % 2 != 0:
+        fbank = fbank[..., :-1]
+    return fbank
+def pad_wav(waveform, segment_length):
+    waveform_length = waveform.shape[-1]
+    assert waveform_length > 100, "Waveform is too short, %s" % waveform_length
+    if segment_length is None or waveform_length == segment_length:
+        return waveform
+    elif waveform_length > segment_length:
+        return waveform[:segment_length]
+    elif waveform_length < segment_length:
+        temp_wav = np.zeros((1, segment_length))
+        temp_wav[:, :waveform_length] = waveform
+    return temp_wav
+def normalize_wav(waveform):
+    waveform = waveform - np.mean(waveform)
+    waveform = waveform / (np.max(np.abs(waveform)) + 1e-8)
+    return waveform * 0.5
+def read_wav_file(filename, segment_length):
+    # waveform, sr = librosa.load(filename, sr=None, mono=True) # 4 times slower
+    waveform, sr = torchaudio.load(filename)  # Faster!!!
+    waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=16000)
+    waveform = waveform.numpy()[0, ...]
+    waveform = normalize_wav(waveform)
+    waveform = waveform[None, ...]
+    waveform = pad_wav(waveform, segment_length)
+    waveform = waveform / np.max(np.abs(waveform))
+    waveform = 0.5 * waveform
+    return waveform
+def wav_to_fbank(filename, target_length=1024, fn_STFT=None):
+    assert fn_STFT is not None
+    # mixup
+    waveform = read_wav_file(filename, target_length * 160)  # hop size is 160
+    waveform = waveform[0, ...]
+    waveform = torch.FloatTensor(waveform)
+    fbank, log_magnitudes_stft, energy = get_mel_from_wav(waveform, fn_STFT)
+    fbank = torch.FloatTensor(fbank.T)
+    log_magnitudes_stft = torch.FloatTensor(log_magnitudes_stft.T)
+    fbank, log_magnitudes_stft = _pad_spec(fbank, target_length), _pad_spec(
+        log_magnitudes_stft, target_length
+    )
+    return fbank, log_magnitudes_stft, waveform

audioldm/clap/__init__.py ADDED Viewed

File without changes

audioldm/clap/__pycache__/__init__.cpython-39.pyc ADDED Viewed

Binary file (152 Bytes). View file

audioldm/clap/__pycache__/encoders.cpython-39.pyc ADDED Viewed

Binary file (5.1 kB). View file

audioldm/clap/encoders.py ADDED Viewed

	@@ -0,0 +1,170 @@

+import torch
+import torch.nn as nn
+from audioldm.clap.open_clip import create_model
+from audioldm.clap.training.data import get_audio_features
+import torchaudio
+from transformers import RobertaTokenizer
+import torch.nn.functional as F
+class CLAPAudioEmbeddingClassifierFreev2(nn.Module):
+    def __init__(
+        self,
+        pretrained_path="",
+        key="class",
+        sampling_rate=16000,
+        embed_mode="audio",
+        amodel = "HTSAT-tiny",
+        unconditional_prob=0.1,
+        random_mute=False,
+        max_random_mute_portion=0.5,
+        training_mode=True,
+    ):
+        super().__init__()
+        self.key = key
+        self.device = "cpu"
+        self.precision = "fp32"
+        self.amodel = amodel  # or 'PANN-14'
+        self.tmodel = "roberta"  # the best text encoder in our training
+        self.enable_fusion = False  # False if you do not want to use the fusion model
+        self.fusion_type = "aff_2d"
+        self.pretrained = pretrained_path
+        self.embed_mode = embed_mode
+        self.embed_mode_orig = embed_mode
+        self.sampling_rate = sampling_rate
+        self.unconditional_prob = unconditional_prob
+        self.random_mute = random_mute
+        self.tokenize = RobertaTokenizer.from_pretrained("roberta-base")
+        self.max_random_mute_portion = max_random_mute_portion
+        self.training_mode = training_mode
+        self.model, self.model_cfg = create_model(
+            self.amodel,
+            self.tmodel,
+            self.pretrained,
+            precision=self.precision,
+            device=self.device,
+            enable_fusion=self.enable_fusion,
+            fusion_type=self.fusion_type,
+        )
+        for p in self.model.parameters():
+            p.requires_grad = False
+        self.model.eval()
+    def get_unconditional_condition(self, batchsize):
+        self.unconditional_token = self.model.get_text_embedding(
+            self.tokenizer(["", ""])
+        )[0:1]
+        return torch.cat([self.unconditional_token.unsqueeze(0)] * batchsize, dim=0)
+    def batch_to_list(self, batch):
+        ret = []
+        for i in range(batch.size(0)):
+            ret.append(batch[i])
+        return ret
+    def make_decision(self, probability):
+        if float(torch.rand(1)) < probability:
+            return True
+        else:
+            return False
+    def random_uniform(self, start, end):
+        val = torch.rand(1).item()
+        return start + (end - start) * val
+    def _random_mute(self, waveform):
+        # waveform: [bs, t-steps]
+        t_steps = waveform.size(-1)
+        for i in range(waveform.size(0)):
+            mute_size = int(
+                self.random_uniform(0, end=int(t_steps * self.max_random_mute_portion))
+            )
+            mute_start = int(self.random_uniform(0, t_steps - mute_size))
+            waveform[i, mute_start : mute_start + mute_size] = 0
+        return waveform
+    def cos_similarity(self, waveform, text):
+        # waveform: [bs, t_steps]
+        with torch.no_grad():
+            self.embed_mode = "audio"
+            audio_emb = self(waveform.cuda())
+            self.embed_mode = "text"
+            text_emb = self(text)
+            similarity = F.cosine_similarity(audio_emb, text_emb, dim=2), audio_emb, text_emb
+            return similarity.squeeze()
+    def forward(self, batch, key=None):
+        # If you want this conditioner to be unconditional, set self.unconditional_prob = 1.0
+        # If you want this conditioner to be fully conditional, set self.unconditional_prob = 0.0
+        if self.model.training == True and not self.training_mode:
+            print(
+                "The pretrained CLAP model should always be in eval mode. Reloading model just in case you change the parameters."
+            )
+            self.model, self.model_cfg = create_model(
+                self.amodel,
+                self.tmodel,
+                self.pretrained,
+                precision=self.precision,
+                device="cuda",
+                enable_fusion=self.enable_fusion,
+                fusion_type=self.fusion_type,
+            )
+            for p in self.model.parameters():
+                p.requires_grad = False
+            self.model.eval()
+        # the 'fusion' truncate mode can be changed to 'rand_trunc' if run in unfusion mode
+        if self.embed_mode == "audio":
+            with torch.no_grad():
+                audio_dict_list = []
+                assert (
+                    self.sampling_rate == 16000
+                ), "We only support 16000 sampling rate"
+                if self.random_mute:
+                    batch = self._random_mute(batch)
+                # batch: [bs, 1, t-samples]
+                batch = torchaudio.functional.resample(
+                    batch, orig_freq=self.sampling_rate, new_freq=48000
+                )
+                for waveform in self.batch_to_list(batch):
+                    audio_dict = {}
+                    audio_dict = get_audio_features(
+                        audio_dict,
+                        waveform,
+                        480000,
+                        data_truncating="fusion",
+                        data_filling="repeatpad",
+                        audio_cfg=self.model_cfg["audio_cfg"],
+                    )
+                    audio_dict_list.append(audio_dict)
+                # [bs, 512]
+                embed = self.model.get_audio_embedding(audio_dict_list)
+        elif self.embed_mode == "text":
+            with torch.no_grad():
+                # the 'fusion' truncate mode can be changed to 'rand_trunc' if run in unfusion mode
+                text_data = self.tokenizer(batch)
+                embed = self.model.get_text_embedding(text_data)
+        embed = embed.unsqueeze(1)
+        self.unconditional_token = self.model.get_text_embedding(
+            self.tokenizer(["", ""])
+        )[0:1]
+        for i in range(embed.size(0)):
+            if self.make_decision(self.unconditional_prob):
+                embed[i] = self.unconditional_token
+        # [bs, 1, 512]
+        return embed.detach()
+    def tokenizer(self, text):
+        result = self.tokenize(
+            text,
+            padding="max_length",
+            truncation=True,
+            max_length=512,
+            return_tensors="pt",
+        )
+        return {k: v.squeeze(0) for k, v in result.items()}

audioldm/clap/open_clip/__init__.py ADDED Viewed

	@@ -0,0 +1,25 @@

+from .factory import (
+    list_models,
+    create_model,
+    create_model_and_transforms,
+    add_model_config,
+)
+from .loss import ClipLoss, gather_features, LPLoss, lp_gather_features, LPMetrics
+from .model import (
+    CLAP,
+    CLAPTextCfg,
+    CLAPVisionCfg,
+    CLAPAudioCfp,
+    convert_weights_to_fp16,
+    trace_model,
+)
+from .openai import load_openai_model, list_openai_models
+from .pretrained import (
+    list_pretrained,
+    list_pretrained_tag_models,
+    list_pretrained_model_tags,
+    get_pretrained_url,
+    download_pretrained,
+)
+from .tokenizer import SimpleTokenizer, tokenize
+from .transform import image_transform

audioldm/clap/open_clip/__pycache__/__init__.cpython-39.pyc ADDED Viewed

Binary file (958 Bytes). View file

audioldm/clap/open_clip/__pycache__/factory.cpython-39.pyc ADDED Viewed

Binary file (6.67 kB). View file

audioldm/clap/open_clip/__pycache__/feature_fusion.cpython-39.pyc ADDED Viewed

Binary file (4.17 kB). View file

audioldm/clap/open_clip/__pycache__/htsat.cpython-39.pyc ADDED Viewed

Binary file (30.8 kB). View file

audioldm/clap/open_clip/__pycache__/loss.cpython-39.pyc ADDED Viewed

Binary file (8.06 kB). View file

audioldm/clap/open_clip/__pycache__/model.cpython-39.pyc ADDED Viewed

Binary file (23.8 kB). View file

audioldm/clap/open_clip/__pycache__/openai.cpython-39.pyc ADDED Viewed

Binary file (4.55 kB). View file

audioldm/clap/open_clip/__pycache__/pann_model.cpython-39.pyc ADDED Viewed

Binary file (13.3 kB). View file

audioldm/clap/open_clip/__pycache__/pretrained.cpython-39.pyc ADDED Viewed

Binary file (5.09 kB). View file

audioldm/clap/open_clip/__pycache__/timm_model.cpython-39.pyc ADDED Viewed

Binary file (3.4 kB). View file

audioldm/clap/open_clip/__pycache__/tokenizer.cpython-39.pyc ADDED Viewed

Binary file (7.42 kB). View file

audioldm/clap/open_clip/__pycache__/transform.cpython-39.pyc ADDED Viewed

Binary file (974 Bytes). View file

audioldm/clap/open_clip/__pycache__/utils.cpython-39.pyc ADDED Viewed

Binary file (9.83 kB). View file