""" |
Audio processing functions to extract feature from a raw audio. Should all be in numpy to support all frameworks, and |
remmove unecessary dependencies. |
""" |
import math
import warnings
from typing import Optional, Union

import numpy as np
from numpy.fft import fft
def hertz_to_mel(freq: Union[float, np.ndarray], mel_scale: str = "htk") -> Union[float, np.ndarray]:
    """Convert a frequency, or an array of frequencies, from Hertz to Mels.

    Args:
        freq (`float` or `np.ndarray`):
            Frequency, or frequencies, in Hertz.
        mel_scale (`str`, *optional*, defaults to `"htk"`):
            Scale to use, `htk` or `slaney`.

    Returns:
        mels (`float` or `np.ndarray`):
            Frequency in Mels. A scalar input yields a Python `float`; an array-like input yields an array of the
            same shape.

    Raises:
        ValueError: If `mel_scale` is neither `"htk"` nor `"slaney"`.
    """
    if mel_scale not in ["slaney", "htk"]:
        raise ValueError('mel_scale should be one of "htk" or "slaney".')

    # Anything with at least one dimension takes the vectorized numpy path; scalars keep using `math` so that the
    # return type stays a plain Python float.
    is_array = np.ndim(freq) > 0
    if is_array:
        freq = np.asarray(freq)

    if mel_scale == "htk":
        if is_array:
            return 2595.0 * np.log10(1.0 + (freq / 700.0))
        return 2595.0 * math.log10(1.0 + (freq / 700.0))

    # Slaney scale: linear below `min_log_hertz`, logarithmic at and above it.
    frequency_min = 0.0
    f_sp = 200.0 / 3
    min_log_hertz = 1000.0
    min_log_mel = (min_log_hertz - frequency_min) / f_sp
    logstep = math.log(6.4) / 27.0

    if is_array:
        linear_mels = (freq - frequency_min) / f_sp
        # `np.where` evaluates both branches, so the log argument is floored at 1 to avoid taking the log of values
        # that belong to the linear region (possibly zero or negative).
        log_mels = min_log_mel + np.log(np.maximum(freq, min_log_hertz) / min_log_hertz) / logstep
        return np.where(freq >= min_log_hertz, log_mels, linear_mels)

    if freq >= min_log_hertz:
        return min_log_mel + math.log(freq / min_log_hertz) / logstep
    return (freq - frequency_min) / f_sp
def mel_to_hertz(mels: Union[float, np.ndarray], mel_scale: str = "htk") -> Union[float, np.ndarray]:
    """Convert mel values to frequencies in Hertz.

    Args:
        mels (`float` or `np.ndarray`):
            Mel frequencies.
        mel_scale (`str`, *optional*, defaults to `"htk"`):
            Scale to use: `htk` or `slaney`.

    Returns:
        freqs (`float` or `np.ndarray`):
            Mels converted to Hertz. The input is never modified.

    Raises:
        ValueError: If `mel_scale` is neither `"htk"` nor `"slaney"`.
    """
    if mel_scale not in ["slaney", "htk"]:
        raise ValueError('mel_scale should be one of "htk" or "slaney".')

    if mel_scale == "htk":
        return 700.0 * (10.0 ** (mels / 2595.0) - 1.0)

    # Slaney scale: linear below `min_log_mel`, exponential at and above it.
    frequency_min = 0.0
    f_sp = 200.0 / 3
    min_log_hertz = 1000.0
    min_log_mel = (min_log_hertz - frequency_min) / f_sp
    logstep = math.log(6.4) / 27.0

    mels_arr = np.asarray(mels)
    linear_freqs = frequency_min + f_sp * mels_arr
    log_freqs = min_log_hertz * np.exp(logstep * (mels_arr - min_log_mel))
    # `np.where` (rather than boolean-mask assignment) also works for scalar inputs, which have no item assignment.
    freqs = np.where(mels_arr >= min_log_mel, log_freqs, linear_freqs)

    if freqs.ndim == 0:
        return float(freqs)
    return freqs
def _create_triangular_filterbank( |
all_freqs: np.array, |
f_pts: np.array, |
) -> np.array: |
"""Create a triangular filter bank. |
Args: |
all_freqs (`np.array` of shape (`nb_frequency_bins`, )): |
Discrete frequencies used when the STFT was computed. |
f_pts (`np.array`, of shape (`nb_mel_filters`, )): |
Coordinates of the middle points of the triangular filters to create. |
Returns: |
fb (np.array): |
The filter bank of size (`nb_frequency_bins`, `nb_mel_filters`). |
""" |
f_diff = f_pts[1:] - f_pts[:-1] |
slopes = np.expand_dims(f_pts, 0) - np.expand_dims(all_freqs, 1) |
zero = np.zeros(1) |
down_slopes = (-1.0 * slopes[:, :-2]) / f_diff[:-1] |
up_slopes = slopes[:, 2:] / f_diff[1:] |
fb = np.maximum(zero, np.minimum(down_slopes, up_slopes)) |
return fb |
def get_mel_filter_banks(
    nb_frequency_bins: int,
    nb_mel_filters: int,
    frequency_min: float,
    frequency_max: float,
    sample_rate: int,
    norm: Optional[str] = None,
    mel_scale: str = "htk",
) -> np.array:
    """
    Create a frequency bin conversion matrix used to obtain the Mel Spectrogram. This is called a *mel filter bank*,
    and various implementations exist, which differ in the number of filters, the shape of the filters, the way the
    filters are spaced, the bandwidth of the filters, and the manner in which the spectrum is warped. The goal of these
    features is to approximate the non-linear human perception of the variation in pitch with respect to the frequency.
    This code is heavily inspired by the *torchaudio* implementation, see
    [here](https://pytorch.org/audio/stable/transforms.html) for more details.

    Tips:
        - Different banks of Mel filters were introduced in the literature. The following variations are supported:
            - MFCC FB-20: introduced in 1980 by Davis and Mermelstein, it assumes a sampling frequency of 10 kHertz
              and a speech bandwidth of `[0, 4600]` Hertz
            - MFCC FB-24 HTK: from the Cambridge HMM Toolkit (HTK) (1995) uses a filter bank of 24 filters for a
              speech bandwidth `[0, 8000]` Hertz (sampling rate ≥ 16 kHertz).
            - MFCC FB-40: from the Auditory Toolbox for MATLAB written by Slaney in 1998, assumes a sampling rate
              of 16 kHertz, and speech bandwidth [133, 6854] Hertz. This version also includes an area normalization.
            - HFCC-E FB-29 (Human Factor Cepstral Coefficients) of Skowronski and Harris (2004), assumes sampling
              rate of 12.5 kHertz and speech bandwidth [0, 6250] Hertz
        - The default parameters of `torchaudio`'s mel filterbanks implement the `"htk"` filters while `torchlibrosa`
          uses the `"slaney"` implementation.

    Args:
        nb_frequency_bins (`int`):
            Number of frequencies used to compute the spectrogram (should be the same as in `stft`).
        nb_mel_filters (`int`):
            Number of Mel filters to generate.
        frequency_min (`float`):
            Minimum frequency of interest (Hertz).
        frequency_max (`float`):
            Maximum frequency of interest (Hertz).
        sample_rate (`int`):
            Sample rate of the audio waveform.
        norm (`str`, *optional*):
            If "slaney", divide the triangular Mel weights by the width of the mel band (area normalization).
        mel_scale (`str`, *optional*, defaults to `"htk"`):
            Scale to use: `"htk"` or `"slaney"`.

    Returns:
        `np.ndarray`: Triangular filter banks (fb matrix) of shape (`nb_frequency_bins`, `nb_mel_filters`). This matrix
        is a projection matrix to go from a spectrogram to a Mel Spectrogram.

    Raises:
        ValueError: If `norm` is neither `None` nor `"slaney"`.
    """
    if norm not in (None, "slaney"):
        raise ValueError('norm must be one of None or "slaney"')

    # Frequency (Hertz) associated with each spectrogram bin, from 0 up to `sample_rate // 2`.
    bin_freqs = np.linspace(0, sample_rate // 2, nb_frequency_bins)

    # Filter points are evenly spaced on the mel scale; two extra points give the outer edges of the first and last
    # triangles.
    mel_low = hertz_to_mel(frequency_min, mel_scale=mel_scale)
    mel_high = hertz_to_mel(frequency_max, mel_scale=mel_scale)
    mel_points = np.linspace(mel_low, mel_high, nb_mel_filters + 2)
    hertz_points = mel_to_hertz(mel_points, mel_scale=mel_scale)

    mel_filters = _create_triangular_filterbank(bin_freqs, hertz_points)

    if norm == "slaney":
        # Scale each filter by 2 / (distance between its outer edges), i.e. by the inverse of the triangle's area.
        band_scale = 2.0 / (hertz_points[2 : nb_mel_filters + 2] - hertz_points[:nb_mel_filters])
        mel_filters *= np.expand_dims(band_scale, 0)

    has_empty_filter = np.any(mel_filters.max(axis=0) == 0.0)
    if has_empty_filter:
        warnings.warn(
            "At least one mel filterbank has all zero values. "
            f"The value for `nb_mel_filters` ({nb_mel_filters}) may be set too high. "
            f"Or, the value for `nb_frequency_bins` ({nb_frequency_bins}) may be set too low."
        )

    return mel_filters
def power_to_db(mel_spectrogram, top_db=None, a_min=1e-10, ref=1.0):
    """
    Convert a mel spectrogram from power to db scale, this function is the numpy implementation of
    librosa.power_to_db. It computes `10 * log10(mel_spectrogram / ref)`, using basic log properties for stability.

    Tips:
        - The motivation behind applying the log function on the mel spectrogram is that humans do not hear loudness
          on a linear scale. Generally to double the perceived volume of a sound we need to put 8 times as much energy
          into it.
        - This means that large variations in energy may not sound all that different if the sound is loud to begin
          with. This compression operation makes the mel features match more closely what humans actually hear.

    Args:
        mel_spectrogram (`np.array`):
            Input mel spectrogram.
        top_db (`float`, *optional*):
            If set, the output is floored at `top_db` decibels below its own maximum, which limits the dynamic range
            to `top_db`. Must be non-negative.
        a_min (`float`, *optional*, defaults to 1e-10):
            Minimum value to use when clipping the mel spectrogram (and `ref`) before taking the logarithm.
        ref (`float`, *optional*, defaults to 1.0):
            Reference value the mel_spectrogram is scaled by; a power equal to `ref` maps to 0 db.

    Returns:
        `np.array`: The spectrogram in decibels, with the same shape as `mel_spectrogram`.

    Raises:
        ValueError: If `top_db` is negative.
    """
    if top_db is not None and top_db < 0:
        raise ValueError("top_db must be non-negative")

    # Clip before the log so that zeros (or negative values) do not produce -inf / nan.
    log_spec = 10.0 * np.log10(np.clip(mel_spectrogram, a_min, None))
    # log10(x / ref) == log10(x) - log10(ref); subtracting avoids dividing by a tiny reference.
    log_spec = log_spec - 10.0 * np.log10(np.maximum(a_min, ref))

    # An empty spectrogram has no maximum to measure `top_db` from, so it is returned as is.
    if top_db is not None and np.size(log_spec) > 0:
        log_spec = np.maximum(log_spec, np.max(log_spec) - top_db)

    return log_spec
def fram_wave(waveform: np.array, hop_length: int = 160, fft_window_size: int = 400, center: bool = True):
    """
    In order to compute the short time fourier transform, the waveform needs to be split in overlapping windowed
    segments called `frames`.

    The window length (`fft_window_size`) defines how much of the signal is contained in each frame, while the hop
    length defines the step between the beginning of each new frame. One frame is produced for every multiple of
    `hop_length` in `[0, sample_length]`.

    Args:
        waveform (`np.array` of shape `(sample_length,)`):
            The raw waveform which will be split into smaller chunks.
        hop_length (`int`, *optional*, defaults to 160):
            Step between each window of the waveform.
        fft_window_size (`int`, *optional*, defaults to 400):
            Defines the size of the window.
        center (`bool`, defaults to `True`):
            Whether or not to center each frame around its starting sample. Centering is done by reflecting the
            waveform on the left and on the right. When `False`, each frame starts at its sample index and frames that
            run past the end of the waveform are zero-padded on the right.

    Return:
        framed_waveform (`np.array` of shape `(sample_length // hop_length + 1, frame_length)`):
            The framed waveforms that can be fed to `np.fft`. `frame_length` is `fft_window_size`, except when
            `center=True` and `fft_window_size` is odd, in which case it is `fft_window_size + 1` (twice the
            rounded-up half window).
    """
    num_samples = waveform.shape[0]
    frame_positions = range(0, num_samples + 1, hop_length)

    if center:
        half_window = (fft_window_size - 1) // 2 + 1
        frame_length = 2 * half_window
        # Reflect the whole waveform once instead of padding each slice: this keeps every frame the same length even
        # when a frame overhangs both ends of a short waveform, and the reflection always uses the true neighbouring
        # samples rather than only those inside the slice.
        padded = np.pad(waveform, pad_width=(half_window, half_window), mode="reflect")
        # In `padded`, the frame centred on sample `position` starts at index `position`.
        frames = [padded[position : position + frame_length] for position in frame_positions]
    else:
        frames = []
        for position in frame_positions:
            frame = waveform[position : position + fft_window_size]
            missing = fft_window_size - frame.shape[0]
            if missing > 0:
                # Frames running past the end of the waveform are completed with zeros.
                frame = np.pad(frame, pad_width=(0, missing), mode="constant", constant_values=0)
            frames.append(frame)

    return np.stack(frames, 0)
def stft(frames: np.array, windowing_function: Optional[np.ndarray], fft_window_size: Optional[int] = None):
    """
    Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. Should give the same results
    as `torch.stft`.

    Args:
        frames (`np.array` of dimension `(num_frames, frame_size)`):
            A framed audio signal obtained using `audio_utils.fram_wave`.
        windowing_function (`np.array` of dimension `(frame_size,)`, or `None`):
            An array representing the function that will be used to reduce the amplitude of the discontinuities at the
            boundaries of each frame when computing the STFT. Each frame will be multiplied by the windowing_function.
            Pass `None` to apply no window. For more information on the discontinuities, called *Spectral leakage*,
            refer to [this tutorial](https://download.ni.com/evaluation/pxi/Understanding%20FFTs%20and%20Windowing.pdf)
        fft_window_size (`int`, *optional*):
            Size of the window on which the Fourier transform is applied. This controls the frequency resolution of
            the spectrogram. 400 means that the Fourier transform is computed on windows of 400 samples. Defaults to
            the frame size; when larger, each windowed frame is zero-padded on the right. The number of frequency bins
            (`nb_frequency_bins`) is equal to `fft_window_size // 2 + 1`.

    Example:

    ```python
    >>> from transformers.audio_utils import stft, fram_wave
    >>> import numpy as np

    >>> audio = np.random.rand(50)
    >>> fft_window_size = 10
    >>> hop_length = 2
    >>> framed_audio = fram_wave(audio, hop_length, fft_window_size)
    >>> spectrogram = stft(framed_audio, np.hanning(fft_window_size + 1)[:-1])
    ```

    Returns:
        spectrogram (`np.ndarray`):
            A complex64 spectrogram of shape `(nb_frequency_bins, num_frames)` obtained using the STFT algorithm.

    Raises:
        ValueError: If `fft_window_size` is smaller than the frame size.
    """
    frame_size = frames.shape[1]

    if fft_window_size is None:
        fft_window_size = frame_size

    if fft_window_size < frame_size:
        raise ValueError("FFT size must greater or equal the frame size")

    # Work in double precision regardless of the input dtype.
    windowed_frames = np.asarray(frames, dtype=np.float64)
    if windowing_function is not None:
        windowed_frames = windowed_frames * windowing_function

    # The input is real, so `rfft` directly yields the `fft_window_size // 2 + 1` non-redundant bins for all frames
    # at once; `n` zero-pads every frame up to `fft_window_size`.
    spectrogram = np.fft.rfft(windowed_frames, n=fft_window_size, axis=-1)
    return spectrogram.astype(np.complex64).T