# NOTE: the following metadata was scraped from a Hugging Face file-viewer page and is not code:
# Sonal Kumar — first commit ed7a497 (raw / history / blame, 14.7 kB)
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Audio processing functions to extract feature from a raw audio. Should all be in numpy to support all frameworks, and
remmove unecessary dependencies.
"""
import math
import warnings
from typing import Optional
import numpy as np
from numpy.fft import fft
def hertz_to_mel(freq: float, mel_scale: str = "htk") -> float:
    """Convert a frequency from Hertz to Mels.

    Args:
        freq (`float`):
            Frequency in Hertz.
        mel_scale (`str`, *optional*, defaults to `"htk"`):
            Scale to use, `htk` or `slaney`.

    Returns:
        mels (`float`):
            The input frequency expressed in Mels.
    """
    if mel_scale not in ["slaney", "htk"]:
        raise ValueError('mel_scale should be one of "htk" or "slaney".')

    if mel_scale == "htk":
        return 2595.0 * math.log10(1.0 + (freq / 700.0))

    # Slaney scale: linear below 1 kHz, logarithmic above.
    linear_step = 200.0 / 3
    break_frequency = 1000.0
    break_mel = break_frequency / linear_step  # mel value at the linear/log boundary
    log_increment = math.log(6.4) / 27.0

    if freq >= break_frequency:
        return break_mel + math.log(freq / break_frequency) / log_increment
    return freq / linear_step
def mel_to_hertz(mels: np.array, mel_scale: str = "htk") -> np.array:
    """Convert mel bin numbers to frequencies.

    Args:
        mels (`np.array`):
            Mel frequencies.
        mel_scale (`str`, *optional*, `"htk"`):
            Scale to use: `htk` or `slaney`.

    Returns:
        freqs (`np.array`):
            The mel values converted to Hertz.
    """
    if mel_scale not in ["slaney", "htk"]:
        raise ValueError('mel_scale should be one of "htk" or "slaney".')

    if mel_scale == "htk":
        return 700.0 * (10.0 ** (mels / 2595.0) - 1.0)

    # Slaney scale: invert the linear segment everywhere first, then overwrite
    # the entries above the 1 kHz break point with the exponential inverse.
    linear_step = 200.0 / 3
    break_frequency = 1000.0
    break_mel = break_frequency / linear_step  # mel value at the linear/log boundary
    log_increment = math.log(6.4) / 27.0

    freqs = linear_step * mels
    above_break = mels >= break_mel
    freqs[above_break] = break_frequency * np.exp(log_increment * (mels[above_break] - break_mel))
    return freqs
def _create_triangular_filterbank(
    all_freqs: np.array,
    f_pts: np.array,
) -> np.array:
    """Create a triangular filter bank.

    Args:
        all_freqs (`np.array` of shape (`nb_frequency_bins`, )):
            Discrete frequencies used when the STFT was computed.
        f_pts (`np.array`, of shape (`nb_mel_filters`, )):
            Coordinates of the middle points of the triangular filters to create.

    Returns:
        fb (np.array):
            The filter bank of size (`nb_frequency_bins`, `nb_mel_filters`).
    """
    # Adapted from Librosa.
    # Widths between consecutive filter mid points, in hertz.  (n_filter + 1,)
    filter_widths = np.diff(f_pts)
    # Signed distance from every STFT bin to every filter mid point.  (nb_frequency_bins, n_filter + 2)
    offsets = f_pts[None, :] - all_freqs[:, None]
    # Rising and falling edges of each overlapping triangle.  (nb_frequency_bins, n_filter)
    rising = (-1.0 * offsets[:, :-2]) / filter_widths[:-1]
    falling = offsets[:, 2:] / filter_widths[1:]
    # Keep only the positive part of the smaller edge: a triangle per filter.
    return np.maximum(np.zeros(1), np.minimum(rising, falling))
def get_mel_filter_banks(
    nb_frequency_bins: int,
    nb_mel_filters: int,
    frequency_min: float,
    frequency_max: float,
    sample_rate: int,
    norm: Optional[str] = None,
    mel_scale: str = "htk",
) -> np.array:
    """
    Create a frequency bin conversion matrix used to obtain the Mel Spectrogram. This is called a *mel filter bank*,
    and various implementations exist, which differ in the number of filters, the shape of the filters, the way the
    filters are spaced, the bandwidth of the filters, and the manner in which the spectrum is warped. The goal of these
    features is to approximate the non-linear human perception of the variation in pitch with respect to the frequency.
    This code is heavily inspired from the *torchaudio* implementation, see
    [here](https://pytorch.org/audio/stable/transforms.html) for more details.

    Tips:
        - Different banks of Mel filters were introduced in the literature. The following variations are supported:
            - MFCC FB-20: introduced in 1980 by Davis and Mermelstein, it assumes a sampling frequency of 10 kHertz
              and a speech bandwidth of `[0, 4600]` Hertz
            - MFCC FB-24 HTK: from the Cambridge HMM Toolkit (HTK) (1995) uses a filter bank of 24 filters for a
              speech bandwidth `[0, 8000]` Hertz (sampling rate ≥ 16 kHertz).
            - MFCC FB-40: from the Auditory Toolbox for MATLAB written by Slaney in 1998, assumes a sampling rate
              of 16 kHertz, and speech bandwidth [133, 6854] Hertz. This version also includes an area normalization.
            - HFCC-E FB-29 (Human Factor Cepstral Coefficients) of Skowronski and Harris (2004), assumes sampling
              rate of 12.5 kHertz and speech bandwidth [0, 6250] Hertz
        - The default parameters of `torchaudio`'s mel filterbanks implement the `"htk"` filters while `torchlibrosa`
          uses the `"slaney"` implementation.

    Args:
        nb_frequency_bins (`int`):
            Number of frequencies used to compute the spectrogram (should be the same as in `stft`).
        nb_mel_filters (`int`):
            Number of Mel filters to generate.
        frequency_min (`float`):
            Minimum frequency of interest (Hertz).
        frequency_max (`float`):
            Maximum frequency of interest (Hertz).
        sample_rate (`int`):
            Sample rate of the audio waveform.
        norm (`str`, *optional*):
            If "slaney", divide the triangular Mel weights by the width of the mel band (area normalization).
        mel_scale (`str`, *optional*, defaults to `"htk"`):
            Scale to use: `"htk"` or `"slaney"`.

    Returns:
        `np.ndarray`: Triangular filter banks (fb matrix) of shape (`nb_frequency_bins`, `nb_mel_filters`). This matrix
        is a projection matrix to go from a spectrogram to a Mel Spectrogram.
    """
    if norm is not None and norm != "slaney":
        raise ValueError('norm must be one of None or "slaney"')

    # Discrete frequencies of the STFT bins, in Hertz.
    stft_freqs = np.linspace(0, sample_rate // 2, nb_frequency_bins)

    # Edges/centers of the triangular filters, evenly spaced on the mel scale
    # then converted back to Hertz.
    mel_min = hertz_to_mel(frequency_min, mel_scale=mel_scale)
    mel_max = hertz_to_mel(frequency_max, mel_scale=mel_scale)
    mel_edges = np.linspace(mel_min, mel_max, nb_mel_filters + 2)
    hertz_edges = mel_to_hertz(mel_edges, mel_scale=mel_scale)

    filterbank = _create_triangular_filterbank(stft_freqs, hertz_edges)

    if norm == "slaney":
        # Slaney-style mel is scaled to be approx constant energy per channel
        enorm = 2.0 / (hertz_edges[2 : nb_mel_filters + 2] - hertz_edges[:nb_mel_filters])
        filterbank *= np.expand_dims(enorm, 0)

    if (filterbank.max(axis=0) == 0.0).any():
        warnings.warn(
            "At least one mel filterbank has all zero values. "
            f"The value for `nb_mel_filters` ({nb_mel_filters}) may be set too high. "
            f"Or, the value for `nb_frequency_bins` ({nb_frequency_bins}) may be set too low."
        )
    return filterbank
def power_to_db(mel_spectrogram, top_db=None, a_min=1e-10, ref=1.0):
    """
    Convert a mel spectrogram from power to db scale, this function is the numpy implementation of
    librosa.power_to_db. It computes `10 * log10(mel_spectrogram / ref)`, using basic log properties for stability.

    Tips:
        - The motivation behind applying the log function on the mel spectrogram is that humans do not hear loudness
          on a linear scale. Generally to double the perceived volume of a sound we need to put 8 times as much
          energy into it.
        - This means that large variations in energy may not sound all that different if the sound is loud to begin
          with. This compression operation makes the mel features match more closely what humans actually hear.

    Args:
        mel_spectrogram (`np.array`):
            Input mel spectrogram.
        top_db (`int`, *optional*):
            The maximum decibel value: the output is clipped to `[max - top_db, +inf)`.
        a_min (`int`, *optional*, default to 1e-10):
            Minimum value to use when clipping the mel spectrogram.
        ref (`float`, *optional*, default to 1.0):
            Maximum reference value used to scale the mel_spectrogram.

    Returns:
        `np.array`: The spectrogram on a decibel scale.
    """
    log_spec = 10 * np.log10(np.clip(mel_spectrogram, a_min=a_min, a_max=None))
    log_spec -= 10.0 * np.log10(np.maximum(a_min, ref))
    if top_db is not None:
        if top_db < 0:
            raise ValueError("top_db must be non-negative")
        # Clamp everything to within `top_db` dB of the loudest value. The previous code called
        # `np.maximum(log_spec)` (missing its second argument) and passed `min=`/`max=` keywords
        # that `np.clip` does not accept, so this branch always raised a TypeError.
        log_spec = np.clip(log_spec, a_min=log_spec.max() - top_db, a_max=None)
    return log_spec
# TODO @ArthurZucker: This method does not support batching yet as we are mainly focus on inference.
def fram_wave(waveform: np.array, hop_length: int = 160, fft_window_size: int = 400, center: bool = True):
    """
    In order to compute the short time fourier transform, the waveform needs to be split in overlapping windowed
    segments called `frames`.

    The window length (window_length) defines how much of the signal is contained in each frame, while the hop length
    defines the step between the beginning of each new frame.

    Args:
        waveform (`np.array` of shape `(sample_length,)`):
            The raw waveform which will be split into smaller chunks.
        hop_length (`int`, *optional*, defaults to 160):
            Step between each window of the waveform.
        fft_window_size (`int`, *optional*, defaults to 400):
            Defines the size of the window.
        center (`bool`, defaults to `True`):
            Whether or not to center each frame around the middle of the frame. Centering is done by reflecting the
            waveform on the left and on the right.

    Return:
        framed_waveform (`np.array` of shape `(waveform.shape // hop_length , fft_window_size)`):
            The framed waveforms that can be fed to `np.fft`.
    """
    frames = []
    for i in range(0, waveform.shape[0] + 1, hop_length):
        if center:
            # Take a window centered on sample `i`, reflect-padding whenever the window
            # runs past either edge of the waveform.
            half_window = (fft_window_size - 1) // 2 + 1
            start = i - half_window if i > half_window else 0
            end = i + half_window if i < waveform.shape[0] - half_window else waveform.shape[0]
            frame = waveform[start:end]
            if start == 0:
                padd_width = (-i + half_window, 0)
                frame = np.pad(frame, pad_width=padd_width, mode="reflect")
            elif end == waveform.shape[0]:
                padd_width = (0, (i - waveform.shape[0] + half_window))
                frame = np.pad(frame, pad_width=padd_width, mode="reflect")
        else:
            frame = waveform[i : i + fft_window_size]
            frame_width = frame.shape[0]
            # Zero-pad short frames at the end so every frame has exactly `fft_window_size`
            # samples. The previous condition compared `frame_width` against the waveform
            # length, which skipped padding whenever the whole waveform was shorter than the
            # window, producing ragged frames that break `np.stack` / downstream FFTs.
            if frame_width < fft_window_size:
                frame = np.pad(
                    frame, pad_width=(0, fft_window_size - frame_width), mode="constant", constant_values=0
                )
        frames.append(frame)
    frames = np.stack(frames, 0)
    return frames
# TODO @ArthurZucker: This method does not support batching yet as we are mainly focus on inference.
def stft(frames: np.array, windowing_function: np.array, fft_window_size: int = None):
    """
    Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. Should give the same results
    as `torch.stft`.

    Args:
        frames (`np.array` of dimension `(num_frames, fft_window_size)`):
            A framed audio signal obtained using `audio_utils.fram_wav`.
        windowing_function (`np.array` of dimension `(frame_size,)`, *optional*):
            An array representing the function that will be used to reduce the amplitude of the discontinuities at the
            boundaries of each frame when computing the STFT. Each frame will be multiplied by the windowing_function.
            For more information on the discontinuities, called *Spectral leakage*, refer to [this
            tutorial]https://download.ni.com/evaluation/pxi/Understanding%20FFTs%20and%20Windowing.pdf
        fft_window_size (`int`, *optional*):
            Size of the window on which the Fourier transform is applied. This controls the frequency resolution of the
            spectrogram. 400 means that the fourier transform is computed on windows of 400 samples. The number of
            frequency bins (`nb_frequency_bins`) used to divide the window into equal strips is equal to
            `(1+fft_window_size)//2`. An increase of the fft_window_size slows the calculus time proportionally.

    Example:

    ```python
    >>> from transformers.audio_utils import stft, fram_wave
    >>> import numpy as np

    >>> audio = np.random.rand(50)
    >>> fft_window_size = 10
    >>> hop_length = 2
    >>> framed_audio = fram_wave(audio, hop_length, fft_window_size)
    >>> spectrogram = stft(framed_audio, np.hanning(fft_window_size))
    ```

    Returns:
        spectrogram (`np.ndarray`):
            A complex spectrogram of shape `(nb_frequency_bins, num_frames)` (note the transpose relative to the
            input frames) obtained using the STFT algorithm.
    """
    frame_size = frames.shape[1]

    if fft_window_size is None:
        fft_window_size = frame_size
    if fft_window_size < frame_size:
        raise ValueError("FFT size must greater or equal the frame size")

    # Real input produces a symmetric spectrum, so only the first half of the FFT bins is kept.
    nb_frequency_bins = (fft_window_size >> 1) + 1

    spectrogram = np.empty((len(frames), nb_frequency_bins), dtype=np.complex64)
    # Reusable buffer; trailing zeros act as zero-padding when fft_window_size > frame_size.
    fft_signal = np.zeros(fft_window_size)

    for frame_index, frame in enumerate(frames):
        if windowing_function is None:
            fft_signal[:frame_size] = frame
        else:
            # Write the windowed frame directly into the buffer to avoid a temporary array.
            np.multiply(frame, windowing_function, out=fft_signal[:frame_size])
        spectrogram[frame_index] = fft(fft_signal, axis=0)[:nb_frequency_bins]
    return spectrogram.T