Persian_ASR / sad_tf /sidekit_mfcc.py
imansarraf's picture
Upload 9 files
84b1bab verified
raw
history blame
14.1 kB
# -*- coding: utf-8 -*-
#
# This file is part of SIDEKIT.
#
# The following code has been copy-pasted from SIDEKIT source files:
# frontend/features.py frontend/io.py frontend/vad.py
#
# SIDEKIT is a python package for speaker verification.
# Home page: http://www-lium.univ-lemans.fr/sidekit/
#
# SIDEKIT is a python package for speaker verification.
# Home page: http://www-lium.univ-lemans.fr/sidekit/
#
# SIDEKIT is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3 of the License,
# or (at your option) any later version.
#
# SIDEKIT is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
"""
Copyright 2014-2021 Anthony Larcher and Sylvain Meignier
:mod:`frontend` provides methods to process an audio signal in order to extract
useful parameters for speaker verification.
"""
import numpy
import soundfile
import scipy
from scipy.fftpack.realtransforms import dct
# Module-level metadata attributes.
__author__ = "Anthony Larcher and Sylvain Meignier"
__copyright__ = "Copyright 2014-2021 Anthony Larcher and Sylvain Meignier"
__license__ = "LGPL"
__maintainer__ = "Anthony Larcher"
# NOTE(review): the address below looks like it was obfuscated by the page
# this file was copied from -- confirm against the upstream SIDEKIT sources.
__email__ = "[email protected]"
__status__ = "Production"
__docformat__ = 'reStructuredText'
# dtype name passed to soundfile.read() by read_wav().
wav_flag = "float32" # Could be "int16"
# Numeric type of the arrays allocated by trfbank() and power_spectrum().
PARAM_TYPE = numpy.float32
def read_wav(input_file_name):
    """Read an audio file with ``soundfile`` and return it as float32 samples.

    :param input_file_name: audio file to read, forwarded as-is to
        ``soundfile.info`` and ``soundfile.read``
    :return: a tuple ``(sig, sample_rate, 4)``; ``sig`` is the signal as a
        float32 array arranged as (frames, channels) with singleton
        dimensions squeezed out, ``sample_rate`` is the rate reported by
        ``soundfile.read`` and the last element is the constant 4
        (presumably a sample width in bytes -- confirm with callers)
    """
    channel_count = soundfile.info(input_file_name).channels
    samples, sample_rate = soundfile.read(input_file_name, dtype=wav_flag)
    # One column per channel; squeeze() turns a mono signal into a 1-D array.
    samples = numpy.array(samples).reshape((-1, channel_count)).squeeze()
    return samples.astype(numpy.float32), sample_rate, 4
def hz2mel(f, htk=True):
    """Convert frequencies in Hz to the mel scale.

    :param f: frequency (scalar or array) to convert, in Hz
    :param htk: if True, use the formula ``2595 * log10(1 + f / 700)``;
        otherwise use the piecewise scale (linear below 1000 Hz, logarithmic
        above) matching Slaney's Auditory Toolbox mfcc.m
    :return: the equivalent value(s) on the mel scale. In the non-HTK case
        the result is computed in double precision and a one-element array
        is returned as a scalar.
    """
    if htk:
        return 2595 * numpy.log10(1 + f / 700.)

    # Force floating point: with an integer input, zeros_like() below would
    # allocate an integer output and silently truncate the mel values.
    f = numpy.array(f, dtype=float)

    # Mel fn to match Slaney's Auditory Toolbox mfcc.m
    f_0 = 0.
    f_sp = 200. / 3.
    brkfrq = 1000.
    brkpt = (brkfrq - f_0) / f_sp
    logstep = numpy.exp(numpy.log(6.4) / 27)

    linpts = f < brkfrq
    z = numpy.zeros_like(f)

    # Fill in the linear and logarithmic parts separately
    z[linpts] = (f[linpts] - f_0) / f_sp
    z[~linpts] = brkpt + (numpy.log(f[~linpts] / brkfrq)) / numpy.log(logstep)

    if z.shape == (1,):
        return z[0]
    return z
def mel2hz(z, htk=True):
    """Convert mel-scale values back to frequencies in Hz.

    :param z: mel value (scalar or array) to convert
    :param htk: if True, use the formula ``700 * (10 ** (z / 2595) - 1)``;
        otherwise use the piecewise scale that is linear below the break
        point and exponential above it
    :return: the equivalent value(s) in Hertz; in the non-HTK case a
        one-element array is returned as a scalar
    """
    if htk:
        return 700. * (10**(z / 2595.) - 1)

    mel = numpy.array(z, dtype=float)
    origin_hz = 0
    hz_per_mel = 200. / 3.
    break_hz = 1000.
    break_mel = (break_hz - origin_hz) / hz_per_mel
    log_step = numpy.exp(numpy.log(6.4) / 27)

    below_break = (mel < break_mel)
    above_break = ~below_break
    hz = numpy.zeros_like(mel)

    # Linear segment, then exponential segment
    hz[below_break] = origin_hz + hz_per_mel * mel[below_break]
    hz[above_break] = break_hz * numpy.exp(numpy.log(log_step) * (mel[above_break] - break_mel))

    return hz[0] if hz.shape == (1,) else hz
def trfbank(fs, nfft, lowfreq, maxfreq, nlinfilt, nlogfilt, midfreq=1000):
    """Compute triangular filterbank for cepstral coefficient computation.

    :param fs: sampling frequency of the original signal.
    :param nfft: number of points for the Fourier Transform
    :param lowfreq: lower limit of the frequency band filtered
    :param maxfreq: higher limit of the frequency band filtered
    :param nlinfilt: number of linear filters to use in low frequencies
    :param nlogfilt: number of log-linear filters to use in high frequencies
    :param midfreq: frequency boundary between linear and log-linear filters
    :return: a tuple ``(fbank, frequences)``; ``fbank`` has one row per
        filter and ``floor(nfft / 2) + 1`` columns, ``frequences`` holds the
        ``nlinfilt + nlogfilt + 2`` start/middle/end points of the triangles
        in Hz
    """
    # Total number of filters
    nfilt = nlinfilt + nlogfilt

    # ------------------------
    # Compute the filter bank
    # ------------------------
    # Compute start/middle/end points of the triangular filters in spectral
    # domain
    frequences = numpy.zeros(nfilt + 2, dtype=PARAM_TYPE)
    if nlogfilt == 0:
        linsc = (maxfreq - lowfreq) / (nlinfilt + 1)
        frequences[:nlinfilt + 2] = lowfreq + numpy.arange(nlinfilt + 2) * linsc
    elif nlinfilt == 0:
        low_mel = hz2mel(lowfreq)
        max_mel = hz2mel(maxfreq)
        mels = numpy.zeros(nlogfilt + 2)
        melsc = (max_mel - low_mel) / (nfilt + 1)
        mels[:nlogfilt + 2] = low_mel + numpy.arange(nlogfilt + 2) * melsc
        # Back to the frequency domain
        frequences = mel2hz(mels)
    else:
        # Compute linear filters on [lowfreq; midfreq]
        linsc = (min([midfreq, maxfreq]) - lowfreq) / (nlinfilt + 1)
        frequences[:nlinfilt] = lowfreq + numpy.arange(nlinfilt) * linsc
        # Compute log-linear filters on [midfreq; maxfreq]. The boundary
        # follows ``midfreq`` (it used to be hard-coded to 1000 Hz, which is
        # still the default) so that both parts agree on where they meet.
        low_mel = hz2mel(min([midfreq, maxfreq]))
        max_mel = hz2mel(maxfreq)
        mels = numpy.zeros(nlogfilt + 2, dtype=PARAM_TYPE)
        melsc = (max_mel - low_mel) / (nlogfilt + 1)

        # Verify that mel2hz(melsc)>linsc
        while mel2hz(melsc) < linsc:
            # in this case, we add a linear filter
            nlinfilt += 1
            nlogfilt -= 1
            frequences[:nlinfilt] = lowfreq + numpy.arange(nlinfilt) * linsc
            low_mel = hz2mel(frequences[nlinfilt - 1] + 2 * linsc)
            max_mel = hz2mel(maxfreq)
            mels = numpy.zeros(nlogfilt + 2, dtype=PARAM_TYPE)
            melsc = (max_mel - low_mel) / (nlogfilt + 1)

        mels[:nlogfilt + 2] = low_mel + numpy.arange(nlogfilt + 2) * melsc

        # Back to the frequency domain
        frequences[nlinfilt:] = mel2hz(mels)

    heights = 2. / (frequences[2:] - frequences[0:-2])

    # Compute filterbank coeff (in fft domain, in bins)
    fbank = numpy.zeros((nfilt, int(numpy.floor(nfft / 2)) + 1), dtype=PARAM_TYPE)
    # FFT bins (in Hz)
    n_frequences = numpy.arange(nfft) / (1. * nfft) * fs

    for i in range(nfilt):
        low = frequences[i]
        cen = frequences[i + 1]
        hi = frequences[i + 2]

        # FFT bin just below each corner of the triangle. Plain Python ints
        # are used for the index ranges: the ``numpy.int`` alias is gone from
        # recent NumPy releases and no exception fallback is needed.
        low_bin = int(numpy.floor(low * nfft / fs))
        cen_bin = int(numpy.floor(cen * nfft / fs))
        hi_bin = int(numpy.floor(hi * nfft / fs))

        lid = numpy.arange(low_bin + 1, cen_bin + 1, dtype=int)
        left_slope = heights[i] / (cen - low)
        rid = numpy.arange(cen_bin + 1, min(hi_bin + 1, nfft), dtype=int)
        right_slope = heights[i] / (hi - cen)

        # Rising edge, then falling edge (the last bin of ``rid`` is left out)
        fbank[i][lid] = left_slope * (n_frequences[lid] - low)
        fbank[i][rid[:-1]] = right_slope * (hi - n_frequences[rid[:-1]])

    return fbank, frequences
def power_spectrum(input_sig,
                   fs=8000,
                   win_time=0.025,
                   shift=0.01,
                   prefac=0.97):
    """Compute the power spectrum and log-energy of each frame of a signal.

    The signal is cut into frames, pre-emphasis is applied per frame, and the
    squared magnitude of the FFT of each Hanning-windowed frame is returned.

    :param input_sig: signal to analyse, passed to ``framing``
    :param fs: sampling frequency of the signal
    :param win_time: length of the analysis window, in seconds
    :param shift: shift between two consecutive windows, in seconds
    :param prefac: pre-emphasis coefficient passed to ``pre_emphasis``
    :return: a tuple ``(spec, log_energy)``; ``spec`` has one row per frame
        and ``n_fft / 2 + 1`` columns, where ``n_fft`` is the window length
        in samples rounded up to a power of two; ``log_energy`` is the
        natural log of the sum of squares of each pre-emphasised frame
        (taken before windowing)
    """
    samples_per_window = int(round(win_time * fs))
    samples_per_shift = int(shift * fs)
    frames = framing(input_sig, samples_per_window, win_shift=samples_per_shift).copy()

    # Pre-emphasis filtering is applied after framing to be consistent with stream processing
    frames = pre_emphasis(frames, prefac)

    n_frames = frames.shape[0]
    n_fft = 2 ** int(numpy.ceil(numpy.log2(samples_per_window)))

    # Hanning window (a Hamming window was used previously)
    taper = numpy.hanning(samples_per_window)

    spec = numpy.ones((n_frames, n_fft // 2 + 1), dtype=PARAM_TYPE)
    log_energy = numpy.log((frames**2).sum(axis=1))

    # Process the frames by chunks to bound the size of the FFT temporaries
    chunk = 500000
    for first in range(0, n_frames, chunk):
        last = min(first + chunk, n_frames)
        spectrum = numpy.fft.rfft(frames[first:last, :] * taper, n_fft, axis=-1)
        spec[first:last, :] = spectrum.real**2 + spectrum.imag**2

    return spec, log_energy
def framing(sig, win_size, win_shift=1, context=(0, 0), pad='zeros'):
    """Cut a signal into overlapping frames using a strided view.

    :param sig: input signal, can be mono or multi dimensional
    :param win_size: size of the window in term of samples
    :param win_shift: shift of the sliding window in term of samples
    :param context: tuple of left and right context, in samples, added on
        each side of every window by padding the signal
    :param pad: how the context is filled, can be 'zeros' or 'edge'
    :return: a view on the padded signal holding one frame of
        ``win_size + sum(context)`` samples every ``win_shift`` samples, with
        singleton dimensions squeezed out. Frames overlap in memory, so copy
        the result before writing to it. A signal shorter than ``win_size``
        yields zero frames.
    :raise ValueError: if ``pad`` is neither 'zeros' nor 'edge'
    """
    if pad == 'zeros':
        pad_options = {'mode': 'constant', 'constant_values': (0,)}
    elif pad == 'edge':
        pad_options = {'mode': 'edge'}
    else:
        # Previously an unknown mode silently returned None
        raise ValueError("pad must be 'zeros' or 'edge', got {!r}".format(pad))

    dsize = sig.dtype.itemsize
    if sig.ndim == 1:
        sig = sig[:, numpy.newaxis]

    # Manage padding: only the time axis receives the context
    c = (context, ) + (sig.ndim - 1) * ((0, 0), )
    _win_size = win_size + sum(context)

    # Never ask for a frame when the signal is shorter than one window:
    # the strided view would otherwise read past the end of the buffer.
    n_frames = max(0, int((sig.shape[0] - win_size) // win_shift) + 1)
    shape = (n_frames, 1, _win_size, sig.shape[1])
    strides = tuple(x * dsize for x in (win_shift * sig.shape[1], 1, sig.shape[1], 1))

    # numpy.pad is the public entry point (the numpy.lib.pad alias is not
    # available in every NumPy release). The strides above assume a
    # C-contiguous buffer, so enforce it.
    padded = numpy.ascontiguousarray(numpy.pad(sig, c, **pad_options))
    return numpy.lib.stride_tricks.as_strided(padded,
                                              shape=shape,
                                              strides=strides).squeeze()
def pre_emphasis(input_sig, pre):
    """Apply a first-order pre-emphasis filter along the last axis.

    Each sample has ``pre`` times the previous sample subtracted from it; the
    first sample of each row uses itself as its predecessor.

    :param input_sig: the signal to pre-emphasize, either a 1-D vector or an
        array whose last axis is time
    :param pre: value that defines the pre-emphasis filter
    :return: the filtered signal, with the same shape as ``input_sig``
    """
    # Signal delayed by one sample, with the first sample repeated
    delayed = numpy.concatenate((input_sig[..., :1], input_sig[..., :-1]), axis=-1)
    return input_sig - delayed * pre
def mfcc(input_sig,
         lowfreq=100, maxfreq=8000,
         nlinfilt=0, nlogfilt=24,
         nwin=0.025,
         fs=16000,
         nceps=13,
         shift=0.01,
         get_spec=False,
         get_mspec=False,
         prefac=0.97):
    """Compute the log-energy, power spectrum and log mel spectrum of a signal.

    This version does not compute the cepstral coefficients: the DCT step is
    disabled, so the first slot of the returned list is always None.

    :param input_sig: input signal from which the features are computed.
    :param lowfreq: lower limit of the frequency band filtered.
            Default is 100Hz.
    :param maxfreq: higher limit of the frequency band filtered.
            Default is 8000Hz.
    :param nlinfilt: number of linear filters to use in low frequencies.
            Default is 0.
    :param nlogfilt: number of log-linear filters to use in high frequencies.
            Default is 24.
    :param nwin: length of the sliding window in seconds
            Default is 0.025.
    :param fs: sampling frequency of the original signal. Default is 16000Hz.
    :param nceps: number of cepstral coefficients to extract. Unused here,
            kept so that existing calls keep working. Default is 13.
    :param shift: shift between two analyses. Default is 0.01 (10ms).
    :param get_spec: boolean, if true returns the spectrogram
    :param get_mspec: boolean, if true returns the output of the filter banks
    :param prefac: pre-emphasis filter value

    :return: a list of four elements ``[None, log_energy, spec, mspec]``;
        ``spec`` is None unless ``get_spec`` is true and ``mspec`` is None
        unless ``get_mspec`` is true.

    .. note:: The features are computed as follows:

            - Framing, pre-emphasis and power spectrum of the Hanning-windowed
              frames (see ``power_spectrum``)
            - Filter the spectrum with a triangular filter-bank (see
              ``trfbank``) and take the natural logarithm of the result
    """
    # Compute power spectrum
    spec, log_energy = power_spectrum(input_sig,
                                      fs,
                                      win_time=nwin,
                                      shift=shift,
                                      prefac=prefac)

    # The filter bank is only needed for the mel spectrum, so it is built
    # only when that output is requested.
    mspec = None
    if get_mspec:
        # Filter the spectrum through the triangle filter-bank
        n_fft = 2 ** int(numpy.ceil(numpy.log2(int(round(nwin * fs)))))
        fbank = trfbank(fs, n_fft, lowfreq, maxfreq, nlinfilt, nlogfilt)[0]
        mspec = numpy.log(numpy.dot(spec, fbank.T))

    # First slot is reserved for the cepstra, which are not computed here
    return [None, log_energy, spec if get_spec else None, mspec]