Spaces:

imansarraf
/

Persian_ASR

Sleeping

App Files Files Community

Persian_ASR / sad_tf /sidekit_mfcc.py

imansarraf

Upload 9 files

84b1bab verified 3 months ago

raw

history blame

14.1 kB


	# -*- coding: utf-8 -*-
	#
	# This file is part of SIDEKIT.
	#
	# The following code has been copy-pasted from SIDEKIT source files:
	# frontend/features.py frontend/io.py frontend/vad.py
	#
	# SIDEKIT is a python package for speaker verification.
	# Home page: http://www-lium.univ-lemans.fr/sidekit/
	#
	# SIDEKIT is a python package for speaker verification.
	# Home page: http://www-lium.univ-lemans.fr/sidekit/
	#
	# SIDEKIT is free software: you can redistribute it and/or modify
	# it under the terms of the GNU Lesser General Public License as
	# published by the Free Software Foundation, either version 3 of the License,
	# or (at your option) any later version.
	#
	# SIDEKIT is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	# GNU Lesser General Public License for more details.
	#
	# You should have received a copy of the GNU Lesser General Public License
	# along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.

	"""
	Copyright 2014-2021 Anthony Larcher and Sylvain Meignier

	:mod:`frontend` provides methods to process an audio signal in order to extract
	useful parameters for speaker verification.
	"""


	import numpy
	import soundfile
	import scipy
	from scipy.fftpack.realtransforms import dct


	# Module metadata exposed as dunder attributes.
	__author__ = "Anthony Larcher and Sylvain Meignier"
	__copyright__ = "Copyright 2014-2021 Anthony Larcher and Sylvain Meignier"
	__license__ = "LGPL"
	__maintainer__ = "Anthony Larcher"
	# NOTE(review): the value below looks like an e-mail obfuscation placeholder
	# rather than a real address -- confirm and restore the maintainer's address.
	__email__ = "[email protected]"
	__status__ = "Production"
	__docformat__ = 'reStructuredText'




	# dtype name passed to soundfile.read() by read_wav().
	wav_flag = "float32" # Could be "int16"
	# Floating-point type used for the arrays allocated in this module
	# (filter edge frequencies, filter bank and power spectrum).
	PARAM_TYPE = numpy.float32


	def read_wav(input_file_name):
	    """Load an audio file with ``soundfile`` and return its samples.

	    :param input_file_name: audio file to read, passed unchanged to
	        ``soundfile.info`` and ``soundfile.read``.
	    :return: a tuple ``(sig, sample_rate, 4)``. ``sig`` holds the samples as
	        ``numpy.float32`` with singleton dimensions removed, ``sample_rate``
	        is the rate reported by ``soundfile.read`` and the last element is
	        the constant ``4`` (presumably the sample width in bytes of float32
	        data -- confirm against callers).
	    """
	    # The file header is only consulted for the channel count.
	    header = soundfile.info(input_file_name)
	    samples, sample_rate = soundfile.read(input_file_name, dtype=wav_flag)
	    # Force a (frames, channels) layout, then drop singleton axes.
	    samples = numpy.array(samples).reshape((-1, header.channels)).squeeze()
	    return samples.astype(numpy.float32), sample_rate, 4




	def hz2mel(f, htk=True):
	    """Convert frequencies in Hz to the mel scale.

	    :param f: frequency, or array of frequencies, in Hz.
	    :param htk: if True (default) use the HTK formula
	        ``2595 * log10(1 + f / 700)``; otherwise use the piecewise
	        linear / logarithmic mel function matching Slaney's Auditory
	        Toolbox ``mfcc.m``.

	    :return: the equivalent value(s) on the mel scale. In the Slaney branch a
	        1-D input holding a single element yields a scalar.
	    """
	    if htk:
	        return 2595 * numpy.log10(1 + f / 700.)

	    # Work in floating point: with an integer input the output buffer created
	    # by zeros_like() would be integer too and silently truncate the result.
	    f = numpy.asarray(f, dtype=float)

	    # Mel fn to match Slaney's Auditory Toolbox mfcc.m
	    f_0 = 0.
	    f_sp = 200. / 3.
	    brkfrq = 1000.
	    # Mel value reached at the break frequency.
	    brkpt = (brkfrq - f_0) / f_sp
	    logstep = numpy.exp(numpy.log(6.4) / 27)

	    # Frequencies below the break frequency are mapped linearly.
	    linpts = f < brkfrq

	    z = numpy.zeros_like(f)
	    # Fill in the linear and the logarithmic parts separately.
	    z[linpts] = (f[linpts] - f_0) / f_sp
	    z[~linpts] = brkpt + numpy.log(f[~linpts] / brkfrq) / numpy.log(logstep)

	    if z.shape == (1,):
	        return z[0]
	    return z

	def mel2hz(z, htk=True):
	    """Map mel-scale values back to frequencies in Hz.

	    :param z: mel value, or array of mel values, to convert.
	    :param htk: if True (default) invert the HTK formula, i.e. return
	        ``700 * (10 ** (z / 2595) - 1)``; otherwise invert the piecewise
	        linear / logarithmic mel function.

	    :return: the equivalent value(s) in Hertz. In the non-HTK branch a 1-D
	        input holding a single element yields a scalar.
	    """
	    if htk:
	        return 700. * (10**(z / 2595.) - 1)

	    mels = numpy.array(z, dtype=float)
	    origin_hz = 0
	    linear_step = 200. / 3.
	    break_hz = 1000.
	    # Mel value at which the scale switches from linear to logarithmic.
	    break_mel = (break_hz - origin_hz) / linear_step
	    log_ratio = numpy.exp(numpy.log(6.4) / 27)

	    in_linear_part = (mels < break_mel)
	    in_log_part = ~in_linear_part

	    hertz = numpy.zeros_like(mels)
	    # Each part of the scale is converted with its own formula.
	    hertz[in_linear_part] = origin_hz + linear_step * mels[in_linear_part]
	    hertz[in_log_part] = break_hz * numpy.exp(numpy.log(log_ratio) * (mels[in_log_part] - break_mel))

	    return hertz[0] if hertz.shape == (1,) else hertz



	def trfbank(fs, nfft, lowfreq, maxfreq, nlinfilt, nlogfilt, midfreq=1000):
	    """Compute triangular filterbank for cepstral coefficient computation.

	    :param fs: sampling frequency of the original signal.
	    :param nfft: number of points for the Fourier Transform
	    :param lowfreq: lower limit of the frequency band filtered
	    :param maxfreq: higher limit of the frequency band filtered
	    :param nlinfilt: number of linear filters to use in low frequencies
	    :param nlogfilt: number of log-linear filters to use in high frequencies
	    :param midfreq: frequency boundary between linear and log-linear filters

	    :return: a tuple ``(fbank, frequences)``. ``fbank`` has one row per filter
	        and ``floor(nfft / 2) + 1`` columns (one per FFT bin). ``frequences``
	        holds the ``nlinfilt + nlogfilt + 2`` edge frequencies in Hz: filter
	        ``i`` starts at ``frequences[i]``, peaks at ``frequences[i + 1]`` and
	        ends at ``frequences[i + 2]``.
	    """
	    # Total number of filters
	    nfilt = nlinfilt + nlogfilt

	    # ------------------------
	    # Compute the filter bank
	    # ------------------------
	    # Compute start/middle/end points of the triangular filters in spectral
	    # domain
	    frequences = numpy.zeros(nfilt + 2, dtype=PARAM_TYPE)
	    if nlogfilt == 0:
	        # Linear filters only: edges evenly spaced in Hz.
	        linsc = (maxfreq - lowfreq) / (nlinfilt + 1)
	        frequences[:nlinfilt + 2] = lowfreq + numpy.arange(nlinfilt + 2) * linsc
	    elif nlinfilt == 0:
	        # Log filters only: edges evenly spaced on the mel scale.
	        low_mel = hz2mel(lowfreq)
	        max_mel = hz2mel(maxfreq)
	        mels = numpy.zeros(nlogfilt + 2)
	        melsc = (max_mel - low_mel) / (nfilt + 1)
	        mels[:nlogfilt + 2] = low_mel + numpy.arange(nlogfilt + 2) * melsc
	        # Back to the frequency domain
	        frequences = mel2hz(mels)
	    else:
	        # Compute linear filters on [lowfreq; min(midfreq, maxfreq)]
	        linsc = (min([midfreq, maxfreq]) - lowfreq) / (nlinfilt + 1)
	        frequences[:nlinfilt] = lowfreq + numpy.arange(nlinfilt) * linsc
	        # Compute log-linear filters up to maxfreq.
	        # NOTE(review): the lower bound is hard-coded to 1000 Hz and ignores
	        # ``midfreq`` -- confirm whether ``midfreq`` was intended here.
	        low_mel = hz2mel(min([1000, maxfreq]))
	        max_mel = hz2mel(maxfreq)
	        mels = numpy.zeros(nlogfilt + 2, dtype=PARAM_TYPE)
	        melsc = (max_mel - low_mel) / (nlogfilt + 1)

	        # Verify that mel2hz(melsc)>linsc
	        while mel2hz(melsc) < linsc:
	            # in this case, we add a linear filter
	            nlinfilt += 1
	            nlogfilt -= 1
	            frequences[:nlinfilt] = lowfreq + numpy.arange(nlinfilt) * linsc
	            low_mel = hz2mel(frequences[nlinfilt - 1] + 2 * linsc)
	            max_mel = hz2mel(maxfreq)
	            mels = numpy.zeros(nlogfilt + 2, dtype=PARAM_TYPE)
	            melsc = (max_mel - low_mel) / (nlogfilt + 1)

	        mels[:nlogfilt + 2] = low_mel + numpy.arange(nlogfilt + 2) * melsc
	        # Back to the frequency domain
	        frequences[nlinfilt:] = mel2hz(mels)

	    # Peak height of each triangle: 2 / (width of its base in Hz).
	    heights = 2. / (frequences[2:] - frequences[0:-2])

	    # Compute filterbank coeff (in fft domain, in bins)
	    fbank = numpy.zeros((nfilt, int(numpy.floor(nfft / 2)) + 1), dtype=PARAM_TYPE)
	    # FFT bins (in Hz)
	    n_frequences = numpy.arange(nfft) / (1. * nfft) * fs

	    for i in range(nfilt):
	        low = frequences[i]
	        cen = frequences[i + 1]
	        hi = frequences[i + 2]

	        # First bin after each edge. The builtin ``int`` dtype replaces the
	        # former ``numpy.int`` alias, which recent NumPy releases no longer
	        # provide, so no exception fallback is needed.
	        first_rising_bin = numpy.floor(low * nfft / fs) + 1
	        first_falling_bin = numpy.floor(cen * nfft / fs) + 1
	        end_bin = min(numpy.floor(hi * nfft / fs) + 1, nfft)

	        # Bins on the rising and on the falling side of the triangle.
	        lid = numpy.arange(first_rising_bin, first_falling_bin, dtype=int)
	        rid = numpy.arange(first_falling_bin, end_bin, dtype=int)

	        left_slope = heights[i] / (cen - low)
	        right_slope = heights[i] / (hi - cen)
	        fbank[i][lid] = left_slope * (n_frequences[lid] - low)
	        # The last falling-side bin is left at zero.
	        fbank[i][rid[:-1]] = right_slope * (hi - n_frequences[rid[:-1]])

	    return fbank, frequences


	def power_spectrum(input_sig,
	                   fs=8000,
	                   win_time=0.025,
	                   shift=0.01,
	                   prefac=0.97):
	    """
	    Compute the power spectrum of the signal.

	    The signal is cut into overlapping frames, each frame is pre-emphasised,
	    multiplied by a Hanning window and transformed with a real FFT whose size
	    is the next power of two of the window length.

	    :param input_sig: input signal as a numpy array (passed to ``framing``).
	    :param fs: sampling frequency in Hz.
	    :param win_time: length of the analysis window in seconds.
	    :param shift: shift between two consecutive frames in seconds.
	    :param prefac: pre-emphasis coefficient.
	    :return: a tuple ``(spec, log_energy)``. ``spec`` has one row per frame
	        and ``n_fft / 2 + 1`` columns holding the squared magnitude of the
	        FFT. ``log_energy`` is the natural log of the sum of the squared
	        pre-emphasised samples of each frame (computed before windowing).
	    """
	    window_length = int(round(win_time * fs))
	    # Hop between two consecutive frames, in samples.
	    win_shift = int(shift * fs)
	    framed = framing(input_sig, window_length, win_shift=win_shift).copy()
	    # framing() squeezes its result, so a signal producing a single frame
	    # comes back 1-D; restore the (frame, sample) layout used below.
	    framed = numpy.atleast_2d(framed)
	    # Pre-emphasis filtering is applied after framing to be consistent with stream processing
	    framed = pre_emphasis(framed, prefac)
	    n_frames = framed.shape[0]
	    n_fft = 2 ** int(numpy.ceil(numpy.log2(window_length)))
	    # Windowing has been changed to hanning which is supposed to have less noisy sidelobes
	    window = numpy.hanning(window_length)

	    spec = numpy.ones((n_frames, int(n_fft / 2) + 1), dtype=PARAM_TYPE)
	    log_energy = numpy.log((framed**2).sum(axis=1))

	    # Process the frames in chunks to bound the size of the FFT temporaries.
	    dec = 500000
	    for start in range(0, n_frames, dec):
	        stop = min(start + dec, n_frames)
	        ahan = framed[start:stop, :] * window
	        mag = numpy.fft.rfft(ahan, n_fft, axis=-1)
	        # Power spectrum = squared magnitude of the complex FFT.
	        spec[start:stop, :] = mag.real**2 + mag.imag**2

	    return spec, log_energy


	def framing(sig, win_size, win_shift=1, context=(0, 0), pad='zeros'):
	    """Cut a signal into overlapping frames without copying the samples.

	    :param sig: input signal as a numpy array, can be mono or multi dimensional
	    :param win_size: size of the window in term of samples
	    :param win_shift: shift of the sliding window in term of samples
	    :param context: tuple of left and right context, i.e. the number of
	        samples added before and after the signal; every frame is extended
	        by ``sum(context)`` samples
	    :param pad: how the context is filled, can be ``'zeros'`` or ``'edge'``
	    :return: a strided view on the padded signal holding one frame per
	        position of the sliding window, with singleton dimensions removed.
	        Overlapping frames share memory: copy the result before modifying it.
	    :raises ValueError: if ``pad`` is not a supported mode or if the signal
	        holds fewer than ``win_size`` samples.
	    """
	    if pad == 'zeros':
	        pad_options = {'mode': 'constant', 'constant_values': (0,)}
	    elif pad == 'edge':
	        pad_options = {'mode': 'edge'}
	    else:
	        # An unknown mode used to fall through and return None silently.
	        raise ValueError("pad must be 'zeros' or 'edge', got %r" % (pad,))

	    dsize = sig.dtype.itemsize
	    if sig.ndim == 1:
	        sig = sig[:, numpy.newaxis]
	    if sig.shape[0] < win_size:
	        # The strided view below would otherwise read past the end of the buffer.
	        raise ValueError("signal is shorter than one window (%d < %d samples)"
	                         % (sig.shape[0], win_size))

	    # Manage padding: only the time axis receives the context.
	    c = (context, ) + (sig.ndim - 1) * ((0, 0), )
	    _win_size = win_size + sum(context)
	    shape = (int((sig.shape[0] - win_size) / win_shift) + 1, 1, _win_size, sig.shape[1])
	    # Strides in bytes: one frame hop, a dummy axis, one sample, one channel.
	    strides = tuple(dsize * step for step in (win_shift * sig.shape[1], 1, sig.shape[1], 1))

	    # numpy.pad is the public entry point; the numpy.lib.pad alias is not
	    # available in recent NumPy releases.
	    padded = numpy.pad(sig, c, **pad_options)
	    return numpy.lib.stride_tricks.as_strided(padded,
	                                              shape=shape,
	                                              strides=strides).squeeze()


	def pre_emphasis(input_sig, pre):
	    """Apply a first-order pre-emphasis filter along the last axis.

	    Each sample has ``pre`` times the previous sample subtracted from it; the
	    first sample of every row uses itself as its predecessor.

	    :param input_sig: numpy array holding the signal (1-D) or one frame per
	        row (2-D or more).
	    :param pre: value that defines the pre-emphasis filter.
	    :return: the filtered signal, with the same shape as ``input_sig``.
	    """
	    if input_sig.ndim == 1:
	        # Delayed copy of the signal, with the first sample repeated.
	        delayed = numpy.concatenate((input_sig[:1], input_sig[:-1]))
	        return input_sig - delayed * pre

	    # Same delay applied independently to every row.
	    delayed = numpy.concatenate((input_sig[..., :1], input_sig[..., :-1]), axis=-1)
	    return input_sig - delayed * pre


	def mfcc(input_sig,
	         lowfreq=100, maxfreq=8000,
	         nlinfilt=0, nlogfilt=24,
	         nwin=0.025,
	         fs=16000,
	         nceps=13,
	         shift=0.01,
	         get_spec=False,
	         get_mspec=False,
	         prefac=0.97):
	    """Compute the log-energy, power spectrum and log mel spectrum of a signal.

	    The cepstral (DCT) step is disabled in this implementation, so no
	    cepstral coefficient is returned.

	    :param input_sig: input signal from which the features are computed.
	    :param lowfreq: lower limit of the frequency band filtered.
	            Default is 100Hz.
	    :param maxfreq: higher limit of the frequency band filtered.
	            Default is 8000Hz.
	    :param nlinfilt: number of linear filters to use in low frequencies.
	            Default is 0.
	    :param nlogfilt: number of log-linear filters to use in high frequencies.
	            Default is 24.
	    :param nwin: length of the sliding window in seconds
	            Default is 0.025.
	    :param fs: sampling frequency of the original signal. Default is 16000Hz.
	    :param nceps: number of cepstral coefficients to extract. Accepted for
	            interface compatibility but not used by this implementation.
	    :param shift: shift between two analyses. Default is 0.01 (10ms).
	    :param get_spec: boolean, if true returns the spectrogram
	    :param get_mspec: boolean, if true returns the output of the filter banks
	    :param prefac: pre-emphasis filter value

	    :return: a list ``[None, log_energy, spec, mspec]``. The first slot (where
	        cepstral coefficients would go) is always ``None``; ``spec`` is the
	        power spectrum or ``None`` when ``get_spec`` is false; ``mspec`` is
	        the natural log of the filter-bank outputs or ``None`` when
	        ``get_mspec`` is false.

	    .. note:: Features are computed as follows:

	            - Framing, then pre-emphasis of each frame
	            - Compute the power spectrum by windowing with a Hanning window
	            - Filter the spectrum with a triangular filter-bank
	            - Take the log of the filter-bank outputs
	    """
	    # Compute power spectrum
	    spec, log_energy = power_spectrum(input_sig,
	                                      fs,
	                                      win_time=nwin,
	                                      shift=shift,
	                                      prefac=prefac)

	    mspec = None
	    if get_mspec:
	        # The filter bank is only built when its output is requested.
	        # Filter the spectrum through the triangle filter-bank
	        n_fft = 2 ** int(numpy.ceil(numpy.log2(int(round(nwin * fs)))))
	        fbank = trfbank(fs, n_fft, lowfreq, maxfreq, nlinfilt, nlogfilt)[0]
	        mspec = numpy.log(numpy.dot(spec, fbank.T))

	    # Slot 0 is reserved for the cepstral coefficients, which are not computed.
	    return [None, log_energy, spec if get_spec else None, mspec]