Spaces:
Running
Running
alibabasglab
committed on
Upload 73 files
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- __init__.py +169 -0
- __pycache__/basis.cpython-38.pyc +0 -0
- __pycache__/metric_loader.cpython-38.pyc +0 -0
- __pycache__/metrics.cpython-38.pyc +0 -0
- __pycache__/speechscore.cpython-38.pyc +0 -0
- audios/clean/audio_1.wav +0 -0
- audios/clean/audio_2.wav +0 -0
- audios/noisy/audio_1.wav +0 -0
- audios/noisy/audio_2.wav +0 -0
- audios/ref.wav +0 -0
- audios/test.wav +0 -0
- basis.py +113 -0
- demo.py +29 -0
- requirement.txt +5 -0
- scores/__init__.py +0 -0
- scores/__pycache__/__init__.cpython-38.pyc +0 -0
- scores/__pycache__/bsseval.cpython-38.pyc +0 -0
- scores/__pycache__/cbak.cpython-38.pyc +0 -0
- scores/__pycache__/covl.cpython-38.pyc +0 -0
- scores/__pycache__/csig.cpython-38.pyc +0 -0
- scores/__pycache__/fwsegsnr.cpython-38.pyc +0 -0
- scores/__pycache__/helper.cpython-38.pyc +0 -0
- scores/__pycache__/llr.cpython-38.pyc +0 -0
- scores/__pycache__/lsd.cpython-38.pyc +0 -0
- scores/__pycache__/mcd.cpython-38.pyc +0 -0
- scores/__pycache__/nb_pesq.cpython-38.pyc +0 -0
- scores/__pycache__/pesq.cpython-38.pyc +0 -0
- scores/__pycache__/sisdr.cpython-38.pyc +0 -0
- scores/__pycache__/snr.cpython-38.pyc +0 -0
- scores/__pycache__/ssnr.cpython-38.pyc +0 -0
- scores/__pycache__/stoi.cpython-38.pyc +0 -0
- scores/bsseval.py +21 -0
- scores/cbak.py +37 -0
- scores/covl.py +39 -0
- scores/csig.py +38 -0
- scores/dnsmos/DNSMOS/bak_ovr.onnx +3 -0
- scores/dnsmos/DNSMOS/model_v8.onnx +3 -0
- scores/dnsmos/DNSMOS/sig.onnx +3 -0
- scores/dnsmos/DNSMOS/sig_bak_ovr.onnx +3 -0
- scores/dnsmos/__pycache__/dnsmos.cpython-38.pyc +0 -0
- scores/dnsmos/dnsmos.py +94 -0
- scores/fwsegsnr.py +49 -0
- scores/helper.py +307 -0
- scores/helper_bk.py +438 -0
- scores/llr.py +66 -0
- scores/lsd.py +30 -0
- scores/mcd.py +136 -0
- scores/mosnet/__init__.py +21 -0
- scores/mosnet/__pycache__/__init__.cpython-38.pyc +0 -0
- scores/mosnet/cnn_blstm.h5 +3 -0
__init__.py
ADDED
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
class Metric:
    """Base class for a speech metric.

    Subclasses implement `test_window`; `test` handles loading, resampling,
    channel handling, zero-padding and windowing before delegating to it.
    """

    def __init__(self, name, window, hop=None, verbose=False):
        # the metric operates on some fixed rate only or only on mono ?
        self.fixed_rate = None
        self.mono = False

        # is the metric absolute or relative ?
        # (absolute metrics need a single signal, no reference)
        self.absolute = False

        # length and hop of windows (in seconds; hop defaults to the window
        # length, i.e. non-overlapping windows)
        self.window = window
        if hop is None:
            hop = window
        self.hop = hop
        self.name = name
        self.verbose = verbose

    def test_window(self, audios, rate):
        # To be specialised by each concrete metric: score one window.
        raise NotImplementedError

    def test(self, *test_files, array_rate=None):
        """loading sound files and making sure they all have the same lengths
        (zero-padding to the largest). Also works with numpy arrays.
        Then, calling the `test_window` function that should be specialised
        depending on the metric."""

        # imports (kept local so the package imports without these deps)
        import soundfile as sf
        import resampy
        from museval.metrics import Framing
        import numpy as np

        audios = []
        maxlen = 0
        if isinstance(test_files, str):
            test_files = [test_files]
        # an absolute metric only needs one signal; drop any extras
        if self.absolute and len(test_files) > 1:
            if self.verbose:
                print(' [%s] is absolute. Processing first file only'
                      % self.name)
            test_files = [test_files[0],]

        for file in test_files:
            # Loading sound file (file may also already be a numpy array)
            if isinstance(file, str):
                audio, rate = sf.read(file, always_2d=True)
            else:
                rate = array_rate
                if rate is None:
                    raise ValueError('Sampling rate needs to be specified '
                                     'when feeding numpy arrays.')
                audio = file
            # Standardize shapes to (nsamples, nchannels)
            if len(audio.shape) == 1:
                audio = audio[:, None]
            if len(audio.shape) != 2:
                raise ValueError('Please provide 1D or 2D array, received '
                                 '{}D array'.format(len(audio.shape)))

            # resample if the metric requires a specific rate
            if self.fixed_rate is not None and rate != self.fixed_rate:
                if self.verbose:
                    print(' [%s] preferred is %dkHz rate. resampling'
                          % (self.name, self.fixed_rate))
                audio = resampy.resample(audio, rate, self.fixed_rate, axis=0)
                rate = self.fixed_rate
            # mono metrics: keep only the first channel
            if self.mono and audio.shape[1] > 1:
                if self.verbose:
                    print(' [%s] only supports mono. Will use first channel'
                          % self.name)
                audio = audio[..., 0, None]
            if self.mono:
                audio = audio[..., 0]
            maxlen = max(maxlen, audio.shape[0])
            audios += [audio]

        # zero-pad every signal to the longest one so they can be windowed
        # in lockstep
        for index, audio in enumerate(audios):
            if audio.shape[0] != maxlen:
                new = np.zeros((maxlen,) + audio.shape[1:])
                new[:audio.shape[0]] = audio
                audios[index] = new

        if self.window is not None:
            # score window by window; results are collected per metric name
            # into arrays of length nwin
            framer = Framing(self.window * rate,
                             self.hop * rate, maxlen)
            nwin = framer.nwin
            result = {}
            for (t, win) in enumerate(framer):
                result_t = self.test_window([audio[win] for audio in audios],
                                            rate)
                for metric in result_t.keys():
                    if metric not in result.keys():
                        result[metric] = np.empty(nwin)
                    result[metric][t] = result_t[metric]
        else:
            # no windowing: score the full signals in one call
            result = self.test_window(audios, rate)
        return result
|
97 |
+
|
98 |
+
|
99 |
+
import absolute
|
100 |
+
import relative
|
101 |
+
|
102 |
+
|
103 |
+
class MetricsList:
    """A collection of Metric objects that can all be evaluated in one call."""

    def __init__(self):
        # metrics registered so far, in insertion order
        self.metrics = []

    def __add__(self, metric):
        """Register *metric* and return self so additions can be chained."""
        self.metrics.append(metric)
        return self

    def __str__(self):
        names = (entry.name for entry in self.metrics)
        return 'Metrics: ' + ' '.join(names)

    def __call__(self, *files, rate=None):
        """Run every registered metric on *files* and merge all results
        into a single dict keyed by metric name."""
        merged = {}
        for entry in self.metrics:
            outcome = entry.test(*files, array_rate=rate)
            for key, value in outcome.items():
                merged[key] = value
        return merged
|
121 |
+
|
122 |
+
|
123 |
+
def load(metrics='', window=2, verbose=False):
    """ Load the desired metrics inside a Metrics object that can then
    be called to compute all the desired metrics.

    Parameters:
    ----------
    metrics: str or list of str
      the metrics matching any of these will be automatically loaded. this
      match is relative to the structure of the speechmetrics package.
      For instance:
      * 'absolute' will match all absolute metrics
      * 'absolute.srmr' or 'srmr' will only match SRMR
      * '' will match all

    window: float
      the window length to use for testing the files.

    verbose: boolean
      will display information during computations

    Returns:
    --------

    A MetricsList object, that can be run to get the desired metrics
    """
    import pkgutil
    import importlib

    result = MetricsList()

    found_modules = []
    iterator = pkgutil.walk_packages(__path__, __name__ + '.')

    if isinstance(metrics, str):
        metrics = [metrics]
    for module_info in iterator:
        if any(metric in module_info.name for metric in metrics):
            module = importlib.import_module(module_info.name)
            if module not in found_modules:
                # BUG FIX: the original did `found_modules += [module],` —
                # the trailing comma made the RHS a tuple containing a list,
                # so a one-element *list* (not the module) was appended and
                # the `module not in found_modules` check never matched.
                found_modules.append(module)
                if hasattr(module, 'load'):
                    load_function = getattr(module, 'load')
                    new_metric = load_function(window)
                    new_metric.verbose = verbose
                    result += new_metric
                    print('Loaded ', module_info.name)
    return result
|
__pycache__/basis.cpython-38.pyc
ADDED
Binary file (1.57 kB). View file
|
|
__pycache__/metric_loader.cpython-38.pyc
ADDED
Binary file (3.48 kB). View file
|
|
__pycache__/metrics.cpython-38.pyc
ADDED
Binary file (2.72 kB). View file
|
|
__pycache__/speechscore.cpython-38.pyc
ADDED
Binary file (5.95 kB). View file
|
|
audios/clean/audio_1.wav
ADDED
Binary file (76.8 kB). View file
|
|
audios/clean/audio_2.wav
ADDED
Binary file (76.8 kB). View file
|
|
audios/noisy/audio_1.wav
ADDED
Binary file (76.8 kB). View file
|
|
audios/noisy/audio_2.wav
ADDED
Binary file (76.8 kB). View file
|
|
audios/ref.wav
ADDED
Binary file (76.8 kB). View file
|
|
audios/test.wav
ADDED
Binary file (76.8 kB). View file
|
|
basis.py
ADDED
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
class ScoreBasis:
    """Base class for a speech score.

    Subclasses implement `windowed_scoring`; `scoring` handles optional
    resampling and windowing before delegating to it.
    """

    def __init__(self, name=None):
        # the score operates on the specified rate (None: keep input rate)
        self.score_rate = None
        # is the score intrusive or non-intrusive ?
        self.intrusive = True  # require a reference
        self.name = name

    def windowed_scoring(self, audios, score_rate):
        # To be specialised by each concrete score: score one window.
        raise NotImplementedError(f'In {self.name}, windowed_scoring is not yet implemented')

    def scoring(self, data, window=None, score_rate=None):
        """ calling the `windowed_scoring` function that should be specialised
        depending on the score.

        data: dict with 'audio' (list of signals) and 'rate' (their sampling
              rate) — TODO confirm against caller.
        window: window length in seconds, or None to score the full signals.
        """
        audios = data['audio']
        score_rate = data['rate']

        # a score with a fixed rate overrides the input rate
        if self.score_rate is not None:
            score_rate = self.score_rate

        if score_rate != data['rate']:
            # lazy import: resampy is only needed when resampling happens
            import resampy
            for index, audio in enumerate(audios):
                audios[index] = resampy.resample(
                    audio, data['rate'], score_rate, axis=0)

        if window is not None:
            from museval.metrics import Framing
            # BUG FIX: the original referenced an undefined `maxlen` here
            # (NameError on any windowed call); use the longest signal length.
            maxlen = max(len(audio) for audio in audios)
            # non-overlapping windows: hop equals the window length
            framer = Framing(window * score_rate, window * score_rate, maxlen)
            result = {}
            for (t, win) in enumerate(framer):
                result[t] = self.windowed_scoring(
                    [audio[win] for audio in audios], score_rate)
        else:
            result = self.windowed_scoring(audios, score_rate)
        return result
|
112 |
+
|
113 |
+
|
demo.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Demonstration of computing a battery of speech-quality metrics with SpeechScore.
import pprint

from speechscore import SpeechScore

if __name__ == '__main__':
    # Metric battery to evaluate; any subset of this list is supported.
    scorer = SpeechScore([
        'SRMR', 'PESQ', 'NB_PESQ', 'STOI', 'SISDR',
        'FWSEGSNR', 'LSD', 'BSSEval', 'DNSMOS',
        'SNR', 'SSNR', 'LLR', 'CSIG', 'CBAK',
        'COVL', 'MCD'
    ])

    # Score the noisy recordings against the clean references.
    #   test_path / reference_path: audio directories or paths (.wav or .flac)
    #   window=None               : no windowing, process each full audio
    #   score_rate=16000          : sampling rate at which metrics are computed
    #   return_mean=True          : also return the mean score per metric
    scores = scorer(test_path='audios/noisy/', reference_path='audios/clean/',
                    window=None, score_rate=16000, return_mean=True)

    # Full per-file results, then just the means.
    pprint.pprint(scores)
    pprint.pprint(scores['Mean_Score'])
|
requirement.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pysptk
|
2 |
+
pymcd
|
3 |
+
pyworld
|
4 |
+
fastdtw
|
5 |
+
museval
|
scores/__init__.py
ADDED
File without changes
|
scores/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (187 Bytes). View file
|
|
scores/__pycache__/bsseval.cpython-38.pyc
ADDED
Binary file (1.15 kB). View file
|
|
scores/__pycache__/cbak.cpython-38.pyc
ADDED
Binary file (1.48 kB). View file
|
|
scores/__pycache__/covl.cpython-38.pyc
ADDED
Binary file (1.53 kB). View file
|
|
scores/__pycache__/csig.cpython-38.pyc
ADDED
Binary file (1.52 kB). View file
|
|
scores/__pycache__/fwsegsnr.cpython-38.pyc
ADDED
Binary file (2.09 kB). View file
|
|
scores/__pycache__/helper.cpython-38.pyc
ADDED
Binary file (6.64 kB). View file
|
|
scores/__pycache__/llr.cpython-38.pyc
ADDED
Binary file (2.09 kB). View file
|
|
scores/__pycache__/lsd.cpython-38.pyc
ADDED
Binary file (1.5 kB). View file
|
|
scores/__pycache__/mcd.cpython-38.pyc
ADDED
Binary file (4.65 kB). View file
|
|
scores/__pycache__/nb_pesq.cpython-38.pyc
ADDED
Binary file (922 Bytes). View file
|
|
scores/__pycache__/pesq.cpython-38.pyc
ADDED
Binary file (921 Bytes). View file
|
|
scores/__pycache__/sisdr.cpython-38.pyc
ADDED
Binary file (1.2 kB). View file
|
|
scores/__pycache__/snr.cpython-38.pyc
ADDED
Binary file (1.52 kB). View file
|
|
scores/__pycache__/ssnr.cpython-38.pyc
ADDED
Binary file (2.05 kB). View file
|
|
scores/__pycache__/stoi.cpython-38.pyc
ADDED
Binary file (926 Bytes). View file
|
|
scores/bsseval.py
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
from basis import ScoreBasis
|
3 |
+
|
4 |
+
|
5 |
+
class BSSEval(ScoreBasis):
    """BSS Eval source-separation scores (SDR / ISR / SAR) via museval."""

    def __init__(self):
        super(BSSEval, self).__init__(name='BSSEval')
        self.intrusive = False

    def windowed_scoring(self, audios, score_rate):
        from museval.metrics import bss_eval

        if len(audios) != 2:
            raise ValueError('BSSEval needs a reference and a test signals.')

        # infinite window/hop: evaluate over the whole signal in one frame
        full_window = np.inf
        full_hop = np.inf
        # bss_eval expects shape [nsrc, nsample, nchannels]
        scores = bss_eval(reference_sources=audios[1][None, ...],
                          estimated_sources=audios[0][None, ...],
                          window=full_window * score_rate,
                          hop=full_hop * score_rate)
        sdr, isr, _, sar = scores[0], scores[1], scores[2], scores[3]
        return {'SDR': sdr[0][0], 'ISR': isr[0][0], 'SAR': sar[0][0]}
|
scores/cbak.py
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from basis import ScoreBasis
|
2 |
+
import numpy as np
|
3 |
+
from pesq import pesq
|
4 |
+
from scores.helper import wss, llr, SSNR, trim_mos
|
5 |
+
|
6 |
+
class CBAK(ScoreBasis):
    """Composite background-noise quality score (CBAK)."""

    def __init__(self):
        super(CBAK, self).__init__(name='CBAK')
        # CBAK is defined on 16 kHz speech
        self.score_rate = 16000
        self.intrusive = False

    def windowed_scoring(self, audios, score_rate):
        if len(audios) == 2:
            return cal_CBAK(audios[0], audios[1], score_rate)
        raise ValueError('CBAK needs a reference and a test signals.')
|
16 |
+
|
17 |
+
def cal_CBAK(target_wav, pred_wav, fs):
    """Composite CBAK measure from WSS, segmental SNR and wide-band PESQ,
    clamped to the MOS range [1, 5]."""
    alpha = 0.95

    # Weighted spectral slope: average the best alpha fraction of frames
    wss_frames = sorted(wss(target_wav, pred_wav, fs))
    keep = int(round(len(wss_frames) * alpha))
    wss_dist = np.mean(wss_frames[:keep])

    # Segmental SNR (the global SNR is not used here)
    _, segsnr_frames = SSNR(target_wav, pred_wav, fs)
    seg_snr = np.mean(segsnr_frames)

    # Wide-band PESQ
    pesq_raw = pesq(fs, target_wav, pred_wav, 'wb')

    # Linear combination of the three components
    cbak = 1.634 + 0.478 * pesq_raw - 0.007 * wss_dist + 0.063 * seg_snr
    return trim_mos(cbak)
|
37 |
+
|
scores/covl.py
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from basis import ScoreBasis
|
2 |
+
import numpy as np
|
3 |
+
from pesq import pesq
|
4 |
+
from scores.helper import wss, llr, SSNR, trim_mos
|
5 |
+
|
6 |
+
class COVL(ScoreBasis):
    """Composite overall quality score (COVL)."""

    def __init__(self):
        super(COVL, self).__init__(name='COVL')
        # COVL is defined on 16 kHz speech
        self.score_rate = 16000
        self.intrusive = False

    def windowed_scoring(self, audios, score_rate):
        if len(audios) == 2:
            return cal_COVL(audios[0], audios[1], score_rate)
        raise ValueError('COVL needs a reference and a test signals.')
|
16 |
+
|
17 |
+
def cal_COVL(target_wav, pred_wav, fs):
    """Composite COVL measure from WSS, LLR and wide-band PESQ,
    clamped to the MOS range [1, 5]."""
    alpha = 0.95

    # Weighted spectral slope: average the best alpha fraction of frames
    wss_frames = sorted(wss(target_wav, pred_wav, fs))
    wss_dist = np.mean(wss_frames[:int(round(len(wss_frames) * alpha))])

    # Log-likelihood ratio: same trimmed-mean treatment
    llr_frames = sorted(llr(target_wav, pred_wav, fs))
    llr_mean = np.mean(llr_frames[:round(len(llr_frames) * alpha)])

    # Wide-band PESQ
    pesq_raw = pesq(fs, target_wav, pred_wav, 'wb')

    # Linear combination of the three components
    covl = 1.594 + 0.805 * pesq_raw - 0.512 * llr_mean - 0.007 * wss_dist
    return trim_mos(covl)
|
scores/csig.py
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from basis import ScoreBasis
|
2 |
+
import numpy as np
|
3 |
+
from pesq import pesq
|
4 |
+
from scores.helper import wss, llr, SSNR, trim_mos
|
5 |
+
|
6 |
+
class CSIG(ScoreBasis):
    """Composite signal-distortion quality score (CSIG)."""

    def __init__(self):
        super(CSIG, self).__init__(name='CSIG')
        # CSIG is defined on 16 kHz speech
        self.score_rate = 16000

    def windowed_scoring(self, audios, score_rate):
        if len(audios) == 2:
            return cal_CSIG(audios[0], audios[1], score_rate)
        raise ValueError('CSIG needs a reference and a test signals.')
|
15 |
+
|
16 |
+
def cal_CSIG(target_wav, pred_wav, fs):
    """Composite CSIG measure from LLR, WSS and wide-band PESQ,
    clamped to the MOS range [1, 5] and returned as a float."""
    alpha = 0.95

    # Weighted spectral slope: average the best alpha fraction of frames
    wss_frames = sorted(wss(target_wav, pred_wav, fs))
    wss_dist = np.mean(wss_frames[:int(round(len(wss_frames) * alpha))])

    # Log-likelihood ratio: same trimmed-mean treatment
    llr_frames = sorted(llr(target_wav, pred_wav, fs))
    llr_mean = np.mean(llr_frames[:round(len(llr_frames) * alpha)])

    # Wide-band PESQ
    pesq_raw = pesq(fs, target_wav, pred_wav, 'wb')

    # Linear combination of the three components
    csig = 3.093 - 1.029 * llr_mean + 0.603 * pesq_raw - 0.009 * wss_dist
    return float(trim_mos(csig))
|
scores/dnsmos/DNSMOS/bak_ovr.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5f335c90994618150192a656a474bcf8a9cbcedbc47965494ba8da79605d1308
|
3 |
+
size 742375
|
scores/dnsmos/DNSMOS/model_v8.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9246480c58567bc6affd4200938e77eef49468c8bc7ed3776d109c07456f6e91
|
3 |
+
size 224860
|
scores/dnsmos/DNSMOS/sig.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d2fbdb293bc2366dfbae2b7477c490f981d24a8b4405efd3c11787569c6549d7
|
3 |
+
size 742203
|
scores/dnsmos/DNSMOS/sig_bak_ovr.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:269fbebdb513aa23cddfbb593542ecc540284a91849ac50516870e1ac78f6edd
|
3 |
+
size 1157965
|
scores/dnsmos/__pycache__/dnsmos.cpython-38.pyc
ADDED
Binary file (3.63 kB). View file
|
|
scores/dnsmos/dnsmos.py
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
import librosa
|
4 |
+
import numpy as np
|
5 |
+
import numpy.polynomial.polynomial as poly
|
6 |
+
import onnxruntime as ort
|
7 |
+
import soundfile as sf
|
8 |
+
|
9 |
+
SAMPLING_RATE = 16000
|
10 |
+
INPUT_LENGTH = 9.01
|
11 |
+
|
12 |
+
from basis import ScoreBasis
|
13 |
+
|
14 |
+
|
15 |
+
class DNSMOS(ScoreBasis):
    """DNSMOS perceptual quality score computed with pretrained ONNX models."""

    def __init__(self):
        super(DNSMOS, self).__init__(name='DNSMOS')
        # NOTE(review): flag is set True here although DNSMOS scores a single
        # signal — confirm the intended semantics of `intrusive` upstream.
        self.intrusive = True
        self.score_rate = 16000
        model_dir = 'scores/dnsmos/DNSMOS'
        self.p808_model_path = os.path.join(model_dir, 'model_v8.onnx')
        self.primary_model_path = os.path.join(model_dir, 'sig_bak_ovr.onnx')
        self.compute_score = ComputeScore(self.primary_model_path, self.p808_model_path)

    def windowed_scoring(self, audios, rate):
        # with a (test, reference)-style pair, score the second entry;
        # otherwise score the only signal provided
        target = audios[1] if len(audios) == 2 else audios[0]
        return self.compute_score.cal_mos(target, rate)
|
29 |
+
|
30 |
+
class ComputeScore:
    """Run the DNSMOS ONNX models over an audio signal and average the
    per-segment predictions."""

    def __init__(self, primary_model_path, p808_model_path) -> None:
        # primary model: predicts raw SIG/BAK/OVR; p808 model: P.808 MOS
        self.onnx_sess = ort.InferenceSession(primary_model_path)
        self.p808_onnx_sess = ort.InferenceSession(p808_model_path)

    def audio_melspec(self, audio, n_mels=120, frame_size=320, hop_length=160, sr=16000, to_db=True):
        """Mel spectrogram feature for the P.808 model, transposed to
        (frames, mel bands); optionally converted to a normalized dB scale."""
        mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=frame_size+1, hop_length=hop_length, n_mels=n_mels)
        if to_db:
            # map dB values into roughly [0, 1] (shift by 40 dB, scale by 40)
            mel_spec = (librosa.power_to_db(mel_spec, ref=np.max)+40)/40
        return mel_spec.T

    def get_polyfit_val(self, sig, bak, ovr):
        """Map raw model outputs to calibrated MOS values with fixed
        second-order polynomials (coefficients from the DNSMOS release)."""
        p_ovr = np.poly1d([-0.06766283, 1.11546468, 0.04602535])
        p_sig = np.poly1d([-0.08397278, 1.22083953, 0.0052439 ])
        p_bak = np.poly1d([-0.13166888, 1.60915514, -0.39604546])

        sig_poly = p_sig(sig)
        bak_poly = p_bak(bak)
        ovr_poly = p_ovr(ovr)

        return sig_poly, bak_poly, ovr_poly

    def cal_mos(self, audio, sampling_rate):
        """Score *audio* by sliding an INPUT_LENGTH-second window one second
        at a time and averaging the per-segment predictions.

        Returns a dict with keys 'OVRL', 'SIG', 'BAK' and 'P808_MOS'.
        """
        fs = sampling_rate
        # NOTE: kept for parity with the original; currently unused
        actual_audio_len = len(audio)
        len_samples = int(INPUT_LENGTH*fs)
        # tile the signal until it is at least one full input window long
        while len(audio) < len_samples:
            audio = np.append(audio, audio)

        # number of 1-second hops that still leave a full window
        num_hops = int(np.floor(len(audio)/fs) - INPUT_LENGTH)+1
        hop_len_samples = fs
        predicted_mos_sig_seg_raw = []
        predicted_mos_bak_seg_raw = []
        predicted_mos_ovr_seg_raw = []
        predicted_mos_sig_seg = []
        predicted_mos_bak_seg = []
        predicted_mos_ovr_seg = []
        predicted_p808_mos = []

        for idx in range(num_hops):
            audio_seg = audio[int(idx*hop_len_samples) : int((idx+INPUT_LENGTH)*hop_len_samples)]
            # skip a trailing partial window
            if len(audio_seg) < len_samples:
                continue

            # primary model takes the raw waveform; p808 takes mel features
            # (last 160 samples dropped — presumably to align frame counts;
            # TODO confirm)
            input_features = np.array(audio_seg).astype('float32')[np.newaxis,:]
            p808_input_features = np.array(self.audio_melspec(audio=audio_seg[:-160])).astype('float32')[np.newaxis, :, :]
            oi = {'input_1': input_features}
            p808_oi = {'input_1': p808_input_features}
            p808_mos = self.p808_onnx_sess.run(None, p808_oi)[0][0][0]
            mos_sig_raw,mos_bak_raw,mos_ovr_raw = self.onnx_sess.run(None, oi)[0][0]
            mos_sig,mos_bak,mos_ovr = self.get_polyfit_val(mos_sig_raw,mos_bak_raw,mos_ovr_raw)
            predicted_mos_sig_seg_raw.append(mos_sig_raw)
            predicted_mos_bak_seg_raw.append(mos_bak_raw)
            predicted_mos_ovr_seg_raw.append(mos_ovr_raw)
            predicted_mos_sig_seg.append(mos_sig)
            predicted_mos_bak_seg.append(mos_bak)
            predicted_mos_ovr_seg.append(mos_ovr)
            predicted_p808_mos.append(p808_mos)

        # average the calibrated per-segment scores
        results = {}
        results['OVRL'] = np.mean(predicted_mos_ovr_seg)
        results['SIG'] = np.mean(predicted_mos_sig_seg)
        results['BAK'] = np.mean(predicted_mos_bak_seg)
        results['P808_MOS'] = np.mean(predicted_p808_mos)
        return results
|
scores/fwsegsnr.py
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import librosa
|
2 |
+
import numpy as np
|
3 |
+
from basis import ScoreBasis
|
4 |
+
|
5 |
+
class FWSEGSNR(ScoreBasis):
    """Frequency-weighted segmental SNR score."""

    def __init__(self):
        super(FWSEGSNR, self).__init__(name='FWSEGSNR')
        self.intrusive = False

    def windowed_scoring(self, audios, score_rate):
        if len(audios) == 2:
            return fwsegsnr(audios[1], audios[0], score_rate)
        raise ValueError('FWSEGSNR needs a reference and a test signals.')
|
14 |
+
|
15 |
+
def fwsegsnr(x, y, fs, frame_sz=0.025, shift_sz=0.01, win='hann', numband=23):
    """Frequency-weighted segmental SNR between two signals.

    x, y: 1-D signals of equal length (one is the reference and the other
          the degraded signal — TODO confirm ordering against the caller).
    fs: sampling rate; frame_sz/shift_sz: analysis frame and hop in seconds;
    win: STFT window name; numband: number of mel bands.

    Returns the mean per-frame SNR in dB, each frame clipped to [-10, 35].
    Raises ValueError if the signals have different lengths.
    """
    epsilon = np.finfo(np.float32).eps
    frame = int(np.fix(frame_sz * fs))
    shift = int(np.fix(shift_sz * fs))
    # FFT size: next power of two above the frame length
    fftpt = int(2 ** np.ceil(np.log2(np.abs(frame))))

    # normalize both signals to unit energy
    x = x / np.sqrt(sum(np.power(x, 2)))
    y = y / np.sqrt(sum(np.power(y, 2)))

    if len(x) != len(y):
        # BUG FIX: the original used `assert cond, print(...)`, which makes
        # the assertion message None (print returns None) and is stripped
        # entirely under `python -O`; raise an explicit error instead.
        raise ValueError('Wav length are not matched!')

    X_stft = np.abs(librosa.stft(x, n_fft=fftpt, hop_length=shift, win_length=frame, window=win, center=False))
    Y_stft = np.abs(librosa.stft(y, n_fft=fftpt, hop_length=shift, win_length=frame, window=win, center=False))

    # group STFT bins into mel bands
    X_mel = librosa.feature.melspectrogram(S=X_stft, sr=fs, n_mels=numband, fmin=0, fmax=fs / 2)
    Y_mel = librosa.feature.melspectrogram(S=Y_stft, sr=fs, n_mels=numband, fmin=0, fmax=fs / 2)

    # per-band weights and error spectrum; guard exact zeros with epsilon
    # (unused intermediates of the original — noverlap, num_freq, num_frame,
    # E_power — removed)
    W = np.power(Y_mel, 0.2)
    E = X_mel - Y_mel
    E[E == 0.0] = epsilon
    Y_div_E = np.divide(np.power(Y_mel, 2), np.power(E, 2))
    Y_div_E[Y_div_E == 0] = epsilon
    # weighted per-frame SNR in dB, clipped to [-10, 35] as in the original
    ds = 10 * np.divide(np.sum(np.multiply(W, np.log10(Y_div_E)), 1), np.sum(W, 1))
    ds[ds > 35] = 35
    ds[ds < -10] = -10
    return np.mean(ds)
|
49 |
+
|
scores/helper.py
ADDED
@@ -0,0 +1,307 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Modifications in Metrics
|
3 |
+
|
4 |
+
# Original copyright:
|
5 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
6 |
+
# Demucs (https://github.com/facebookresearch/denoiser) / author: adefossez
|
7 |
+
"""
|
8 |
+
import numpy as np
|
9 |
+
from scipy.linalg import toeplitz
|
10 |
+
|
11 |
+
# ----------------------------- HELPERS ------------------------------------ #
|
12 |
+
def trim_mos(val):
    """Clamp a MOS-style score into the valid [1, 5] range."""
    if val < 1:
        return 1
    if val > 5:
        return 5
    return val
|
14 |
+
|
15 |
+
def lpcoeff(speech_frame, model_order):
    """LPC analysis of one windowed speech frame via Levinson-Durbin.

    Args:
        speech_frame (np.ndarray): 1D windowed frame of samples.
        model_order (int): LPC analysis order P.

    Returns:
        tuple: (acorr, refcoeff, lpparams) where
            acorr (np.ndarray): autocorrelation lags R[0..P], float32.
            refcoeff (np.ndarray): reflection (PARCOR) coefficients, float32.
            lpparams (np.ndarray): LPC polynomial [1, -a_1, ..., -a_P], float32.
    """
    # (1) Compute Autocor lags
    winlength = speech_frame.shape[0]
    R = []
    for k in range(model_order + 1):
        first = speech_frame[:(winlength - k)]
        second = speech_frame[k:winlength]
        R.append(np.sum(first * second))

    # (2) Lev-Durbin recursion: iteratively solves the Toeplitz normal
    # equations, updating predictor coefficients `a` and error energy `E`.
    a = np.ones((model_order,))
    E = np.zeros((model_order + 1,))
    rcoeff = np.zeros((model_order,))
    E[0] = R[0]
    for i in range(model_order):
        if i == 0:
            sum_term = 0
        else:
            a_past = a[:i]
            sum_term = np.sum(a_past * np.array(R[i:0:-1]))
        rcoeff[i] = (R[i+1] - sum_term)/E[i]
        a[i] = rcoeff[i]
        if i > 0:
            # Order-update of the previous coefficients. a_past is a view
            # into `a`, but numpy evaluates the RHS fully before assigning.
            a[:i] = a_past[:i] - rcoeff[i] * a_past[::-1]
        E[i+1] = (1-rcoeff[i]*rcoeff[i])*E[i]
    acorr = np.array(R, dtype=np.float32)
    refcoeff = np.array(rcoeff, dtype=np.float32)
    # Negate so the returned polynomial has the conventional [1, -a...] form.
    a = a * -1
    lpparams = np.array([1] + list(a), dtype=np.float32)
    # NOTE(review): the three conversions below are redundant (the arrays
    # are already float32); kept unchanged for byte-identical behavior.
    acorr = np.array(acorr, dtype=np.float32)
    refcoeff = np.array(refcoeff, dtype=np.float32)
    lpparams = np.array(lpparams, dtype=np.float32)

    return acorr, refcoeff, lpparams
|
49 |
+
# -------------------------------------------------------------------------- #
|
50 |
+
|
51 |
+
|
52 |
+
def SSNR(ref_wav, deg_wav, srate=16000, eps=1e-10):
    """ Segmental Signal-to-Noise Ratio Objective Speech Quality Measure
    This function implements the segmental signal-to-noise ratio
    as defined in [1, p. 45] (see Equation 2.12).

    Args:
        ref_wav (np.ndarray): clean/reference waveform.
        deg_wav (np.ndarray): degraded/processed waveform, same length.
        srate (int): sampling rate in Hz (default 16000).
        eps (float): numerical floor avoiding division by zero / log(0).

    Returns:
        tuple: (overall_snr, segmental_snr) where overall_snr is a float and
        segmental_snr is a list of per-frame SNRs clipped to [-10, 35] dB.
    """
    # Work on copies: the previous implementation removed the DC offset and
    # rescaled *in place*, silently mutating the caller's arrays.
    clean_speech = ref_wav.copy()
    processed_speech = deg_wav.copy()
    clean_length = clean_speech.shape[0]
    processed_length = processed_speech.shape[0]

    # scale both to have same dynamic range. Remove DC too.
    clean_speech -= clean_speech.mean()
    processed_speech -= processed_speech.mean()
    processed_speech *= (np.max(np.abs(clean_speech)) / np.max(np.abs(processed_speech)))

    # Signal-to-Noise Ratio on the normalized signals (numerically identical
    # to the original, which computed this after mutating ref_wav/deg_wav).
    dif = clean_speech - processed_speech
    overall_snr = 10 * np.log10(np.sum(clean_speech ** 2) / (np.sum(dif ** 2) +
                                                             10e-20))
    # global variables
    winlength = int(np.round(30 * srate / 1000))  # 30 msecs
    skiprate = winlength // 4
    MIN_SNR = -10
    MAX_SNR = 35

    # For each frame, calculate SSNR
    num_frames = int(clean_length / skiprate - (winlength / skiprate))
    start = 0
    time = np.linspace(1, winlength, winlength) / (winlength + 1)
    window = 0.5 * (1 - np.cos(2 * np.pi * time))
    segmental_snr = []

    for frame_count in range(int(num_frames)):
        # (1) get the frames for the test and ref speech.
        # Apply Hanning Window
        clean_frame = clean_speech[start:start + winlength] * window
        processed_frame = processed_speech[start:start + winlength] * window

        # (2) Compute Segmental SNR, clipped into [MIN_SNR, MAX_SNR]
        signal_energy = np.sum(clean_frame ** 2)
        noise_energy = np.sum((clean_frame - processed_frame) ** 2)
        frame_snr = 10 * np.log10(signal_energy / (noise_energy + eps) + eps)
        segmental_snr.append(min(max(frame_snr, MIN_SNR), MAX_SNR))
        start += int(skiprate)
    return overall_snr, segmental_snr
|
100 |
+
|
101 |
+
|
102 |
+
def wss(ref_wav, deg_wav, srate):
    """Weighted Spectral Slope (WSS) distance measure.

    Frame-by-frame comparison of critical-band spectral slopes between a
    reference and a degraded signal, weighting slope differences near
    spectral peaks more heavily (Klatt, ICASSP 1982).

    Args:
        ref_wav (np.ndarray): clean/reference waveform.
        deg_wav (np.ndarray): degraded/processed waveform (same length).
        srate (int): sampling rate in Hz.

    Returns:
        list: per-frame WSS distortion values (lower is better).
    """
    clean_speech = ref_wav
    processed_speech = deg_wav
    clean_length = ref_wav.shape[0]
    processed_length = deg_wav.shape[0]

    assert clean_length == processed_length, clean_length

    winlength = round(30 * srate / 1000.) # 240 wlen in samples
    skiprate = np.floor(winlength / 4)
    max_freq = srate / 2
    num_crit = 25 # num of critical bands

    USE_FFT_SPECTRUM = 1
    # FFT size: next power of two >= 2 * winlength
    n_fft = int(2 ** np.ceil(np.log(2*winlength)/np.log(2)))
    n_fftby2 = int(n_fft / 2)
    Kmax = 20
    Klocmax = 1

    # Critical band filter definitions (Center frequency and BW in Hz)
    cent_freq = [50., 120, 190, 260, 330, 400, 470, 540, 617.372,
                 703.378, 798.717, 904.128, 1020.38, 1148.30,
                 1288.72, 1442.54, 1610.70, 1794.16, 1993.93,
                 2211.08, 2446.71, 2701.97, 2978.04, 3276.17,
                 3597.63]
    bandwidth = [70., 70, 70, 70, 70, 70, 70, 77.3724, 86.0056,
                 95.3398, 105.411, 116.256, 127.914, 140.423,
                 153.823, 168.154, 183.457, 199.776, 217.153,
                 235.631, 255.255, 276.072, 298.126, 321.465,
                 346.136]

    bw_min = bandwidth[0] # min critical bandwidth

    # set up critical band filters. Note here that Gaussianly shaped filters
    # are used. Also, the sum of the filter weights are equivalent for each
    # critical band filter. Filter less than -30 dB and set to zero.
    min_factor = np.exp(-30. / (2 * 2.303)) # -30 dB point of filter

    crit_filter = np.zeros((num_crit, n_fftby2))
    all_f0 = []
    for i in range(num_crit):
        f0 = (cent_freq[i] / max_freq) * (n_fftby2)
        all_f0.append(np.floor(f0))
        bw = (bandwidth[i] / max_freq) * (n_fftby2)
        norm_factor = np.log(bw_min) - np.log(bandwidth[i])
        j = list(range(n_fftby2))
        crit_filter[i, :] = np.exp(-11 * (((j - np.floor(f0)) / bw) ** 2) + \
                                   norm_factor)
        crit_filter[i, :] = crit_filter[i, :] * (crit_filter[i, :] > \
                                                 min_factor)

    # For each frame of input speech, compute Weighted Spectral Slope Measure
    num_frames = int(clean_length / skiprate - (winlength / skiprate))
    start = 0 # starting sample
    time = np.linspace(1, winlength, winlength) / (winlength + 1)
    window = 0.5 * (1 - np.cos(2 * np.pi * time))
    distortion = []

    for frame_count in range(num_frames):
        # (1) Get the Frames for the test and reference speech.
        # Multiply by Hanning window.
        clean_frame = clean_speech[start:start+winlength]
        processed_frame = processed_speech[start:start+winlength]
        clean_frame = clean_frame * window
        processed_frame = processed_frame * window

        # (2) Compute Power Spectrum of clean and processed
        clean_spec = (np.abs(np.fft.fft(clean_frame, n_fft)) ** 2)
        processed_spec = (np.abs(np.fft.fft(processed_frame, n_fft)) ** 2)
        clean_energy = [None] * num_crit
        processed_energy = [None] * num_crit

        # (3) Compute Filterbank output energies (in dB)
        for i in range(num_crit):
            clean_energy[i] = np.sum(clean_spec[:n_fftby2] * \
                                     crit_filter[i, :])
            processed_energy[i] = np.sum(processed_spec[:n_fftby2] * \
                                         crit_filter[i, :])
        # Floor band energies at 1e-10 before taking the log.
        clean_energy = np.array(clean_energy).reshape(-1, 1)
        eps = np.ones((clean_energy.shape[0], 1)) * 1e-10
        clean_energy = np.concatenate((clean_energy, eps), axis=1)
        clean_energy = 10 * np.log10(np.max(clean_energy, axis=1))
        processed_energy = np.array(processed_energy).reshape(-1, 1)
        processed_energy = np.concatenate((processed_energy, eps), axis=1)
        processed_energy = 10 * np.log10(np.max(processed_energy, axis=1))

        # (4) Compute Spectral Shape (dB[i+1] - dB[i])
        clean_slope = clean_energy[1:num_crit] - clean_energy[:num_crit-1]
        processed_slope = processed_energy[1:num_crit] - \
                processed_energy[:num_crit-1]

        # (5) Find the nearest peak locations in the spectra to each
        # critical band. If the slope is negative, we search
        # to the left. If positive, we search to the right.
        clean_loc_peak = []
        processed_loc_peak = []
        for i in range(num_crit - 1):
            if clean_slope[i] > 0:
                # search to the right
                n = i
                while n < num_crit - 1 and clean_slope[n] > 0:
                    n += 1
                clean_loc_peak.append(clean_energy[n - 1])
            else:
                # search to the left
                n = i
                while n >= 0 and clean_slope[n] <= 0:
                    n -= 1
                clean_loc_peak.append(clean_energy[n + 1])
            # find the peaks in the processed speech signal
            if processed_slope[i] > 0:
                n = i
                while n < num_crit - 1 and processed_slope[n] > 0:
                    n += 1
                processed_loc_peak.append(processed_energy[n - 1])
            else:
                n = i
                while n >= 0 and processed_slope[n] <= 0:
                    n -= 1
                processed_loc_peak.append(processed_energy[n + 1])

        # (6) Compute the WSS Measure for this frame. This includes
        # determination of the weighting function
        dBMax_clean = max(clean_energy)
        dBMax_processed = max(processed_energy)

        # The weights are calculated by averaging individual
        # weighting factors from the clean and processed frame.
        # These weights W_clean and W_processed should range
        # from 0 to 1 and place more emphasis on spectral
        # peaks and less emphasis on slope differences in spectral
        # valleys. This procedure is described on page 1280 of
        # Klatt's 1982 ICASSP paper.
        clean_loc_peak = np.array(clean_loc_peak)
        processed_loc_peak = np.array(processed_loc_peak)
        Wmax_clean = Kmax / (Kmax + dBMax_clean - clean_energy[:num_crit-1])
        Wlocmax_clean = Klocmax / (Klocmax + clean_loc_peak - \
                                   clean_energy[:num_crit-1])
        W_clean = Wmax_clean * Wlocmax_clean
        Wmax_processed = Kmax / (Kmax + dBMax_processed - \
                                 processed_energy[:num_crit-1])
        Wlocmax_processed = Klocmax / (Klocmax + processed_loc_peak - \
                                       processed_energy[:num_crit-1])
        W_processed = Wmax_processed * Wlocmax_processed
        W = (W_clean + W_processed) / 2
        distortion.append(np.sum(W * (clean_slope[:num_crit - 1] - \
                processed_slope[:num_crit - 1]) ** 2))

        # this normalization is not part of Klatt's paper, but helps
        # to normalize the measure. Here we scale the measure by the sum of the
        # weights
        distortion[frame_count] = distortion[frame_count] / np.sum(W)
        start += int(skiprate)
    return distortion
|
256 |
+
|
257 |
+
|
258 |
+
def llr(ref_wav, deg_wav, srate):
    """Log-Likelihood Ratio (LLR) objective speech-quality measure.

    For each 30 ms Hanning-windowed frame, fits LPC models to the reference
    and degraded signals and compares the prediction residual energy of the
    degraded model against the reference autocorrelation matrix.

    Args:
        ref_wav (np.ndarray): clean/reference waveform.
        deg_wav (np.ndarray): degraded/processed waveform (same length).
        srate (int): sampling rate in Hz.

    Returns:
        np.ndarray: per-frame LLR values (NaN/inf replaced via nan_to_num).
    """
    clean_speech = ref_wav
    processed_speech = deg_wav
    clean_length = ref_wav.shape[0]
    processed_length = deg_wav.shape[0]
    assert clean_length == processed_length, clean_length

    winlength = round(30 * srate / 1000.) # 240 wlen in samples
    skiprate = np.floor(winlength / 4)
    if srate < 10000:
        # LPC analysis order
        P = 10
    else:
        P = 16

    # For each frame of input speech, calculate the Log Likelihood Ratio
    num_frames = int(clean_length / skiprate - (winlength / skiprate))
    start = 0
    time = np.linspace(1, winlength, winlength) / (winlength + 1)
    window = 0.5 * (1 - np.cos(2 * np.pi * time))
    distortion = []

    for frame_count in range(num_frames):
        # (1) Get the Frames for the test and reference speech.
        # Multiply by Hanning window.
        clean_frame = clean_speech[start:start+winlength]
        processed_frame = processed_speech[start:start+winlength]
        clean_frame = clean_frame * window
        processed_frame = processed_frame * window

        # (2) Get the autocorrelation lags and LPC params used
        # to compute the LLR measure
        R_clean, Ref_clean, A_clean = lpcoeff(clean_frame, P)
        R_processed, Ref_processed, A_processed = lpcoeff(processed_frame, P)
        A_clean = A_clean[None, :]
        A_processed = A_processed[None, :]

        # (3) Compute the LLR measure: a_p' R_c a_p / a_c' R_c a_c
        numerator = A_processed.dot(toeplitz(R_clean)).dot(A_processed.T)
        denominator = A_clean.dot(toeplitz(R_clean)).dot(A_clean.T)

        # A non-positive ratio would make the log undefined; report it.
        if (numerator/denominator) <= 0:
            print(f'Numerator: {numerator}')
            print(f'Denominator: {denominator}')

        log_ = np.log(numerator / denominator)
        distortion.append(np.squeeze(log_))
        start += int(skiprate)
    return np.nan_to_num(np.array(distortion))
|
307 |
+
# -------------------------------------------------------------------------- #
|
scores/helper_bk.py
ADDED
@@ -0,0 +1,438 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Modifications in Metrics
|
3 |
+
|
4 |
+
# Original copyright:
|
5 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
6 |
+
# Demucs (https://github.com/facebookresearch/denoiser) / author: adefossez
|
7 |
+
"""
|
8 |
+
import numpy as np
|
9 |
+
from scipy.linalg import toeplitz
|
10 |
+
|
11 |
+
# ----------------------------- HELPERS ------------------------------------ #
|
12 |
+
def trim_mos(val):
    """Clamp a MOS-style score into the valid [1, 5] range."""
    if val < 1:
        return 1
    if val > 5:
        return 5
    return val
|
14 |
+
|
15 |
+
def lpcoeff(speech_frame, model_order):
    """LPC analysis of one windowed speech frame via Levinson-Durbin.

    Args:
        speech_frame (np.ndarray): 1D windowed frame of samples.
        model_order (int): LPC analysis order P.

    Returns:
        tuple: (acorr, refcoeff, lpparams) where
            acorr (np.ndarray): autocorrelation lags R[0..P], float32.
            refcoeff (np.ndarray): reflection (PARCOR) coefficients, float32.
            lpparams (np.ndarray): LPC polynomial [1, -a_1, ..., -a_P], float32.
    """
    # (1) Compute Autocor lags
    winlength = speech_frame.shape[0]
    R = []
    for k in range(model_order + 1):
        first = speech_frame[:(winlength - k)]
        second = speech_frame[k:winlength]
        R.append(np.sum(first * second))

    # (2) Lev-Durbin recursion: iteratively solves the Toeplitz normal
    # equations, updating predictor coefficients `a` and error energy `E`.
    a = np.ones((model_order,))
    E = np.zeros((model_order + 1,))
    rcoeff = np.zeros((model_order,))
    E[0] = R[0]
    for i in range(model_order):
        if i == 0:
            sum_term = 0
        else:
            a_past = a[:i]
            sum_term = np.sum(a_past * np.array(R[i:0:-1]))
        rcoeff[i] = (R[i+1] - sum_term)/E[i]
        a[i] = rcoeff[i]
        if i > 0:
            # Order-update of the previous coefficients. a_past is a view
            # into `a`, but numpy evaluates the RHS fully before assigning.
            a[:i] = a_past[:i] - rcoeff[i] * a_past[::-1]
        E[i+1] = (1-rcoeff[i]*rcoeff[i])*E[i]
    acorr = np.array(R, dtype=np.float32)
    refcoeff = np.array(rcoeff, dtype=np.float32)
    # Negate so the returned polynomial has the conventional [1, -a...] form.
    a = a * -1
    lpparams = np.array([1] + list(a), dtype=np.float32)
    # NOTE(review): the three conversions below are redundant (the arrays
    # are already float32); kept unchanged for byte-identical behavior.
    acorr = np.array(acorr, dtype=np.float32)
    refcoeff = np.array(refcoeff, dtype=np.float32)
    lpparams = np.array(lpparams, dtype=np.float32)

    return acorr, refcoeff, lpparams
|
49 |
+
# -------------------------------------------------------------------------- #
|
50 |
+
|
51 |
+
|
52 |
+
def SSNR(ref_wav, deg_wav, srate=16000, eps=1e-10):
    """ Segmental Signal-to-Noise Ratio Objective Speech Quality Measure
    This function implements the segmental signal-to-noise ratio
    as defined in [1, p. 45] (see Equation 2.12).

    Args:
        ref_wav (np.ndarray): clean/reference waveform.
        deg_wav (np.ndarray): degraded/processed waveform, same length.
        srate (int): sampling rate in Hz (default 16000).
        eps (float): numerical floor avoiding division by zero / log(0).

    Returns:
        tuple: (overall_snr, segmental_snr) where overall_snr is a float and
        segmental_snr is a list of per-frame SNRs clipped to [-10, 35] dB.
    """
    # Work on copies: the previous implementation removed the DC offset and
    # rescaled *in place*, silently mutating the caller's arrays.
    clean_speech = ref_wav.copy()
    processed_speech = deg_wav.copy()
    clean_length = clean_speech.shape[0]
    processed_length = processed_speech.shape[0]

    # scale both to have same dynamic range. Remove DC too.
    clean_speech -= clean_speech.mean()
    processed_speech -= processed_speech.mean()
    processed_speech *= (np.max(np.abs(clean_speech)) / np.max(np.abs(processed_speech)))

    # Signal-to-Noise Ratio on the normalized signals (numerically identical
    # to the original, which computed this after mutating ref_wav/deg_wav).
    dif = clean_speech - processed_speech
    overall_snr = 10 * np.log10(np.sum(clean_speech ** 2) / (np.sum(dif ** 2) +
                                                             10e-20))
    # global variables
    winlength = int(np.round(30 * srate / 1000))  # 30 msecs
    skiprate = winlength // 4
    MIN_SNR = -10
    MAX_SNR = 35

    # For each frame, calculate SSNR
    num_frames = int(clean_length / skiprate - (winlength / skiprate))
    start = 0
    time = np.linspace(1, winlength, winlength) / (winlength + 1)
    window = 0.5 * (1 - np.cos(2 * np.pi * time))
    segmental_snr = []

    for frame_count in range(int(num_frames)):
        # (1) get the frames for the test and ref speech.
        # Apply Hanning Window
        clean_frame = clean_speech[start:start + winlength] * window
        processed_frame = processed_speech[start:start + winlength] * window

        # (2) Compute Segmental SNR, clipped into [MIN_SNR, MAX_SNR]
        signal_energy = np.sum(clean_frame ** 2)
        noise_energy = np.sum((clean_frame - processed_frame) ** 2)
        frame_snr = 10 * np.log10(signal_energy / (noise_energy + eps) + eps)
        segmental_snr.append(min(max(frame_snr, MIN_SNR), MAX_SNR))
        start += int(skiprate)
    return overall_snr, segmental_snr
|
100 |
+
|
101 |
+
|
102 |
+
def wss(ref_wav, deg_wav, srate):
    """Weighted Spectral Slope (WSS) distance measure.

    Frame-by-frame comparison of critical-band spectral slopes between a
    reference and a degraded signal, weighting slope differences near
    spectral peaks more heavily (Klatt, ICASSP 1982).

    Args:
        ref_wav (np.ndarray): clean/reference waveform.
        deg_wav (np.ndarray): degraded/processed waveform (same length).
        srate (int): sampling rate in Hz.

    Returns:
        list: per-frame WSS distortion values (lower is better).
    """
    clean_speech = ref_wav
    processed_speech = deg_wav
    clean_length = ref_wav.shape[0]
    processed_length = deg_wav.shape[0]

    assert clean_length == processed_length, clean_length

    winlength = round(30 * srate / 1000.) # 240 wlen in samples
    skiprate = np.floor(winlength / 4)
    max_freq = srate / 2
    num_crit = 25 # num of critical bands

    USE_FFT_SPECTRUM = 1
    # FFT size: next power of two >= 2 * winlength
    n_fft = int(2 ** np.ceil(np.log(2*winlength)/np.log(2)))
    n_fftby2 = int(n_fft / 2)
    Kmax = 20
    Klocmax = 1

    # Critical band filter definitions (Center frequency and BW in Hz)
    cent_freq = [50., 120, 190, 260, 330, 400, 470, 540, 617.372,
                 703.378, 798.717, 904.128, 1020.38, 1148.30,
                 1288.72, 1442.54, 1610.70, 1794.16, 1993.93,
                 2211.08, 2446.71, 2701.97, 2978.04, 3276.17,
                 3597.63]
    bandwidth = [70., 70, 70, 70, 70, 70, 70, 77.3724, 86.0056,
                 95.3398, 105.411, 116.256, 127.914, 140.423,
                 153.823, 168.154, 183.457, 199.776, 217.153,
                 235.631, 255.255, 276.072, 298.126, 321.465,
                 346.136]

    bw_min = bandwidth[0] # min critical bandwidth

    # set up critical band filters. Note here that Gaussianly shaped filters
    # are used. Also, the sum of the filter weights are equivalent for each
    # critical band filter. Filter less than -30 dB and set to zero.
    min_factor = np.exp(-30. / (2 * 2.303)) # -30 dB point of filter

    crit_filter = np.zeros((num_crit, n_fftby2))
    all_f0 = []
    for i in range(num_crit):
        f0 = (cent_freq[i] / max_freq) * (n_fftby2)
        all_f0.append(np.floor(f0))
        bw = (bandwidth[i] / max_freq) * (n_fftby2)
        norm_factor = np.log(bw_min) - np.log(bandwidth[i])
        j = list(range(n_fftby2))
        crit_filter[i, :] = np.exp(-11 * (((j - np.floor(f0)) / bw) ** 2) + \
                                   norm_factor)
        crit_filter[i, :] = crit_filter[i, :] * (crit_filter[i, :] > \
                                                 min_factor)

    # For each frame of input speech, compute Weighted Spectral Slope Measure
    num_frames = int(clean_length / skiprate - (winlength / skiprate))
    start = 0 # starting sample
    time = np.linspace(1, winlength, winlength) / (winlength + 1)
    window = 0.5 * (1 - np.cos(2 * np.pi * time))
    distortion = []

    for frame_count in range(num_frames):
        # (1) Get the Frames for the test and reference speech.
        # Multiply by Hanning window.
        clean_frame = clean_speech[start:start+winlength]
        processed_frame = processed_speech[start:start+winlength]
        clean_frame = clean_frame * window
        processed_frame = processed_frame * window

        # (2) Compute Power Spectrum of clean and processed
        clean_spec = (np.abs(np.fft.fft(clean_frame, n_fft)) ** 2)
        processed_spec = (np.abs(np.fft.fft(processed_frame, n_fft)) ** 2)
        clean_energy = [None] * num_crit
        processed_energy = [None] * num_crit

        # (3) Compute Filterbank output energies (in dB)
        for i in range(num_crit):
            clean_energy[i] = np.sum(clean_spec[:n_fftby2] * \
                                     crit_filter[i, :])
            processed_energy[i] = np.sum(processed_spec[:n_fftby2] * \
                                         crit_filter[i, :])
        # Floor band energies at 1e-10 before taking the log.
        clean_energy = np.array(clean_energy).reshape(-1, 1)
        eps = np.ones((clean_energy.shape[0], 1)) * 1e-10
        clean_energy = np.concatenate((clean_energy, eps), axis=1)
        clean_energy = 10 * np.log10(np.max(clean_energy, axis=1))
        processed_energy = np.array(processed_energy).reshape(-1, 1)
        processed_energy = np.concatenate((processed_energy, eps), axis=1)
        processed_energy = 10 * np.log10(np.max(processed_energy, axis=1))

        # (4) Compute Spectral Shape (dB[i+1] - dB[i])
        clean_slope = clean_energy[1:num_crit] - clean_energy[:num_crit-1]
        processed_slope = processed_energy[1:num_crit] - \
                processed_energy[:num_crit-1]

        # (5) Find the nearest peak locations in the spectra to each
        # critical band. If the slope is negative, we search
        # to the left. If positive, we search to the right.
        clean_loc_peak = []
        processed_loc_peak = []
        for i in range(num_crit - 1):
            if clean_slope[i] > 0:
                # search to the right
                n = i
                while n < num_crit - 1 and clean_slope[n] > 0:
                    n += 1
                clean_loc_peak.append(clean_energy[n - 1])
            else:
                # search to the left
                n = i
                while n >= 0 and clean_slope[n] <= 0:
                    n -= 1
                clean_loc_peak.append(clean_energy[n + 1])
            # find the peaks in the processed speech signal
            if processed_slope[i] > 0:
                n = i
                while n < num_crit - 1 and processed_slope[n] > 0:
                    n += 1
                processed_loc_peak.append(processed_energy[n - 1])
            else:
                n = i
                while n >= 0 and processed_slope[n] <= 0:
                    n -= 1
                processed_loc_peak.append(processed_energy[n + 1])

        # (6) Compute the WSS Measure for this frame. This includes
        # determination of the weighting function
        dBMax_clean = max(clean_energy)
        dBMax_processed = max(processed_energy)

        # The weights are calculated by averaging individual
        # weighting factors from the clean and processed frame.
        # These weights W_clean and W_processed should range
        # from 0 to 1 and place more emphasis on spectral
        # peaks and less emphasis on slope differences in spectral
        # valleys. This procedure is described on page 1280 of
        # Klatt's 1982 ICASSP paper.
        clean_loc_peak = np.array(clean_loc_peak)
        processed_loc_peak = np.array(processed_loc_peak)
        Wmax_clean = Kmax / (Kmax + dBMax_clean - clean_energy[:num_crit-1])
        Wlocmax_clean = Klocmax / (Klocmax + clean_loc_peak - \
                                   clean_energy[:num_crit-1])
        W_clean = Wmax_clean * Wlocmax_clean
        Wmax_processed = Kmax / (Kmax + dBMax_processed - \
                                 processed_energy[:num_crit-1])
        Wlocmax_processed = Klocmax / (Klocmax + processed_loc_peak - \
                                       processed_energy[:num_crit-1])
        W_processed = Wmax_processed * Wlocmax_processed
        W = (W_clean + W_processed) / 2
        distortion.append(np.sum(W * (clean_slope[:num_crit - 1] - \
                processed_slope[:num_crit - 1]) ** 2))

        # this normalization is not part of Klatt's paper, but helps
        # to normalize the measure. Here we scale the measure by the sum of the
        # weights
        distortion[frame_count] = distortion[frame_count] / np.sum(W)
        start += int(skiprate)
    return distortion
|
256 |
+
|
257 |
+
|
258 |
+
def llr(ref_wav, deg_wav, srate):
    """Log-Likelihood Ratio (LLR) objective speech-quality measure.

    For each 30 ms Hanning-windowed frame, fits LPC models to the reference
    and degraded signals and compares the prediction residual energy of the
    degraded model against the reference autocorrelation matrix.

    Args:
        ref_wav (np.ndarray): clean/reference waveform.
        deg_wav (np.ndarray): degraded/processed waveform (same length).
        srate (int): sampling rate in Hz.

    Returns:
        np.ndarray: per-frame LLR values (NaN/inf replaced via nan_to_num).
    """
    clean_speech = ref_wav
    processed_speech = deg_wav
    clean_length = ref_wav.shape[0]
    processed_length = deg_wav.shape[0]
    assert clean_length == processed_length, clean_length

    winlength = round(30 * srate / 1000.) # 240 wlen in samples
    skiprate = np.floor(winlength / 4)
    if srate < 10000:
        # LPC analysis order
        P = 10
    else:
        P = 16

    # For each frame of input speech, calculate the Log Likelihood Ratio
    num_frames = int(clean_length / skiprate - (winlength / skiprate))
    start = 0
    time = np.linspace(1, winlength, winlength) / (winlength + 1)
    window = 0.5 * (1 - np.cos(2 * np.pi * time))
    distortion = []

    for frame_count in range(num_frames):
        # (1) Get the Frames for the test and reference speech.
        # Multiply by Hanning window.
        clean_frame = clean_speech[start:start+winlength]
        processed_frame = processed_speech[start:start+winlength]
        clean_frame = clean_frame * window
        processed_frame = processed_frame * window

        # (2) Get the autocorrelation lags and LPC params used
        # to compute the LLR measure
        R_clean, Ref_clean, A_clean = lpcoeff(clean_frame, P)
        R_processed, Ref_processed, A_processed = lpcoeff(processed_frame, P)
        A_clean = A_clean[None, :]
        A_processed = A_processed[None, :]

        # (3) Compute the LLR measure: a_p' R_c a_p / a_c' R_c a_c
        numerator = A_processed.dot(toeplitz(R_clean)).dot(A_processed.T)
        denominator = A_clean.dot(toeplitz(R_clean)).dot(A_clean.T)

        # A non-positive ratio would make the log undefined; report it.
        if (numerator/denominator) <= 0:
            print(f'Numerator: {numerator}')
            print(f'Denominator: {denominator}')

        log_ = np.log(numerator / denominator)
        distortion.append(np.squeeze(log_))
        start += int(skiprate)
    return np.nan_to_num(np.array(distortion))
|
307 |
+
# -------------------------------------------------------------------------- #
|
308 |
+
|
309 |
+
#!/usr/bin/env python3
|
310 |
+
|
311 |
+
# Copyright 2020 Wen-Chin Huang and Tomoki Hayashi
|
312 |
+
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
313 |
+
# ported from https://github.com/espnet/espnet/blob/master/utils/mcd_calculate.py
|
314 |
+
|
315 |
+
"""Evaluate MCD between generated and groundtruth audios with SPTK-based mcep."""
|
316 |
+
|
317 |
+
from typing import Tuple
|
318 |
+
|
319 |
+
import numpy as np
|
320 |
+
import pysptk
|
321 |
+
from fastdtw import fastdtw
|
322 |
+
from scipy import spatial
|
323 |
+
|
324 |
+
|
325 |
+
def sptk_extract(
    x: np.ndarray,
    fs: int,
    n_fft: int = 512,
    n_shift: int = 256,
    mcep_dim: int = 25,
    mcep_alpha: float = 0.41,
    is_padding: bool = False,
) -> np.ndarray:
    """Extract SPTK-based mel-cepstrum.

    Args:
        x (ndarray): 1D waveform array.
        fs (int): Sampling rate
        n_fft (int): FFT length in point (default=512).
        n_shift (int): Shift length in point (default=256).
        mcep_dim (int): Dimension of mel-cepstrum (default=25).
        mcep_alpha (float): All pass filter coefficient (default=0.41).
        is_padding (bool): Whether to pad the end of signal (default=False).

    Returns:
        ndarray: Mel-cepstrum sequence, one row per analysis frame
        (presumably shape (n_frame, mcep_dim + 1) per pysptk.mcep — confirm).
    """
    # perform padding so the tail of the signal is covered by a full frame
    if is_padding:
        n_pad = n_fft - (len(x) - n_fft) % n_shift
        x = np.pad(x, (0, n_pad), "reflect")

    # get number of frames
    n_frame = (len(x) - n_fft) // n_shift + 1

    # get window function
    win = pysptk.sptk.hamming(n_fft)

    # check mcep and alpha; callers may pass None explicitly to request
    # fs-dependent defaults
    if mcep_dim is None or mcep_alpha is None:
        mcep_dim, mcep_alpha = _get_best_mcep_params(fs)

    # calculate mel-cepstrum frame by frame
    mcep = [
        pysptk.mcep(
            x[n_shift * i : n_shift * i + n_fft] * win,
            mcep_dim,
            mcep_alpha,
            eps=1e-6,
            etype=1,
        )
        for i in range(n_frame)
    ]

    return np.stack(mcep)
|
377 |
+
|
378 |
+
|
379 |
+
def _get_best_mcep_params(fs: int) -> Tuple[int, float]:
|
380 |
+
# https://sp-nitech.github.io/sptk/latest/main/mgcep.html#_CPPv4N4sptk19MelCepstralAnalysisE
|
381 |
+
if fs == 8000:
|
382 |
+
return 13, 0.31
|
383 |
+
elif fs == 16000:
|
384 |
+
return 23, 0.42
|
385 |
+
elif fs == 22050:
|
386 |
+
return 34, 0.45
|
387 |
+
elif fs == 24000:
|
388 |
+
return 34, 0.46
|
389 |
+
elif fs == 32000:
|
390 |
+
return 36, 0.50
|
391 |
+
elif fs == 44100:
|
392 |
+
return 39, 0.53
|
393 |
+
elif fs == 48000:
|
394 |
+
return 39, 0.55
|
395 |
+
else:
|
396 |
+
raise ValueError(f"Not found the setting for {fs}.")
|
397 |
+
|
398 |
+
|
399 |
+
def calculate_mcd(
    inf_audio,
    ref_audio,
    fs,
    n_fft=1024,
    n_shift=256,
    mcep_dim=None,
    mcep_alpha=None,
):
    """Calculate MCD (mel-cepstral distortion) between two waveforms.

    Mel-cepstra are extracted from both signals, DTW-aligned, and the
    10/ln(10)*sqrt(2*sum(diff^2)) distortion is averaged over the aligned
    frames.

    Args:
        inf_audio (ndarray): generated/inferred waveform.
        ref_audio (ndarray): ground-truth/reference waveform.
        fs (int): sampling rate in Hz.
        n_fft (int): FFT length (default=1024).
        n_shift (int): frame shift in samples (default=256).
        mcep_dim (int or None): mel-cepstrum order; None selects the
            fs-dependent default inside sptk_extract.
        mcep_alpha (float or None): all-pass constant; None selects the
            fs-dependent default inside sptk_extract.

    Returns:
        float: average MCD over the DTW-aligned frames.
    """

    # extract ground truth and converted features
    gen_mcep = sptk_extract(
        x=inf_audio,
        fs=fs,
        n_fft=n_fft,
        n_shift=n_shift,
        mcep_dim=mcep_dim,
        mcep_alpha=mcep_alpha,
    )
    gt_mcep = sptk_extract(
        x=ref_audio,
        fs=fs,
        n_fft=n_fft,
        n_shift=n_shift,
        mcep_dim=mcep_dim,
        mcep_alpha=mcep_alpha,
    )

    # DTW-align the two mel-cepstral sequences
    _, path = fastdtw(gen_mcep, gt_mcep, dist=spatial.distance.euclidean)
    twf = np.array(path).T
    gen_mcep_dtw = gen_mcep[twf[0]]
    gt_mcep_dtw = gt_mcep[twf[1]]

    # MCD over the aligned frames
    diff2sum = np.sum((gen_mcep_dtw - gt_mcep_dtw) ** 2, 1)
    mcd = np.mean(10.0 / np.log(10.0) * np.sqrt(2 * diff2sum), 0)

    return mcd
|
scores/llr.py
ADDED
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from basis import ScoreBasis
|
2 |
+
import numpy as np
|
3 |
+
from scipy.linalg import toeplitz
|
4 |
+
from scores.helper import lpcoeff
|
5 |
+
|
6 |
+
class LLR(ScoreBasis):
    """Log-Likelihood Ratio (LLR) speech-quality metric wrapper."""

    def __init__(self):
        super(LLR, self).__init__(name='LLR')
        # NOTE(review): set False here although LLR compares a reference
        # against a test signal — confirm the flag semantics in ScoreBasis.
        self.intrusive = False

    def windowed_scoring(self, audios, score_rate):
        """Score a (reference, test) pair of signals at ``score_rate`` Hz."""
        if len(audios) == 2:
            return cal_LLR(audios[0], audios[1], score_rate)
        raise ValueError('LLR needs a reference and a test signals.')
|
15 |
+
|
16 |
+
def cal_LLR(ref_wav, deg_wav, srate):
    """Compute the mean frame-level Log-Likelihood Ratio of two signals.

    Adapted from
    https://github.com/wooseok-shin/MetricGAN-plus-pytorch/blob/main/metric_functions/metric_helper.py

    :param ref_wav: clean reference signal (1-D array)
    :param deg_wav: degraded/processed signal, same length as ``ref_wav``
    :param srate: sampling rate in Hz
    :returns: mean LLR over all analysis frames (NaNs zeroed out)
    """
    clean_speech = ref_wav
    processed_speech = deg_wav
    clean_length = ref_wav.shape[0]
    processed_length = deg_wav.shape[0]
    assert clean_length == processed_length, clean_length

    # 30 ms analysis window, hopped by a quarter window.
    winlength = round(30 * srate / 1000.)  # e.g. 240 samples at 8 kHz
    skiprate = np.floor(winlength / 4)
    # LPC analysis order depends on the signal bandwidth.
    P = 10 if srate < 10000 else 16

    num_frames = int(clean_length / skiprate - (winlength / skiprate))
    # Hann-style window (original formulation, kept bit-exact).
    time_axis = np.linspace(1, winlength, winlength) / (winlength + 1)
    window = 0.5 * (1 - np.cos(2 * np.pi * time_axis))

    distortion = []
    start = 0
    for _ in range(num_frames):
        # (1) Windowed frames of reference and processed speech.
        clean_frame = clean_speech[start:start + winlength] * window
        processed_frame = processed_speech[start:start + winlength] * window

        # (2) Autocorrelation and LPC coefficients for both frames.
        R_clean, _, A_clean = lpcoeff(clean_frame, P)
        _, _, A_processed = lpcoeff(processed_frame, P)
        A_clean = A_clean[None, :]
        A_processed = A_processed[None, :]

        # (3) LLR = log( a_p' R_c a_p / a_c' R_c a_c )
        numerator = A_processed.dot(toeplitz(R_clean)).dot(A_processed.T)
        denominator = A_clean.dot(toeplitz(R_clean)).dot(A_clean.T)

        # A non-positive ratio would make the log undefined; report it.
        if (numerator / denominator) <= 0:
            print(f'Numerator: {numerator}')
            print(f'Denominator: {denominator}')

        distortion.append(np.squeeze(np.log(numerator / denominator)))
        start += int(skiprate)
    return np.mean(np.nan_to_num(np.array(distortion)))
|
66 |
+
|
scores/lsd.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from basis import ScoreBasis
|
2 |
+
import numpy as np
|
3 |
+
import librosa
|
4 |
+
|
5 |
+
EPS = 1e-12
|
6 |
+
|
7 |
+
class LSD(ScoreBasis):
    """Log-Spectral Distance (LSD) between a reference and a test signal."""

    def __init__(self):
        super(LSD, self).__init__(name='LSD')
        self.intrusive = False
        # Signals are evaluated as mono.
        self.mono = True

    def windowed_scoring(self, audios, score_rate):
        """Score a (reference, estimate) pair of waveforms.

        :param audios: sequence of exactly two signals [reference, estimate]
        :param score_rate: sampling rate of both signals in Hz
        :returns: mean log-spectral distance
        :raises ValueError: if ``audios`` does not contain exactly two signals
        """
        if len(audios) != 2:
            # Bug fix: the message previously said 'NB_PESQ' (copy-paste
            # from another metric); it now names this metric correctly.
            raise ValueError('LSD needs a reference and a test signals.')
        est = wav_to_spectrogram(audios[1], score_rate)
        target = wav_to_spectrogram(audios[0], score_rate)
        return cal_LSD(est, target)
|
19 |
+
|
20 |
+
def wav_to_spectrogram(wav, rate):
    """Return the magnitude STFT of ``wav`` with frames on the first axis.

    :param wav: 1-D waveform
    :param rate: sampling rate in Hz
    :returns: array of shape (frames, bins)
    """
    # 10 ms hop; FFT size scales with the rate (2048 at 48 kHz).
    hop_length = int(rate / 100)
    n_fft = int(2048 / (48000 / rate))
    magnitude = np.abs(librosa.stft(wav, hop_length=hop_length, n_fft=n_fft))
    # librosa yields (bins, frames); callers expect (frames, bins).
    return magnitude.T
|
26 |
+
|
27 |
+
def cal_LSD(est, target):
    """Mean log-spectral distance between two magnitude spectrograms.

    :param est: estimated spectrogram, shape (frames, bins)
    :param target: reference spectrogram, same shape
    :returns: LSD averaged over frames
    """
    # Squared log power ratio per bin; EPS guards both division and log.
    log_ratio = np.log10(target ** 2 / ((est + EPS) ** 2) + EPS)
    per_frame = np.mean(log_ratio ** 2, axis=1) ** 0.5
    return np.mean(per_frame, axis=0)
|
scores/mcd.py
ADDED
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from basis import ScoreBasis
|
2 |
+
import librosa
|
3 |
+
import math
|
4 |
+
import numpy as np
|
5 |
+
import pyworld
|
6 |
+
import pysptk
|
7 |
+
from fastdtw import fastdtw
|
8 |
+
from scipy.spatial.distance import euclidean
|
9 |
+
#from scores.helper import calculate_mcd
|
10 |
+
#from pymcd.mcd import Calculate_MCD
|
11 |
+
#refer to : https://github.com/chenqi008/pymcd/blob/main/pymcd/mcd.py
|
12 |
+
class MCD(ScoreBasis):
    """Mel-Cepstral Distortion metric wrapper (uses the "plain" mode)."""
    # refer to: https://github.com/chenqi008/pymcd/blob/main/pymcd/mcd.py

    def __init__(self):
        super(MCD, self).__init__(name='MCD')
        self.intrusive = False
        # Calculate_MCD supports "plain", "dtw" and "dtw_sl"; "plain" pairs
        # frames one-to-one after zero-padding the shorter signal.
        self.mcd_toolbox = Calculate_MCD(MCD_mode="plain")

    def windowed_scoring(self, audios, score_rate):
        """Score a pair of signals; audios must hold exactly two waveforms."""
        if len(audios) == 2:
            # NOTE(review): audios[1] is passed as the reference and
            # audios[0] as the synthesized signal — confirm ordering
            # against the callers of windowed_scoring.
            return self.mcd_toolbox.calculate_mcd(audios[1], audios[0], score_rate)
        raise ValueError('MCD needs a reference and a test signals.')
|
23 |
+
|
24 |
+
# ================================================= #
|
25 |
+
# calculate the Mel-Cepstral Distortion (MCD) value #
|
26 |
+
# ================================================= #
|
27 |
+
#refer to : https://github.com/chenqi008/pymcd/blob/main/pymcd/mcd.py
|
28 |
+
class Calculate_MCD(object):
    """Compute the Mel-Cepstral Distortion (MCD) between two waveforms.

    Ported from https://github.com/chenqi008/pymcd/blob/main/pymcd/mcd.py.
    Three alignment modes are supported via ``MCD_mode``:
    "plain" (frame-by-frame after zero-padding), "dtw" (DTW-aligned) and
    "dtw_sl" (DTW-aligned, scaled by the length ratio of the two inputs).
    """
    def __init__(self, MCD_mode):
        super(Calculate_MCD, self).__init__()
        # One of "plain", "dtw", "dtw_sl"; selects the frame alignment.
        self.MCD_mode = MCD_mode
        #self.SAMPLING_RATE = 22050
        # WORLD vocoder analysis frame period in milliseconds.
        self.FRAME_PERIOD = 5.0
        # dB scaling constant for cepstral distance: 10/ln(10)*sqrt(2).
        self.log_spec_dB_const = 10.0 / math.log(10.0) * math.sqrt(2.0)  # 6.141851463713754

    def load_wav(self, wav_file, sample_rate):
        """
        Load a wav file with librosa (downmixed to mono).

        :param wav_file: path to wav file
        :param sample_rate: target sampling rate for resampling
        :return: audio time series numpy array
        """
        wav, _ = librosa.load(wav_file, sr=sample_rate, mono=True)
        return wav

    # Distance metric: Euclidean distance scaled to dB.
    def log_spec_dB_dist(self, x, y):
        # log_spec_dB_const = 10.0 / math.log(10.0) * math.sqrt(2.0)
        diff = x - y
        return self.log_spec_dB_const * math.sqrt(np.inner(diff, diff))

    # Sum of framewise Euclidean distances along an alignment path.
    # def calculate_mcd_distance(self, x, y, distance, path):
    def calculate_mcd_distance(self, x, y, path):
        '''
        Accumulate the Euclidean distance between the frames of x and y
        paired by ``path``.

        :param x: MCEP matrix (frames x order) of the first signal
        :param y: MCEP matrix of the second signal
        :param path: list of (i, j) index pairs aligning x[i] with y[j]
        :returns: (number of aligned frame pairs, total Euclidean cost)
        '''
        pathx = list(map(lambda l: l[0], path))
        pathy = list(map(lambda l: l[1], path))
        x, y = x[pathx], y[pathy]
        frames_tot = x.shape[0]  # length of pairs

        # NOTE(review): the distance here includes the 0th cepstral
        # coefficient even though the DTW path (in average_mcd) is computed
        # on coefficients 1: — confirm this matches the intended MCD variant.
        z = x - y
        min_cost_tot = np.sqrt((z * z).sum(-1)).sum()

        return frames_tot, min_cost_tot

    # Extract mel-cepstral (MCEP) features via the WORLD vocoder + SPTK.
    # alpha = 0.65 # commonly used at 22050 Hz
    def wav2mcep_numpy(self, loaded_wav, score_rate=22050, alpha=0.65, fft_size=512):
        # NOTE(review): alpha=0.65 is kept for every sampling rate the
        # caller passes in — confirm this is acceptable for rates != 22050.

        # Use WORLD vocoder to extract the spectral envelope.
        _, sp, _ = pyworld.wav2world(loaded_wav.astype(np.double), fs=score_rate,
            frame_period=self.FRAME_PERIOD, fft_size=fft_size)
        # Extract MCEP features (order-13 mel-cepstrum from the envelope).
        mcep = pysptk.sptk.mcep(sp, order=13, alpha=alpha, maxiter=0,
                                etype=1, eps=1.0E-8, min_det=0.0, itype=3)

        return mcep

    # Calculate the Mel-Cepstral Distortion (MCD) value.
    #def average_mcd(self, ref_audio_file, syn_audio_file, cost_function, MCD_mode):
    def average_mcd(self, loaded_ref_wav, loaded_syn_wav, cost_function, MCD_mode, score_rate):
        """
        Calculate the average MCD between two loaded waveforms.

        :param loaded_ref_wav: reference waveform (numpy array)
        :param loaded_syn_wav: synthesized waveform (numpy array)
        :param cost_function: distance metric used
            (NOTE(review): currently unused — calculate_mcd_distance
            hard-codes the Euclidean distance)
        :param MCD_mode: "plain", "dtw" or "dtw_sl" frame alignment
        :param score_rate: sampling rate of both waveforms in Hz
        :returns: average MCD in dB
        """
        # load wav from given wav file
        #loaded_ref_wav = self.load_wav(ref_audio_file, sample_rate=self.SAMPLING_RATE)
        #loaded_syn_wav = self.load_wav(syn_audio_file, sample_rate=self.SAMPLING_RATE)

        if MCD_mode == "plain":
            # Zero-pad the shorter signal so both have the same frame count.
            if len(loaded_ref_wav)<len(loaded_syn_wav):
                loaded_ref_wav = np.pad(loaded_ref_wav, (0, len(loaded_syn_wav)-len(loaded_ref_wav)))
            else:
                loaded_syn_wav = np.pad(loaded_syn_wav, (0, len(loaded_ref_wav)-len(loaded_syn_wav)))

        # extract MCEP features (vectors): 2D matrix (num x mcep_size)
        ref_mcep_vec = self.wav2mcep_numpy(loaded_ref_wav, score_rate)
        syn_mcep_vec = self.wav2mcep_numpy(loaded_syn_wav, score_rate)

        if MCD_mode == "plain":
            # Identity alignment: frame i of ref pairs with frame i of syn.
            path = []
            # for i in range(num_temp):
            for i in range(len(ref_mcep_vec)):
                path.append((i, i))
        elif MCD_mode == "dtw":
            # DTW alignment on coefficients 1: (0th/energy term excluded).
            _, path = fastdtw(ref_mcep_vec[:, 1:], syn_mcep_vec[:, 1:], dist=euclidean)
        elif MCD_mode == "dtw_sl":
            # Length-ratio penalty (always >= 1) applied to the final score.
            cof = len(ref_mcep_vec)/len(syn_mcep_vec) if len(ref_mcep_vec)>len(syn_mcep_vec) else len(syn_mcep_vec)/len(ref_mcep_vec)
            _, path = fastdtw(ref_mcep_vec[:, 1:], syn_mcep_vec[:, 1:], dist=euclidean)

        frames_tot, min_cost_tot = self.calculate_mcd_distance(ref_mcep_vec, syn_mcep_vec, path)

        if MCD_mode == "dtw_sl":
            mean_mcd = cof * self.log_spec_dB_const * min_cost_tot / frames_tot
        else:
            mean_mcd = self.log_spec_dB_const * min_cost_tot / frames_tot

        return mean_mcd

    # Public entry point: compute the MCD for a pair of waveforms.
    def calculate_mcd(self, reference_audio, synthesized_audio, score_rate):
        # extract acoustic features and average the framewise distortion
        mean_mcd = self.average_mcd(reference_audio, synthesized_audio, self.log_spec_dB_dist, self.MCD_mode, score_rate)

        return mean_mcd
|
scores/mosnet/__init__.py
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
def load(window, hop=None):
    """Instantiate a MOSNet scorer after configuring TensorFlow GPUs.

    :param window: analysis window passed through to MOSNet
    :param hop: hop size passed through to MOSNet (may be None)
    :returns: a MOSNet instance
    """
    import tensorflow as tf
    from .model import MOSNet

    tf.debugging.set_log_device_placement(False)
    # Enable memory growth so TF does not grab all GPU memory up front.
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        try:
            # Memory growth needs to be identical across all GPUs.
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)

            logical_gpus = tf.config.experimental.list_logical_devices('GPU')
            print(len(gpus), "Physical GPUs,",
                  len(logical_gpus), "Logical GPUs")
        except RuntimeError as e:
            # set_memory_growth must run before GPUs are initialized.
            print(e)

    return MOSNet(window, hop)
|
scores/mosnet/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (789 Bytes). View file
|
|
scores/mosnet/cnn_blstm.h5
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:78b75e7d76ee6074ea7d57dcffa56d0c90be9d3d8dedc2217e25e259423cb756
|
3 |
+
size 14248464
|