alibabasglab commited on
Commit
936f6fa
·
verified ·
1 Parent(s): f590a26

Upload 73 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. __init__.py +169 -0
  2. __pycache__/basis.cpython-38.pyc +0 -0
  3. __pycache__/metric_loader.cpython-38.pyc +0 -0
  4. __pycache__/metrics.cpython-38.pyc +0 -0
  5. __pycache__/speechscore.cpython-38.pyc +0 -0
  6. audios/clean/audio_1.wav +0 -0
  7. audios/clean/audio_2.wav +0 -0
  8. audios/noisy/audio_1.wav +0 -0
  9. audios/noisy/audio_2.wav +0 -0
  10. audios/ref.wav +0 -0
  11. audios/test.wav +0 -0
  12. basis.py +113 -0
  13. demo.py +29 -0
  14. requirement.txt +5 -0
  15. scores/__init__.py +0 -0
  16. scores/__pycache__/__init__.cpython-38.pyc +0 -0
  17. scores/__pycache__/bsseval.cpython-38.pyc +0 -0
  18. scores/__pycache__/cbak.cpython-38.pyc +0 -0
  19. scores/__pycache__/covl.cpython-38.pyc +0 -0
  20. scores/__pycache__/csig.cpython-38.pyc +0 -0
  21. scores/__pycache__/fwsegsnr.cpython-38.pyc +0 -0
  22. scores/__pycache__/helper.cpython-38.pyc +0 -0
  23. scores/__pycache__/llr.cpython-38.pyc +0 -0
  24. scores/__pycache__/lsd.cpython-38.pyc +0 -0
  25. scores/__pycache__/mcd.cpython-38.pyc +0 -0
  26. scores/__pycache__/nb_pesq.cpython-38.pyc +0 -0
  27. scores/__pycache__/pesq.cpython-38.pyc +0 -0
  28. scores/__pycache__/sisdr.cpython-38.pyc +0 -0
  29. scores/__pycache__/snr.cpython-38.pyc +0 -0
  30. scores/__pycache__/ssnr.cpython-38.pyc +0 -0
  31. scores/__pycache__/stoi.cpython-38.pyc +0 -0
  32. scores/bsseval.py +21 -0
  33. scores/cbak.py +37 -0
  34. scores/covl.py +39 -0
  35. scores/csig.py +38 -0
  36. scores/dnsmos/DNSMOS/bak_ovr.onnx +3 -0
  37. scores/dnsmos/DNSMOS/model_v8.onnx +3 -0
  38. scores/dnsmos/DNSMOS/sig.onnx +3 -0
  39. scores/dnsmos/DNSMOS/sig_bak_ovr.onnx +3 -0
  40. scores/dnsmos/__pycache__/dnsmos.cpython-38.pyc +0 -0
  41. scores/dnsmos/dnsmos.py +94 -0
  42. scores/fwsegsnr.py +49 -0
  43. scores/helper.py +307 -0
  44. scores/helper_bk.py +438 -0
  45. scores/llr.py +66 -0
  46. scores/lsd.py +30 -0
  47. scores/mcd.py +136 -0
  48. scores/mosnet/__init__.py +21 -0
  49. scores/mosnet/__pycache__/__init__.cpython-38.pyc +0 -0
  50. scores/mosnet/cnn_blstm.h5 +3 -0
__init__.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ class Metric:
2
+ def __init__(self, name, window, hop=None, verbose=False):
3
+ # the metric operates on some fixed rate only or only on mono ?
4
+ self.fixed_rate = None
5
+ self.mono = False
6
+
7
+ # is the metric absolute or relative ?
8
+ self.absolute = False
9
+
10
+ # length and hop of windows
11
+ self.window = window
12
+ if hop is None:
13
+ hop = window
14
+ self.hop = hop
15
+ self.name = name
16
+ self.verbose = verbose
17
+
18
+ def test_window(self, audios, rate):
19
+ raise NotImplementedError
20
+
21
+ def test(self, *test_files, array_rate=None):
22
+ """loading sound files and making sure they all have the same lengths
23
+ (zero-padding to the largest). Also works with numpy arrays.
24
+ Then, calling the `test_window` function that should be specialised
25
+ depending on the metric."""
26
+
27
+ # imports
28
+ import soundfile as sf
29
+ import resampy
30
+ from museval.metrics import Framing
31
+ import numpy as np
32
+
33
+ audios = []
34
+ maxlen = 0
35
+ if isinstance(test_files, str):
36
+ test_files = [test_files]
37
+ if self.absolute and len(test_files) > 1:
38
+ if self.verbose:
39
+ print(' [%s] is absolute. Processing first file only'
40
+ % self.name)
41
+ test_files = [test_files[0],]
42
+
43
+ for file in test_files:
44
+ # Loading sound file
45
+ if isinstance(file, str):
46
+ audio, rate = sf.read(file, always_2d=True)
47
+ else:
48
+ rate = array_rate
49
+ if rate is None:
50
+ raise ValueError('Sampling rate needs to be specified '
51
+ 'when feeding numpy arrays.')
52
+ audio = file
53
+ # Standardize shapes
54
+ if len(audio.shape) == 1:
55
+ audio = audio[:, None]
56
+ if len(audio.shape) != 2:
57
+ raise ValueError('Please provide 1D or 2D array, received '
58
+ '{}D array'.format(len(audio.shape)))
59
+
60
+ if self.fixed_rate is not None and rate != self.fixed_rate:
61
+ if self.verbose:
62
+ print(' [%s] preferred is %dkHz rate. resampling'
63
+ % (self.name, self.fixed_rate))
64
+ audio = resampy.resample(audio, rate, self.fixed_rate, axis=0)
65
+ rate = self.fixed_rate
66
+ if self.mono and audio.shape[1] > 1:
67
+ if self.verbose:
68
+ print(' [%s] only supports mono. Will use first channel'
69
+ % self.name)
70
+ audio = audio[..., 0, None]
71
+ if self.mono:
72
+ audio = audio[..., 0]
73
+ maxlen = max(maxlen, audio.shape[0])
74
+ audios += [audio]
75
+
76
+ for index, audio in enumerate(audios):
77
+ if audio.shape[0] != maxlen:
78
+ new = np.zeros((maxlen,) + audio.shape[1:])
79
+ new[:audio.shape[0]] = audio
80
+ audios[index] = new
81
+
82
+ if self.window is not None:
83
+ framer = Framing(self.window * rate,
84
+ self.hop * rate, maxlen)
85
+ nwin = framer.nwin
86
+ result = {}
87
+ for (t, win) in enumerate(framer):
88
+ result_t = self.test_window([audio[win] for audio in audios],
89
+ rate)
90
+ for metric in result_t.keys():
91
+ if metric not in result.keys():
92
+ result[metric] = np.empty(nwin)
93
+ result[metric][t] = result_t[metric]
94
+ else:
95
+ result = self.test_window(audios, rate)
96
+ return result
97
+
98
+
99
+ import absolute
100
+ import relative
101
+
102
+
103
class MetricsList:
    """Aggregate several Metric instances and evaluate all of them in one call."""

    def __init__(self):
        self.metrics = []

    def __add__(self, metric):
        # Supports the `metrics_list + metric` accumulation used by load().
        self.metrics.append(metric)
        return self

    def __str__(self):
        names = (m.name for m in self.metrics)
        return 'Metrics: ' + ' '.join(names)

    def __call__(self, *files, rate=None):
        # Run every metric on the same inputs and merge the per-metric
        # result dicts into one mapping (later metrics win on key clashes).
        merged = {}
        for metric in self.metrics:
            partial = metric.test(*files, array_rate=rate)
            merged.update(partial)
        return merged
121
+
122
+
123
def load(metrics='', window=2, verbose=False):
    """ Load the desired metrics inside a Metrics object that can then
    be called to compute all the desired metrics.

    Parameters:
    ----------
    metrics: str or list of str
        the metrics matching any of these will be automatically loaded. this
        match is relative to the structure of the speechmetrics package.
        For instance:
        * 'absolute' will match all absolute metrics
        * 'absolute.srmr' or 'srmr' will only match SRMR
        * '' will match all

    window: float
        the window length to use for testing the files.

    verbose: boolean
        will display information during computations

    Returns:
    --------

    A MetricsList object, that can be run to get the desired metrics
    """
    import pkgutil
    import importlib

    result = MetricsList()

    # Modules already handled, so each metric is instantiated only once.
    found_modules = []
    iterator = pkgutil.walk_packages(__path__, __name__ + '.')

    if isinstance(metrics, str):
        metrics = [metrics]
    for module_info in iterator:
        if any(metric in module_info.name for metric in metrics):
            module = importlib.import_module(module_info.name)
            if module not in found_modules:
                # BUGFIX: was `found_modules += [module],` — the trailing
                # comma made the RHS a tuple containing a list, so the
                # *list* object was appended and the membership test above
                # never matched, defeating the deduplication.
                found_modules.append(module)
                if hasattr(module, 'load'):
                    load_function = getattr(module, 'load')
                    new_metric = load_function(window)
                    new_metric.verbose = verbose
                    result += new_metric
                    print('Loaded ', module_info.name)
    return result
__pycache__/basis.cpython-38.pyc ADDED
Binary file (1.57 kB). View file
 
__pycache__/metric_loader.cpython-38.pyc ADDED
Binary file (3.48 kB). View file
 
__pycache__/metrics.cpython-38.pyc ADDED
Binary file (2.72 kB). View file
 
__pycache__/speechscore.cpython-38.pyc ADDED
Binary file (5.95 kB). View file
 
audios/clean/audio_1.wav ADDED
Binary file (76.8 kB). View file
 
audios/clean/audio_2.wav ADDED
Binary file (76.8 kB). View file
 
audios/noisy/audio_1.wav ADDED
Binary file (76.8 kB). View file
 
audios/noisy/audio_2.wav ADDED
Binary file (76.8 kB). View file
 
audios/ref.wav ADDED
Binary file (76.8 kB). View file
 
audios/test.wav ADDED
Binary file (76.8 kB). View file
 
basis.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
class ScoreBasis:
    """Base class shared by all score implementations.

    Subclasses override ``windowed_scoring`` and may set ``score_rate``
    (forcing resampling to a fixed rate) and ``intrusive`` in ``__init__``.
    """

    def __init__(self, name=None):
        # When set, inputs are resampled to this rate before scoring.
        self.score_rate = None
        # Intrusive scores require a reference signal alongside the test one.
        self.intrusive = True
        self.name = name

    def windowed_scoring(self, audios, score_rate):
        """Score one (windowed) list of audio signals; must be overridden."""
        raise NotImplementedError(f'In {self.name}, windowed_scoring is not yet implemented')

    def scoring(self, data, window=None, score_rate=None):
        """ calling the `windowed_scoring` function that should be specialised
        depending on the score.

        Parameters
        ----------
        data : dict
            Must provide 'audio' (list of signals) and 'rate' (their rate).
        window : float or None
            Window length in seconds; None scores the whole signals.
        score_rate : int, optional
            Ignored when the subclass fixes ``self.score_rate``; otherwise
            the data rate is used.  (Kept for interface compatibility.)

        Returns
        -------
        The subclass result directly (window=None), or a dict keyed by
        window index.
        """
        audios = data['audio']
        score_rate = data['rate']

        if self.score_rate is not None:
            score_rate = self.score_rate

        if score_rate != data['rate']:
            import resampy  # lazy: only needed when actually resampling
            for index, audio in enumerate(audios):
                audios[index] = resampy.resample(audio, data['rate'], score_rate, axis=0)

        if window is not None:
            from museval.metrics import Framing  # lazy: windowed mode only
            # BUGFIX: `maxlen` was referenced here without ever being
            # defined (NameError on every windowed call); frame over the
            # longest of the provided signals.
            maxlen = max(len(audio) for audio in audios)
            framer = Framing(window * score_rate, window * score_rate, maxlen)
            result = {}
            for (t, win) in enumerate(framer):
                result[t] = self.windowed_scoring([audio[win] for audio in audios], score_rate)
        else:
            result = self.windowed_scoring(audios, score_rate)
        return result
112
+
113
+
demo.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# pprint renders the nested score dictionaries in a readable layout.
import pprint
# SpeechScore drives the battery of speech-quality metrics.
from speechscore import SpeechScore

# Guard so the demo only runs when executed directly.
if __name__ == '__main__':
    # Request the full set of supported metrics; any subset works too.
    metric_names = [
        'SRMR', 'PESQ', 'NB_PESQ', 'STOI', 'SISDR',
        'FWSEGSNR', 'LSD', 'BSSEval', 'DNSMOS',
        'SNR', 'SSNR', 'LLR', 'CSIG', 'CBAK',
        'COVL', 'MCD'
    ]
    mySpeechScore = SpeechScore(metric_names)

    # test_path / reference_path accept directories or single audio files
    # (.wav or .flac); window=None processes each full signal without
    # windowing; score_rate fixes the sampling rate used for scoring;
    # return_mean=True adds a per-metric mean under 'Mean_Score'.
    scores = mySpeechScore(
        test_path='audios/noisy/',
        reference_path='audios/clean/',
        window=None,
        score_rate=16000,
        return_mean=True,
    )

    # Full per-file breakdown, then just the aggregated means.
    pprint.pprint(scores)
    pprint.pprint(scores['Mean_Score'])
requirement.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ pysptk
2
+ pymcd
3
+ pyworld
4
+ fastdtw
5
+ museval
scores/__init__.py ADDED
File without changes
scores/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (187 Bytes). View file
 
scores/__pycache__/bsseval.cpython-38.pyc ADDED
Binary file (1.15 kB). View file
 
scores/__pycache__/cbak.cpython-38.pyc ADDED
Binary file (1.48 kB). View file
 
scores/__pycache__/covl.cpython-38.pyc ADDED
Binary file (1.53 kB). View file
 
scores/__pycache__/csig.cpython-38.pyc ADDED
Binary file (1.52 kB). View file
 
scores/__pycache__/fwsegsnr.cpython-38.pyc ADDED
Binary file (2.09 kB). View file
 
scores/__pycache__/helper.cpython-38.pyc ADDED
Binary file (6.64 kB). View file
 
scores/__pycache__/llr.cpython-38.pyc ADDED
Binary file (2.09 kB). View file
 
scores/__pycache__/lsd.cpython-38.pyc ADDED
Binary file (1.5 kB). View file
 
scores/__pycache__/mcd.cpython-38.pyc ADDED
Binary file (4.65 kB). View file
 
scores/__pycache__/nb_pesq.cpython-38.pyc ADDED
Binary file (922 Bytes). View file
 
scores/__pycache__/pesq.cpython-38.pyc ADDED
Binary file (921 Bytes). View file
 
scores/__pycache__/sisdr.cpython-38.pyc ADDED
Binary file (1.2 kB). View file
 
scores/__pycache__/snr.cpython-38.pyc ADDED
Binary file (1.52 kB). View file
 
scores/__pycache__/ssnr.cpython-38.pyc ADDED
Binary file (2.05 kB). View file
 
scores/__pycache__/stoi.cpython-38.pyc ADDED
Binary file (926 Bytes). View file
 
scores/bsseval.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from basis import ScoreBasis
3
+
4
+
5
class BSSEval(ScoreBasis):
    """BSS Eval source-separation scores (SDR / ISR / SAR) via museval."""

    def __init__(self):
        super(BSSEval, self).__init__(name='BSSEval')
        # NOTE(review): kept as in the original, but BSS Eval does require a
        # reference signal — confirm the intended semantics of this flag.
        self.intrusive = False

    def windowed_scoring(self, audios, score_rate):
        from museval.metrics import bss_eval

        if len(audios) != 2:
            raise ValueError('BSSEval needs a reference and a test signals.')

        # An infinite window/hop evaluates the whole signal as one frame.
        frame = np.inf * score_rate
        scores = bss_eval(
            reference_sources=audios[1][None, ...],  # [nsrc, nsample, nchannels]
            estimated_sources=audios[0][None, ...],
            window=frame,
            hop=frame,
        )
        # bss_eval returns (SDR, ISR, SIR, SAR); SIR (index 2) is omitted.
        return {'SDR': scores[0][0][0], 'ISR': scores[1][0][0], 'SAR': scores[3][0][0]}
scores/cbak.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from basis import ScoreBasis
2
+ import numpy as np
3
+ from pesq import pesq
4
+ from scores.helper import wss, llr, SSNR, trim_mos
5
+
6
class CBAK(ScoreBasis):
    """Composite background-noise measure (CBAK) on a 1-5 MOS-like scale."""

    def __init__(self):
        super(CBAK, self).__init__(name='CBAK')
        # CBAK is defined for 16 kHz speech; inputs are resampled to it.
        self.score_rate = 16000
        self.intrusive = False

    def windowed_scoring(self, audios, score_rate):
        if len(audios) != 2:
            raise ValueError('CBAK needs a reference and a test signals.')
        return cal_CBAK(audios[0], audios[1], score_rate)


def cal_CBAK(target_wav, pred_wav, fs):
    """CBAK = 1.634 + 0.478*PESQ - 0.007*WSS + 0.063*segSNR, clipped to [1, 5]."""
    alpha = 0.95

    # Weighted spectral slope: keep only the best alpha fraction of frames.
    frame_dists = sorted(wss(target_wav, pred_wav, fs))
    keep = int(round(len(frame_dists) * alpha))
    wss_dist = np.mean(frame_dists[:keep])

    # Segmental SNR, averaged over frames (overall SNR is discarded).
    _, segsnr_frames = SSNR(target_wav, pred_wav, fs)
    seg_snr = np.mean(segsnr_frames)

    # Wide-band PESQ.
    pesq_raw = pesq(fs, target_wav, pred_wav, 'wb')

    composite = 1.634 + 0.478 * pesq_raw - 0.007 * wss_dist + 0.063 * seg_snr
    return trim_mos(composite)
37
+
scores/covl.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from basis import ScoreBasis
2
+ import numpy as np
3
+ from pesq import pesq
4
+ from scores.helper import wss, llr, SSNR, trim_mos
5
+
6
class COVL(ScoreBasis):
    """Composite overall-quality measure (COVL) on a 1-5 MOS-like scale."""

    def __init__(self):
        super(COVL, self).__init__(name='COVL')
        # COVL is defined for 16 kHz speech; inputs are resampled to it.
        self.score_rate = 16000
        self.intrusive = False

    def windowed_scoring(self, audios, score_rate):
        if len(audios) != 2:
            raise ValueError('COVL needs a reference and a test signals.')
        return cal_COVL(audios[0], audios[1], score_rate)


def cal_COVL(target_wav, pred_wav, fs):
    """COVL = 1.594 + 0.805*PESQ - 0.512*LLR - 0.007*WSS, clipped to [1, 5]."""
    alpha = 0.95

    # Weighted spectral slope: keep only the best alpha fraction of frames.
    frame_dists = sorted(wss(target_wav, pred_wav, fs))
    keep = int(round(len(frame_dists) * alpha))
    wss_dist = np.mean(frame_dists[:keep])

    # Log-likelihood ratio with the same frame trimming.
    llr_frames = sorted(llr(target_wav, pred_wav, fs))
    llr_mean = np.mean(llr_frames[:round(len(llr_frames) * alpha)])

    # Wide-band PESQ.
    pesq_raw = pesq(fs, target_wav, pred_wav, 'wb')

    composite = 1.594 + 0.805 * pesq_raw - 0.512 * llr_mean - 0.007 * wss_dist
    return trim_mos(composite)
scores/csig.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from basis import ScoreBasis
2
+ import numpy as np
3
+ from pesq import pesq
4
+ from scores.helper import wss, llr, SSNR, trim_mos
5
+
6
class CSIG(ScoreBasis):
    """Composite signal-distortion measure (CSIG) on a 1-5 MOS-like scale."""

    def __init__(self):
        super(CSIG, self).__init__(name='CSIG')
        # CSIG is defined for 16 kHz speech; inputs are resampled to it.
        self.score_rate = 16000
        # CONSISTENCY FIX: the sibling composite measures (CBAK, COVL) set
        # this flag explicitly; CSIG previously left the inherited default.
        self.intrusive = False

    def windowed_scoring(self, audios, score_rate):
        if len(audios) != 2:
            raise ValueError('CSIG needs a reference and a test signals.')
        return cal_CSIG(audios[0], audios[1], score_rate)


def cal_CSIG(target_wav, pred_wav, fs):
    """CSIG = 3.093 - 1.029*LLR + 0.603*PESQ - 0.009*WSS, clipped to [1, 5].

    Parameters
    ----------
    target_wav : reference (clean) signal
    pred_wav : degraded / processed signal
    fs : sampling rate (16 kHz expected for wide-band PESQ)
    """
    alpha = 0.95

    # Weighted spectral slope: keep only the best alpha fraction of frames.
    wss_dist_vec = sorted(wss(target_wav, pred_wav, fs))
    wss_dist = np.mean(wss_dist_vec[:int(round(len(wss_dist_vec) * alpha))])

    # Log-likelihood ratio with the same frame trimming.
    LLR_dist = sorted(llr(target_wav, pred_wav, fs))
    llr_mean = np.mean(LLR_dist[:round(len(LLR_dist) * alpha)])

    # Wide-band PESQ.
    pesq_raw = pesq(fs, target_wav, pred_wav, 'wb')

    Csig = 3.093 - 1.029 * llr_mean + 0.603 * pesq_raw - 0.009 * wss_dist
    return float(trim_mos(Csig))
scores/dnsmos/DNSMOS/bak_ovr.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f335c90994618150192a656a474bcf8a9cbcedbc47965494ba8da79605d1308
3
+ size 742375
scores/dnsmos/DNSMOS/model_v8.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9246480c58567bc6affd4200938e77eef49468c8bc7ed3776d109c07456f6e91
3
+ size 224860
scores/dnsmos/DNSMOS/sig.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2fbdb293bc2366dfbae2b7477c490f981d24a8b4405efd3c11787569c6549d7
3
+ size 742203
scores/dnsmos/DNSMOS/sig_bak_ovr.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:269fbebdb513aa23cddfbb593542ecc540284a91849ac50516870e1ac78f6edd
3
+ size 1157965
scores/dnsmos/__pycache__/dnsmos.cpython-38.pyc ADDED
Binary file (3.63 kB). View file
 
scores/dnsmos/dnsmos.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import librosa
4
+ import numpy as np
5
+ import numpy.polynomial.polynomial as poly
6
+ import onnxruntime as ort
7
+ import soundfile as sf
8
+
9
+ SAMPLING_RATE = 16000
10
+ INPUT_LENGTH = 9.01
11
+
12
+ from basis import ScoreBasis
13
+
14
+
15
class DNSMOS(ScoreBasis):
    """DNSMOS neural MOS predictor (P.835 SIG/BAK/OVRL plus P.808 MOS)."""

    def __init__(self):
        super(DNSMOS, self).__init__(name='DNSMOS')
        # NOTE(review): DNSMOS scores a single signal, yet this flag is left
        # True as in the original — confirm the intended semantics.
        self.intrusive = True
        self.score_rate = 16000
        # ONNX model locations, relative to the repository root.
        self.p808_model_path = os.path.join('scores/dnsmos/DNSMOS', 'model_v8.onnx')
        self.primary_model_path = os.path.join('scores/dnsmos/DNSMOS', 'sig_bak_ovr.onnx')
        self.compute_score = ComputeScore(self.primary_model_path, self.p808_model_path)

    def windowed_scoring(self, audios, rate):
        # Given a (reference, test) pair, score the test signal (index 1);
        # otherwise score the only signal provided.
        target = audios[1] if len(audios) == 2 else audios[0]
        return self.compute_score.cal_mos(target, rate)
29
+
30
class ComputeScore:
    """Runs the DNSMOS ONNX models over 9.01-second segments and averages
    the per-segment predictions."""

    def __init__(self, primary_model_path, p808_model_path) -> None:
        # primary model: raw SIG/BAK/OVRL; p808 model: P.808 MOS.
        self.onnx_sess = ort.InferenceSession(primary_model_path)
        self.p808_onnx_sess = ort.InferenceSession(p808_model_path)

    def audio_melspec(self, audio, n_mels=120, frame_size=320, hop_length=160, sr=16000, to_db=True):
        """Return the (time x mel) log-mel spectrogram fed to the P.808 model."""
        mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=frame_size+1, hop_length=hop_length, n_mels=n_mels)
        if to_db:
            # Normalize dB values into roughly [0, 1] as the model expects.
            mel_spec = (librosa.power_to_db(mel_spec, ref=np.max)+40)/40
        return mel_spec.T

    def get_polyfit_val(self, sig, bak, ovr):
        """Map raw model outputs to calibrated P.835 scores via fixed
        second-order polynomials (coefficients from the DNSMOS release)."""
        p_ovr = np.poly1d([-0.06766283, 1.11546468, 0.04602535])
        p_sig = np.poly1d([-0.08397278, 1.22083953, 0.0052439])
        p_bak = np.poly1d([-0.13166888, 1.60915514, -0.39604546])
        return p_sig(sig), p_bak(bak), p_ovr(ovr)

    def cal_mos(self, audio, sampling_rate):
        """Predict MOS for one mono signal.

        Returns a dict with 'OVRL', 'SIG', 'BAK' (calibrated P.835 means)
        and 'P808_MOS' (mean P.808 prediction) over 1-second hops.
        """
        fs = sampling_rate
        len_samples = int(INPUT_LENGTH * fs)
        # Tile short clips until they cover one full 9.01 s input window.
        while len(audio) < len_samples:
            audio = np.append(audio, audio)

        # One prediction per 1-second hop of the (possibly tiled) signal.
        num_hops = int(np.floor(len(audio) / fs) - INPUT_LENGTH) + 1
        hop_len_samples = fs
        predicted_mos_sig_seg = []
        predicted_mos_bak_seg = []
        predicted_mos_ovr_seg = []
        predicted_p808_mos = []

        for idx in range(num_hops):
            audio_seg = audio[int(idx * hop_len_samples): int((idx + INPUT_LENGTH) * hop_len_samples)]
            if len(audio_seg) < len_samples:
                continue

            input_features = np.array(audio_seg).astype('float32')[np.newaxis, :]
            # The P.808 model takes the mel spectrogram of the segment minus
            # its final hop of samples.
            p808_input_features = np.array(self.audio_melspec(audio=audio_seg[:-160])).astype('float32')[np.newaxis, :, :]
            oi = {'input_1': input_features}
            p808_oi = {'input_1': p808_input_features}
            p808_mos = self.p808_onnx_sess.run(None, p808_oi)[0][0][0]
            mos_sig_raw, mos_bak_raw, mos_ovr_raw = self.onnx_sess.run(None, oi)[0][0]
            # Calibrate raw outputs; the raw values are not reported.
            mos_sig, mos_bak, mos_ovr = self.get_polyfit_val(mos_sig_raw, mos_bak_raw, mos_ovr_raw)
            predicted_mos_sig_seg.append(mos_sig)
            predicted_mos_bak_seg.append(mos_bak)
            predicted_mos_ovr_seg.append(mos_ovr)
            predicted_p808_mos.append(p808_mos)

        results = {}
        results['OVRL'] = np.mean(predicted_mos_ovr_seg)
        results['SIG'] = np.mean(predicted_mos_sig_seg)
        results['BAK'] = np.mean(predicted_mos_bak_seg)
        results['P808_MOS'] = np.mean(predicted_p808_mos)
        return results
scores/fwsegsnr.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import librosa
2
+ import numpy as np
3
+ from basis import ScoreBasis
4
+
5
class FWSEGSNR(ScoreBasis):
    """Frequency-weighted segmental SNR computed on a mel filter bank."""

    def __init__(self):
        super(FWSEGSNR, self).__init__(name='FWSEGSNR')
        self.intrusive = False

    def windowed_scoring(self, audios, score_rate):
        if len(audios) != 2:
            raise ValueError('FWSEGSNR needs a reference and a test signals.')
        # Note the order: fwsegsnr(reference, test, rate).
        return fwsegsnr(audios[1], audios[0], score_rate)


def fwsegsnr(x, y, fs, frame_sz=0.025, shift_sz=0.01, win='hann', numband=23):
    """Frequency-weighted segmental SNR between reference ``x`` and test ``y``.

    Parameters
    ----------
    x, y : 1-D arrays of equal length (reference, test)
    fs : sampling rate in Hz
    frame_sz, shift_sz : analysis frame length / hop in seconds
    win : STFT window name
    numband : number of mel bands

    Returns the mean per-frame SNR, with each frame clipped to [-10, 35] dB.
    """
    epsilon = np.finfo(np.float32).eps
    frame = int(np.fix(frame_sz * fs))
    shift = int(np.fix(shift_sz * fs))
    fftpt = int(2 ** np.ceil(np.log2(np.abs(frame))))
    # Normalize both signals to unit energy before comparing spectra.
    x = x / np.sqrt(sum(np.power(x, 2)))
    y = y / np.sqrt(sum(np.power(y, 2)))

    # BUGFIX: the assertion message was `print(...)`, which evaluates to
    # None on failure; use the string itself.
    assert len(x) == len(y), 'Wav length are not matched!'
    X_stft = np.abs(librosa.stft(x, n_fft=fftpt, hop_length=shift, win_length=frame, window=win, center=False))
    Y_stft = np.abs(librosa.stft(y, n_fft=fftpt, hop_length=shift, win_length=frame, window=win, center=False))

    X_mel = librosa.feature.melspectrogram(S=X_stft, sr=fs, n_mels=numband, fmin=0, fmax=fs/2)
    Y_mel = librosa.feature.melspectrogram(S=Y_stft, sr=fs, n_mels=numband, fmin=0, fmax=fs/2)

    # Per-band SNR, weighted by compressed test-signal energy W.
    W = np.power(Y_mel, 0.2)
    E = X_mel - Y_mel
    # Guard against a zero error spectrum before dividing.
    E[E == 0.0] = epsilon
    Y_div_E = np.divide(np.power(Y_mel, 2), np.power(E, 2))
    Y_div_E[Y_div_E == 0] = epsilon
    ds = 10 * np.divide(np.sum(np.multiply(W, np.log10(Y_div_E)), 1), np.sum(W, 1))
    # Clip each frame's SNR to the conventional [-10, 35] dB range.
    ds[ds > 35] = 35
    ds[ds < -10] = -10
    return np.mean(ds)
49
+
scores/helper.py ADDED
@@ -0,0 +1,307 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Modifications in Metrics
3
+
4
+ # Original copyright:
5
+ # Copyright (c) Facebook, Inc. and its affiliates.
6
+ # Demucs (https://github.com/facebookresearch/denoiser) / author: adefossez
7
+ """
8
+ import numpy as np
9
+ from scipy.linalg import toeplitz
10
+
11
+ # ----------------------------- HELPERS ------------------------------------ #
12
def trim_mos(val):
    """Clamp a composite MOS-style value to the valid [1, 5] range."""
    if val < 1:
        return 1
    if val > 5:
        return 5
    return val
14
+
15
def lpcoeff(speech_frame, model_order):
    """Linear-prediction analysis of one speech frame via Levinson-Durbin.

    Parameters
    ----------
    speech_frame : 1-D array of samples (windowed frame).
    model_order : int, LP model order.

    Returns
    -------
    (acorr, refcoeff, lpparams) : float32 arrays — autocorrelation lags
    R[0..model_order], reflection coefficients, and LP parameters
    [1, -a_1, ..., -a_p].
    """
    # (1) Compute autocorrelation lags R[0..model_order].
    winlength = speech_frame.shape[0]
    R = []
    for k in range(model_order + 1):
        first = speech_frame[:(winlength - k)]
        second = speech_frame[k:winlength]
        R.append(np.sum(first * second))

    # (2) Levinson-Durbin recursion.
    a = np.ones((model_order,))
    E = np.zeros((model_order + 1,))
    rcoeff = np.zeros((model_order,))
    E[0] = R[0]
    for i in range(model_order):
        if i == 0:
            sum_term = 0
        else:
            # NOTE(review): a_past is a *view* into `a`; the RHS below is
            # evaluated before assignment, but confirm the in-place update
            # matches the intended Levinson recursion for orders > 2.
            a_past = a[:i]
            sum_term = np.sum(a_past * np.array(R[i:0:-1]))
        rcoeff[i] = (R[i+1] - sum_term)/E[i]
        a[i] = rcoeff[i]
        if i > 0:
            a[:i] = a_past[:i] - rcoeff[i] * a_past[::-1]
        # Prediction-error energy update.
        E[i+1] = (1-rcoeff[i]*rcoeff[i])*E[i]
    acorr = np.array(R, dtype=np.float32)
    refcoeff = np.array(rcoeff, dtype=np.float32)
    # Negate so lpparams is [1, -a_1, ..., -a_p], the usual LPC convention.
    a = a * -1
    lpparams = np.array([1] + list(a), dtype=np.float32)
    acorr = np.array(acorr, dtype=np.float32)
    refcoeff = np.array(refcoeff, dtype=np.float32)
    lpparams = np.array(lpparams, dtype=np.float32)

    return acorr, refcoeff, lpparams
49
+ # -------------------------------------------------------------------------- #
50
+
51
+
52
def SSNR(ref_wav, deg_wav, srate=16000, eps=1e-10):
    """ Segmental Signal-to-Noise Ratio Objective Speech Quality Measure
    This function implements the segmental signal-to-noise ratio
    as defined in [1, p. 45] (see Equation 2.12).

    Returns (overall_snr, segmental_snr): the global SNR in dB and a list
    of per-frame SNRs, each clipped to [-10, 35] dB.
    """
    clean_speech = ref_wav
    processed_speech = deg_wav
    clean_length = ref_wav.shape[0]
    processed_length = deg_wav.shape[0]

    # scale both to have same dynamic range. Remove DC too.
    # NOTE(review): these in-place operations mutate the caller's
    # ref_wav/deg_wav when they are numpy arrays (clean_speech aliases
    # ref_wav) — confirm callers do not reuse the inputs afterwards.
    clean_speech -= clean_speech.mean()
    processed_speech -= processed_speech.mean()
    processed_speech *= (np.max(np.abs(clean_speech)) / np.max(np.abs(processed_speech)))

    # Overall Signal-to-Noise Ratio (computed on the original arrays).
    dif = ref_wav - deg_wav
    overall_snr = 10 * np.log10(np.sum(ref_wav ** 2) / (np.sum(dif ** 2) +
                                                        10e-20))
    # Framing parameters: 30 ms windows with 75% overlap.
    winlength = int(np.round(30 * srate / 1000)) # 30 msecs
    skiprate = winlength // 4
    MIN_SNR = -10
    MAX_SNR = 35

    # For each frame, calculate SSNR
    num_frames = int(clean_length / skiprate - (winlength/skiprate))
    start = 0
    # Hanning window built explicitly (matches the MATLAB reference code).
    time = np.linspace(1, winlength, winlength) / (winlength + 1)
    window = 0.5 * (1 - np.cos(2 * np.pi * time))
    segmental_snr = []

    for frame_count in range(int(num_frames)):
        # (1) get the frames for the test and ref speech.
        # Apply Hanning Window
        clean_frame = clean_speech[start:start+winlength]
        processed_frame = processed_speech[start:start+winlength]
        clean_frame = clean_frame * window
        processed_frame = processed_frame * window

        # (2) Compute Segmental SNR, clipped to [MIN_SNR, MAX_SNR].
        signal_energy = np.sum(clean_frame ** 2)
        noise_energy = np.sum((clean_frame - processed_frame) ** 2)
        segmental_snr.append(10 * np.log10(signal_energy / (noise_energy + eps)+ eps))
        segmental_snr[-1] = max(segmental_snr[-1], MIN_SNR)
        segmental_snr[-1] = min(segmental_snr[-1], MAX_SNR)
        start += int(skiprate)
    return overall_snr, segmental_snr
100
+
101
+
102
+ def wss(ref_wav, deg_wav, srate):
103
+ clean_speech = ref_wav
104
+ processed_speech = deg_wav
105
+ clean_length = ref_wav.shape[0]
106
+ processed_length = deg_wav.shape[0]
107
+
108
+ assert clean_length == processed_length, clean_length
109
+
110
+ winlength = round(30 * srate / 1000.) # 240 wlen in samples
111
+ skiprate = np.floor(winlength / 4)
112
+ max_freq = srate / 2
113
+ num_crit = 25 # num of critical bands
114
+
115
+ USE_FFT_SPECTRUM = 1
116
+ n_fft = int(2 ** np.ceil(np.log(2*winlength)/np.log(2)))
117
+ n_fftby2 = int(n_fft / 2)
118
+ Kmax = 20
119
+ Klocmax = 1
120
+
121
+ # Critical band filter definitions (Center frequency and BW in Hz)
122
+ cent_freq = [50., 120, 190, 260, 330, 400, 470, 540, 617.372,
123
+ 703.378, 798.717, 904.128, 1020.38, 1148.30,
124
+ 1288.72, 1442.54, 1610.70, 1794.16, 1993.93,
125
+ 2211.08, 2446.71, 2701.97, 2978.04, 3276.17,
126
+ 3597.63]
127
+ bandwidth = [70., 70, 70, 70, 70, 70, 70, 77.3724, 86.0056,
128
+ 95.3398, 105.411, 116.256, 127.914, 140.423,
129
+ 153.823, 168.154, 183.457, 199.776, 217.153,
130
+ 235.631, 255.255, 276.072, 298.126, 321.465,
131
+ 346.136]
132
+
133
+ bw_min = bandwidth[0] # min critical bandwidth
134
+
135
+ # set up critical band filters. Note here that Gaussianly shaped filters
136
+ # are used. Also, the sum of the filter weights are equivalent for each
137
+ # critical band filter. Filter less than -30 dB and set to zero.
138
+ min_factor = np.exp(-30. / (2 * 2.303)) # -30 dB point of filter
139
+
140
+ crit_filter = np.zeros((num_crit, n_fftby2))
141
+ all_f0 = []
142
+ for i in range(num_crit):
143
+ f0 = (cent_freq[i] / max_freq) * (n_fftby2)
144
+ all_f0.append(np.floor(f0))
145
+ bw = (bandwidth[i] / max_freq) * (n_fftby2)
146
+ norm_factor = np.log(bw_min) - np.log(bandwidth[i])
147
+ j = list(range(n_fftby2))
148
+ crit_filter[i, :] = np.exp(-11 * (((j - np.floor(f0)) / bw) ** 2) + \
149
+ norm_factor)
150
+ crit_filter[i, :] = crit_filter[i, :] * (crit_filter[i, :] > \
151
+ min_factor)
152
+
153
+ # For each frame of input speech, compute Weighted Spectral Slope Measure
154
+ num_frames = int(clean_length / skiprate - (winlength / skiprate))
155
+ start = 0 # starting sample
156
+ time = np.linspace(1, winlength, winlength) / (winlength + 1)
157
+ window = 0.5 * (1 - np.cos(2 * np.pi * time))
158
+ distortion = []
159
+
160
+ for frame_count in range(num_frames):
161
+ # (1) Get the Frames for the test and reference speeech.
162
+ # Multiply by Hanning window.
163
+ clean_frame = clean_speech[start:start+winlength]
164
+ processed_frame = processed_speech[start:start+winlength]
165
+ clean_frame = clean_frame * window
166
+ processed_frame = processed_frame * window
167
+
168
+ # (2) Compuet Power Spectrum of clean and processed
169
+ clean_spec = (np.abs(np.fft.fft(clean_frame, n_fft)) ** 2)
170
+ processed_spec = (np.abs(np.fft.fft(processed_frame, n_fft)) ** 2)
171
+ clean_energy = [None] * num_crit
172
+ processed_energy = [None] * num_crit
173
+
174
+ # (3) Compute Filterbank output energies (in dB)
175
+ for i in range(num_crit):
176
+ clean_energy[i] = np.sum(clean_spec[:n_fftby2] * \
177
+ crit_filter[i, :])
178
+ processed_energy[i] = np.sum(processed_spec[:n_fftby2] * \
179
+ crit_filter[i, :])
180
+ clean_energy = np.array(clean_energy).reshape(-1, 1)
181
+ eps = np.ones((clean_energy.shape[0], 1)) * 1e-10
182
+ clean_energy = np.concatenate((clean_energy, eps), axis=1)
183
+ clean_energy = 10 * np.log10(np.max(clean_energy, axis=1))
184
+ processed_energy = np.array(processed_energy).reshape(-1, 1)
185
+ processed_energy = np.concatenate((processed_energy, eps), axis=1)
186
+ processed_energy = 10 * np.log10(np.max(processed_energy, axis=1))
187
+
188
+ # (4) Compute Spectral Shape (dB[i+1] - dB[i])
189
+ clean_slope = clean_energy[1:num_crit] - clean_energy[:num_crit-1]
190
+ processed_slope = processed_energy[1:num_crit] - \
191
+ processed_energy[:num_crit-1]
192
+
193
+ # (5) Find the nearest peak locations in the spectra to each
194
+ # critical band. If the slope is negative, we search
195
+ # to the left. If positive, we search to the right.
196
+ clean_loc_peak = []
197
+ processed_loc_peak = []
198
+ for i in range(num_crit - 1):
199
+ if clean_slope[i] > 0:
200
+ # search to the right
201
+ n = i
202
+ while n < num_crit - 1 and clean_slope[n] > 0:
203
+ n += 1
204
+ clean_loc_peak.append(clean_energy[n - 1])
205
+ else:
206
+ # search to the left
207
+ n = i
208
+ while n >= 0 and clean_slope[n] <= 0:
209
+ n -= 1
210
+ clean_loc_peak.append(clean_energy[n + 1])
211
+ # find the peaks in the processed speech signal
212
+ if processed_slope[i] > 0:
213
+ n = i
214
+ while n < num_crit - 1 and processed_slope[n] > 0:
215
+ n += 1
216
+ processed_loc_peak.append(processed_energy[n - 1])
217
+ else:
218
+ n = i
219
+ while n >= 0 and processed_slope[n] <= 0:
220
+ n -= 1
221
+ processed_loc_peak.append(processed_energy[n + 1])
222
+
223
+ # (6) Compuet the WSS Measure for this frame. This includes
224
+ # determination of the weighting functino
225
+ dBMax_clean = max(clean_energy)
226
+ dBMax_processed = max(processed_energy)
227
+
228
+ # The weights are calculated by averaging individual
229
+ # weighting factors from the clean and processed frame.
230
+ # These weights W_clean and W_processed should range
231
+ # from 0 to 1 and place more emphasis on spectral
232
+ # peaks and less emphasis on slope differences in spectral
233
+ # valleys. This procedure is described on page 1280 of
234
+ # Klatt's 1982 ICASSP paper.
235
+ clean_loc_peak = np.array(clean_loc_peak)
236
+ processed_loc_peak = np.array(processed_loc_peak)
237
+ Wmax_clean = Kmax / (Kmax + dBMax_clean - clean_energy[:num_crit-1])
238
+ Wlocmax_clean = Klocmax / (Klocmax + clean_loc_peak - \
239
+ clean_energy[:num_crit-1])
240
+ W_clean = Wmax_clean * Wlocmax_clean
241
+ Wmax_processed = Kmax / (Kmax + dBMax_processed - \
242
+ processed_energy[:num_crit-1])
243
+ Wlocmax_processed = Klocmax / (Klocmax + processed_loc_peak - \
244
+ processed_energy[:num_crit-1])
245
+ W_processed = Wmax_processed * Wlocmax_processed
246
+ W = (W_clean + W_processed) / 2
247
+ distortion.append(np.sum(W * (clean_slope[:num_crit - 1] - \
248
+ processed_slope[:num_crit - 1]) ** 2))
249
+
250
+ # this normalization is not part of Klatt's paper, but helps
251
+ # to normalize the meaasure. Here we scale the measure by the sum of the
252
+ # weights
253
+ distortion[frame_count] = distortion[frame_count] / np.sum(W)
254
+ start += int(skiprate)
255
+ return distortion
256
+
257
+
258
+ def llr(ref_wav, deg_wav, srate):
259
+ clean_speech = ref_wav
260
+ processed_speech = deg_wav
261
+ clean_length = ref_wav.shape[0]
262
+ processed_length = deg_wav.shape[0]
263
+ assert clean_length == processed_length, clean_length
264
+
265
+ winlength = round(30 * srate / 1000.) # 240 wlen in samples
266
+ skiprate = np.floor(winlength / 4)
267
+ if srate < 10000:
268
+ # LPC analysis order
269
+ P = 10
270
+ else:
271
+ P = 16
272
+
273
+ # For each frame of input speech, calculate the Log Likelihood Ratio
274
+ num_frames = int(clean_length / skiprate - (winlength / skiprate))
275
+ start = 0
276
+ time = np.linspace(1, winlength, winlength) / (winlength + 1)
277
+ window = 0.5 * (1 - np.cos(2 * np.pi * time))
278
+ distortion = []
279
+
280
+ for frame_count in range(num_frames):
281
+ # (1) Get the Frames for the test and reference speeech.
282
+ # Multiply by Hanning window.
283
+ clean_frame = clean_speech[start:start+winlength]
284
+ processed_frame = processed_speech[start:start+winlength]
285
+ clean_frame = clean_frame * window
286
+ processed_frame = processed_frame * window
287
+
288
+ # (2) Get the autocorrelation logs and LPC params used
289
+ # to compute the LLR measure
290
+ R_clean, Ref_clean, A_clean = lpcoeff(clean_frame, P)
291
+ R_processed, Ref_processed, A_processed = lpcoeff(processed_frame, P)
292
+ A_clean = A_clean[None, :]
293
+ A_processed = A_processed[None, :]
294
+
295
+ # (3) Compute the LLR measure
296
+ numerator = A_processed.dot(toeplitz(R_clean)).dot(A_processed.T)
297
+ denominator = A_clean.dot(toeplitz(R_clean)).dot(A_clean.T)
298
+
299
+ if (numerator/denominator) <= 0:
300
+ print(f'Numerator: {numerator}')
301
+ print(f'Denominator: {denominator}')
302
+
303
+ log_ = np.log(numerator / denominator)
304
+ distortion.append(np.squeeze(log_))
305
+ start += int(skiprate)
306
+ return np.nan_to_num(np.array(distortion))
307
+ # -------------------------------------------------------------------------- #
scores/helper_bk.py ADDED
@@ -0,0 +1,438 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Modifications in Metrics
3
+
4
+ # Original copyright:
5
+ # Copyright (c) Facebook, Inc. and its affiliates.
6
+ # Demucs (https://github.com/facebookresearch/denoiser) / author: adefossez
7
+ """
8
+ import numpy as np
9
+ from scipy.linalg import toeplitz
10
+
11
+ # ----------------------------- HELPERS ------------------------------------ #
12
+ def trim_mos(val):
13
+ return min(max(val, 1), 5)
14
+
15
+ def lpcoeff(speech_frame, model_order):
16
+ # (1) Compute Autocor lags
17
+ winlength = speech_frame.shape[0]
18
+ R = []
19
+ for k in range(model_order + 1):
20
+ first = speech_frame[:(winlength - k)]
21
+ second = speech_frame[k:winlength]
22
+ R.append(np.sum(first * second))
23
+
24
+ # (2) Lev-Durbin
25
+ a = np.ones((model_order,))
26
+ E = np.zeros((model_order + 1,))
27
+ rcoeff = np.zeros((model_order,))
28
+ E[0] = R[0]
29
+ for i in range(model_order):
30
+ if i == 0:
31
+ sum_term = 0
32
+ else:
33
+ a_past = a[:i]
34
+ sum_term = np.sum(a_past * np.array(R[i:0:-1]))
35
+ rcoeff[i] = (R[i+1] - sum_term)/E[i]
36
+ a[i] = rcoeff[i]
37
+ if i > 0:
38
+ a[:i] = a_past[:i] - rcoeff[i] * a_past[::-1]
39
+ E[i+1] = (1-rcoeff[i]*rcoeff[i])*E[i]
40
+ acorr = np.array(R, dtype=np.float32)
41
+ refcoeff = np.array(rcoeff, dtype=np.float32)
42
+ a = a * -1
43
+ lpparams = np.array([1] + list(a), dtype=np.float32)
44
+ acorr = np.array(acorr, dtype=np.float32)
45
+ refcoeff = np.array(refcoeff, dtype=np.float32)
46
+ lpparams = np.array(lpparams, dtype=np.float32)
47
+
48
+ return acorr, refcoeff, lpparams
49
+ # -------------------------------------------------------------------------- #
50
+
51
+
52
+ def SSNR(ref_wav, deg_wav, srate=16000, eps=1e-10):
53
+ """ Segmental Signal-to-Noise Ratio Objective Speech Quality Measure
54
+ This function implements the segmental signal-to-noise ratio
55
+ as defined in [1, p. 45] (see Equation 2.12).
56
+ """
57
+ clean_speech = ref_wav
58
+ processed_speech = deg_wav
59
+ clean_length = ref_wav.shape[0]
60
+ processed_length = deg_wav.shape[0]
61
+
62
+ # scale both to have same dynamic range. Remove DC too.
63
+ clean_speech -= clean_speech.mean()
64
+ processed_speech -= processed_speech.mean()
65
+ processed_speech *= (np.max(np.abs(clean_speech)) / np.max(np.abs(processed_speech)))
66
+
67
+ # Signal-to-Noise Ratio
68
+ dif = ref_wav - deg_wav
69
+ overall_snr = 10 * np.log10(np.sum(ref_wav ** 2) / (np.sum(dif ** 2) +
70
+ 10e-20))
71
+ # global variables
72
+ winlength = int(np.round(30 * srate / 1000)) # 30 msecs
73
+ skiprate = winlength // 4
74
+ MIN_SNR = -10
75
+ MAX_SNR = 35
76
+
77
+ # For each frame, calculate SSNR
78
+ num_frames = int(clean_length / skiprate - (winlength/skiprate))
79
+ start = 0
80
+ time = np.linspace(1, winlength, winlength) / (winlength + 1)
81
+ window = 0.5 * (1 - np.cos(2 * np.pi * time))
82
+ segmental_snr = []
83
+
84
+ for frame_count in range(int(num_frames)):
85
+ # (1) get the frames for the test and ref speech.
86
+ # Apply Hanning Window
87
+ clean_frame = clean_speech[start:start+winlength]
88
+ processed_frame = processed_speech[start:start+winlength]
89
+ clean_frame = clean_frame * window
90
+ processed_frame = processed_frame * window
91
+
92
+ # (2) Compute Segmental SNR
93
+ signal_energy = np.sum(clean_frame ** 2)
94
+ noise_energy = np.sum((clean_frame - processed_frame) ** 2)
95
+ segmental_snr.append(10 * np.log10(signal_energy / (noise_energy + eps)+ eps))
96
+ segmental_snr[-1] = max(segmental_snr[-1], MIN_SNR)
97
+ segmental_snr[-1] = min(segmental_snr[-1], MAX_SNR)
98
+ start += int(skiprate)
99
+ return overall_snr, segmental_snr
100
+
101
+
102
+ def wss(ref_wav, deg_wav, srate):
103
+ clean_speech = ref_wav
104
+ processed_speech = deg_wav
105
+ clean_length = ref_wav.shape[0]
106
+ processed_length = deg_wav.shape[0]
107
+
108
+ assert clean_length == processed_length, clean_length
109
+
110
+ winlength = round(30 * srate / 1000.) # 240 wlen in samples
111
+ skiprate = np.floor(winlength / 4)
112
+ max_freq = srate / 2
113
+ num_crit = 25 # num of critical bands
114
+
115
+ USE_FFT_SPECTRUM = 1
116
+ n_fft = int(2 ** np.ceil(np.log(2*winlength)/np.log(2)))
117
+ n_fftby2 = int(n_fft / 2)
118
+ Kmax = 20
119
+ Klocmax = 1
120
+
121
+ # Critical band filter definitions (Center frequency and BW in Hz)
122
+ cent_freq = [50., 120, 190, 260, 330, 400, 470, 540, 617.372,
123
+ 703.378, 798.717, 904.128, 1020.38, 1148.30,
124
+ 1288.72, 1442.54, 1610.70, 1794.16, 1993.93,
125
+ 2211.08, 2446.71, 2701.97, 2978.04, 3276.17,
126
+ 3597.63]
127
+ bandwidth = [70., 70, 70, 70, 70, 70, 70, 77.3724, 86.0056,
128
+ 95.3398, 105.411, 116.256, 127.914, 140.423,
129
+ 153.823, 168.154, 183.457, 199.776, 217.153,
130
+ 235.631, 255.255, 276.072, 298.126, 321.465,
131
+ 346.136]
132
+
133
+ bw_min = bandwidth[0] # min critical bandwidth
134
+
135
+ # set up critical band filters. Note here that Gaussianly shaped filters
136
+ # are used. Also, the sum of the filter weights are equivalent for each
137
+ # critical band filter. Filter less than -30 dB and set to zero.
138
+ min_factor = np.exp(-30. / (2 * 2.303)) # -30 dB point of filter
139
+
140
+ crit_filter = np.zeros((num_crit, n_fftby2))
141
+ all_f0 = []
142
+ for i in range(num_crit):
143
+ f0 = (cent_freq[i] / max_freq) * (n_fftby2)
144
+ all_f0.append(np.floor(f0))
145
+ bw = (bandwidth[i] / max_freq) * (n_fftby2)
146
+ norm_factor = np.log(bw_min) - np.log(bandwidth[i])
147
+ j = list(range(n_fftby2))
148
+ crit_filter[i, :] = np.exp(-11 * (((j - np.floor(f0)) / bw) ** 2) + \
149
+ norm_factor)
150
+ crit_filter[i, :] = crit_filter[i, :] * (crit_filter[i, :] > \
151
+ min_factor)
152
+
153
+ # For each frame of input speech, compute Weighted Spectral Slope Measure
154
+ num_frames = int(clean_length / skiprate - (winlength / skiprate))
155
+ start = 0 # starting sample
156
+ time = np.linspace(1, winlength, winlength) / (winlength + 1)
157
+ window = 0.5 * (1 - np.cos(2 * np.pi * time))
158
+ distortion = []
159
+
160
+ for frame_count in range(num_frames):
161
+ # (1) Get the Frames for the test and reference speeech.
162
+ # Multiply by Hanning window.
163
+ clean_frame = clean_speech[start:start+winlength]
164
+ processed_frame = processed_speech[start:start+winlength]
165
+ clean_frame = clean_frame * window
166
+ processed_frame = processed_frame * window
167
+
168
+ # (2) Compuet Power Spectrum of clean and processed
169
+ clean_spec = (np.abs(np.fft.fft(clean_frame, n_fft)) ** 2)
170
+ processed_spec = (np.abs(np.fft.fft(processed_frame, n_fft)) ** 2)
171
+ clean_energy = [None] * num_crit
172
+ processed_energy = [None] * num_crit
173
+
174
+ # (3) Compute Filterbank output energies (in dB)
175
+ for i in range(num_crit):
176
+ clean_energy[i] = np.sum(clean_spec[:n_fftby2] * \
177
+ crit_filter[i, :])
178
+ processed_energy[i] = np.sum(processed_spec[:n_fftby2] * \
179
+ crit_filter[i, :])
180
+ clean_energy = np.array(clean_energy).reshape(-1, 1)
181
+ eps = np.ones((clean_energy.shape[0], 1)) * 1e-10
182
+ clean_energy = np.concatenate((clean_energy, eps), axis=1)
183
+ clean_energy = 10 * np.log10(np.max(clean_energy, axis=1))
184
+ processed_energy = np.array(processed_energy).reshape(-1, 1)
185
+ processed_energy = np.concatenate((processed_energy, eps), axis=1)
186
+ processed_energy = 10 * np.log10(np.max(processed_energy, axis=1))
187
+
188
+ # (4) Compute Spectral Shape (dB[i+1] - dB[i])
189
+ clean_slope = clean_energy[1:num_crit] - clean_energy[:num_crit-1]
190
+ processed_slope = processed_energy[1:num_crit] - \
191
+ processed_energy[:num_crit-1]
192
+
193
+ # (5) Find the nearest peak locations in the spectra to each
194
+ # critical band. If the slope is negative, we search
195
+ # to the left. If positive, we search to the right.
196
+ clean_loc_peak = []
197
+ processed_loc_peak = []
198
+ for i in range(num_crit - 1):
199
+ if clean_slope[i] > 0:
200
+ # search to the right
201
+ n = i
202
+ while n < num_crit - 1 and clean_slope[n] > 0:
203
+ n += 1
204
+ clean_loc_peak.append(clean_energy[n - 1])
205
+ else:
206
+ # search to the left
207
+ n = i
208
+ while n >= 0 and clean_slope[n] <= 0:
209
+ n -= 1
210
+ clean_loc_peak.append(clean_energy[n + 1])
211
+ # find the peaks in the processed speech signal
212
+ if processed_slope[i] > 0:
213
+ n = i
214
+ while n < num_crit - 1 and processed_slope[n] > 0:
215
+ n += 1
216
+ processed_loc_peak.append(processed_energy[n - 1])
217
+ else:
218
+ n = i
219
+ while n >= 0 and processed_slope[n] <= 0:
220
+ n -= 1
221
+ processed_loc_peak.append(processed_energy[n + 1])
222
+
223
+ # (6) Compuet the WSS Measure for this frame. This includes
224
+ # determination of the weighting functino
225
+ dBMax_clean = max(clean_energy)
226
+ dBMax_processed = max(processed_energy)
227
+
228
+ # The weights are calculated by averaging individual
229
+ # weighting factors from the clean and processed frame.
230
+ # These weights W_clean and W_processed should range
231
+ # from 0 to 1 and place more emphasis on spectral
232
+ # peaks and less emphasis on slope differences in spectral
233
+ # valleys. This procedure is described on page 1280 of
234
+ # Klatt's 1982 ICASSP paper.
235
+ clean_loc_peak = np.array(clean_loc_peak)
236
+ processed_loc_peak = np.array(processed_loc_peak)
237
+ Wmax_clean = Kmax / (Kmax + dBMax_clean - clean_energy[:num_crit-1])
238
+ Wlocmax_clean = Klocmax / (Klocmax + clean_loc_peak - \
239
+ clean_energy[:num_crit-1])
240
+ W_clean = Wmax_clean * Wlocmax_clean
241
+ Wmax_processed = Kmax / (Kmax + dBMax_processed - \
242
+ processed_energy[:num_crit-1])
243
+ Wlocmax_processed = Klocmax / (Klocmax + processed_loc_peak - \
244
+ processed_energy[:num_crit-1])
245
+ W_processed = Wmax_processed * Wlocmax_processed
246
+ W = (W_clean + W_processed) / 2
247
+ distortion.append(np.sum(W * (clean_slope[:num_crit - 1] - \
248
+ processed_slope[:num_crit - 1]) ** 2))
249
+
250
+ # this normalization is not part of Klatt's paper, but helps
251
+ # to normalize the meaasure. Here we scale the measure by the sum of the
252
+ # weights
253
+ distortion[frame_count] = distortion[frame_count] / np.sum(W)
254
+ start += int(skiprate)
255
+ return distortion
256
+
257
+
258
+ def llr(ref_wav, deg_wav, srate):
259
+ clean_speech = ref_wav
260
+ processed_speech = deg_wav
261
+ clean_length = ref_wav.shape[0]
262
+ processed_length = deg_wav.shape[0]
263
+ assert clean_length == processed_length, clean_length
264
+
265
+ winlength = round(30 * srate / 1000.) # 240 wlen in samples
266
+ skiprate = np.floor(winlength / 4)
267
+ if srate < 10000:
268
+ # LPC analysis order
269
+ P = 10
270
+ else:
271
+ P = 16
272
+
273
+ # For each frame of input speech, calculate the Log Likelihood Ratio
274
+ num_frames = int(clean_length / skiprate - (winlength / skiprate))
275
+ start = 0
276
+ time = np.linspace(1, winlength, winlength) / (winlength + 1)
277
+ window = 0.5 * (1 - np.cos(2 * np.pi * time))
278
+ distortion = []
279
+
280
+ for frame_count in range(num_frames):
281
+ # (1) Get the Frames for the test and reference speeech.
282
+ # Multiply by Hanning window.
283
+ clean_frame = clean_speech[start:start+winlength]
284
+ processed_frame = processed_speech[start:start+winlength]
285
+ clean_frame = clean_frame * window
286
+ processed_frame = processed_frame * window
287
+
288
+ # (2) Get the autocorrelation logs and LPC params used
289
+ # to compute the LLR measure
290
+ R_clean, Ref_clean, A_clean = lpcoeff(clean_frame, P)
291
+ R_processed, Ref_processed, A_processed = lpcoeff(processed_frame, P)
292
+ A_clean = A_clean[None, :]
293
+ A_processed = A_processed[None, :]
294
+
295
+ # (3) Compute the LLR measure
296
+ numerator = A_processed.dot(toeplitz(R_clean)).dot(A_processed.T)
297
+ denominator = A_clean.dot(toeplitz(R_clean)).dot(A_clean.T)
298
+
299
+ if (numerator/denominator) <= 0:
300
+ print(f'Numerator: {numerator}')
301
+ print(f'Denominator: {denominator}')
302
+
303
+ log_ = np.log(numerator / denominator)
304
+ distortion.append(np.squeeze(log_))
305
+ start += int(skiprate)
306
+ return np.nan_to_num(np.array(distortion))
307
+ # -------------------------------------------------------------------------- #
308
+
309
+ #!/usr/bin/env python3
310
+
311
+ # Copyright 2020 Wen-Chin Huang and Tomoki Hayashi
312
+ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
313
+ # ported from https://github.com/espnet/espnet/blob/master/utils/mcd_calculate.py
314
+
315
+ """Evaluate MCD between generated and groundtruth audios with SPTK-based mcep."""
316
+
317
+ from typing import Tuple
318
+
319
+ import numpy as np
320
+ import pysptk
321
+ from fastdtw import fastdtw
322
+ from scipy import spatial
323
+
324
+
325
+ def sptk_extract(
326
+ x: np.ndarray,
327
+ fs: int,
328
+ n_fft: int = 512,
329
+ n_shift: int = 256,
330
+ mcep_dim: int = 25,
331
+ mcep_alpha: float = 0.41,
332
+ is_padding: bool = False,
333
+ ) -> np.ndarray:
334
+ """Extract SPTK-based mel-cepstrum.
335
+
336
+ Args:
337
+ x (ndarray): 1D waveform array.
338
+ fs (int): Sampling rate
339
+ n_fft (int): FFT length in point (default=512).
340
+ n_shift (int): Shift length in point (default=256).
341
+ mcep_dim (int): Dimension of mel-cepstrum (default=25).
342
+ mcep_alpha (float): All pass filter coefficient (default=0.41).
343
+ is_padding (bool): Whether to pad the end of signal (default=False).
344
+
345
+ Returns:
346
+ ndarray: Mel-cepstrum with the size (N, n_fft).
347
+
348
+ """
349
+ # perform padding
350
+ if is_padding:
351
+ n_pad = n_fft - (len(x) - n_fft) % n_shift
352
+ x = np.pad(x, (0, n_pad), "reflect")
353
+
354
+ # get number of frames
355
+ n_frame = (len(x) - n_fft) // n_shift + 1
356
+
357
+ # get window function
358
+ win = pysptk.sptk.hamming(n_fft)
359
+
360
+ # check mcep and alpha
361
+ if mcep_dim is None or mcep_alpha is None:
362
+ mcep_dim, mcep_alpha = _get_best_mcep_params(fs)
363
+
364
+ # calculate spectrogram
365
+ mcep = [
366
+ pysptk.mcep(
367
+ x[n_shift * i : n_shift * i + n_fft] * win,
368
+ mcep_dim,
369
+ mcep_alpha,
370
+ eps=1e-6,
371
+ etype=1,
372
+ )
373
+ for i in range(n_frame)
374
+ ]
375
+
376
+ return np.stack(mcep)
377
+
378
+
379
+ def _get_best_mcep_params(fs: int) -> Tuple[int, float]:
380
+ # https://sp-nitech.github.io/sptk/latest/main/mgcep.html#_CPPv4N4sptk19MelCepstralAnalysisE
381
+ if fs == 8000:
382
+ return 13, 0.31
383
+ elif fs == 16000:
384
+ return 23, 0.42
385
+ elif fs == 22050:
386
+ return 34, 0.45
387
+ elif fs == 24000:
388
+ return 34, 0.46
389
+ elif fs == 32000:
390
+ return 36, 0.50
391
+ elif fs == 44100:
392
+ return 39, 0.53
393
+ elif fs == 48000:
394
+ return 39, 0.55
395
+ else:
396
+ raise ValueError(f"Not found the setting for {fs}.")
397
+
398
+
399
+ def calculate_mcd(
400
+ inf_audio,
401
+ ref_audio,
402
+ fs,
403
+ n_fft=1024,
404
+ n_shift=256,
405
+ mcep_dim=None,
406
+ mcep_alpha=None,
407
+ ):
408
+ """Calculate MCD."""
409
+
410
+ # extract ground truth and converted features
411
+ gen_mcep = sptk_extract(
412
+ x=inf_audio,
413
+ fs=fs,
414
+ n_fft=n_fft,
415
+ n_shift=n_shift,
416
+ mcep_dim=mcep_dim,
417
+ mcep_alpha=mcep_alpha,
418
+ )
419
+ gt_mcep = sptk_extract(
420
+ x=ref_audio,
421
+ fs=fs,
422
+ n_fft=n_fft,
423
+ n_shift=n_shift,
424
+ mcep_dim=mcep_dim,
425
+ mcep_alpha=mcep_alpha,
426
+ )
427
+
428
+ # DTW
429
+ _, path = fastdtw(gen_mcep, gt_mcep, dist=spatial.distance.euclidean)
430
+ twf = np.array(path).T
431
+ gen_mcep_dtw = gen_mcep[twf[0]]
432
+ gt_mcep_dtw = gt_mcep[twf[1]]
433
+
434
+ # MCD
435
+ diff2sum = np.sum((gen_mcep_dtw - gt_mcep_dtw) ** 2, 1)
436
+ mcd = np.mean(10.0 / np.log(10.0) * np.sqrt(2 * diff2sum), 0)
437
+
438
+ return mcd
scores/llr.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from basis import ScoreBasis
2
+ import numpy as np
3
+ from scipy.linalg import toeplitz
4
+ from scores.helper import lpcoeff
5
+
6
+ class LLR(ScoreBasis):
7
+ def __init__(self):
8
+ super(LLR, self).__init__(name='LLR')
9
+ self.intrusive = False
10
+
11
+ def windowed_scoring(self, audios, score_rate):
12
+ if len(audios) != 2:
13
+ raise ValueError('LLR needs a reference and a test signals.')
14
+ return cal_LLR(audios[0], audios[1], score_rate)
15
+
16
+ def cal_LLR(ref_wav, deg_wav, srate):
17
+ # obtained from https://github.com/wooseok-shin/MetricGAN-plus-pytorch/blob/main/metric_functions/metric_helper.py
18
+ clean_speech = ref_wav
19
+ processed_speech = deg_wav
20
+ clean_length = ref_wav.shape[0]
21
+ processed_length = deg_wav.shape[0]
22
+ assert clean_length == processed_length, clean_length
23
+
24
+ winlength = round(30 * srate / 1000.) # 240 wlen in samples
25
+ skiprate = np.floor(winlength / 4)
26
+ if srate < 10000:
27
+ # LPC analysis order
28
+ P = 10
29
+ else:
30
+ P = 16
31
+
32
+ # For each frame of input speech, calculate the Log Likelihood Ratio
33
+ num_frames = int(clean_length / skiprate - (winlength / skiprate))
34
+ start = 0
35
+ time = np.linspace(1, winlength, winlength) / (winlength + 1)
36
+ window = 0.5 * (1 - np.cos(2 * np.pi * time))
37
+ distortion = []
38
+
39
+ for frame_count in range(num_frames):
40
+ # (1) Get the Frames for the test and reference speeech.
41
+ # Multiply by Hanning window.
42
+ clean_frame = clean_speech[start:start+winlength]
43
+ processed_frame = processed_speech[start:start+winlength]
44
+ clean_frame = clean_frame * window
45
+ processed_frame = processed_frame * window
46
+
47
+ # (2) Get the autocorrelation logs and LPC params used
48
+ # to compute the LLR measure
49
+ R_clean, Ref_clean, A_clean = lpcoeff(clean_frame, P)
50
+ R_processed, Ref_processed, A_processed = lpcoeff(processed_frame, P)
51
+ A_clean = A_clean[None, :]
52
+ A_processed = A_processed[None, :]
53
+
54
+ # (3) Compute the LLR measure
55
+ numerator = A_processed.dot(toeplitz(R_clean)).dot(A_processed.T)
56
+ denominator = A_clean.dot(toeplitz(R_clean)).dot(A_clean.T)
57
+
58
+ if (numerator/denominator) <= 0:
59
+ print(f'Numerator: {numerator}')
60
+ print(f'Denominator: {denominator}')
61
+
62
+ log_ = np.log(numerator / denominator)
63
+ distortion.append(np.squeeze(log_))
64
+ start += int(skiprate)
65
+ return np.mean(np.nan_to_num(np.array(distortion)))
66
+
scores/lsd.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from basis import ScoreBasis
2
+ import numpy as np
3
+ import librosa
4
+
5
+ EPS = 1e-12
6
+
7
+ class LSD(ScoreBasis):
8
+ def __init__(self):
9
+ super(LSD, self).__init__(name='LSD')
10
+ self.intrusive = False
11
+ self.mono = True
12
+
13
+ def windowed_scoring(self, audios, score_rate):
14
+ if len(audios) != 2:
15
+ raise ValueError('NB_PESQ needs a reference and a test signals.')
16
+ est = wav_to_spectrogram(audios[1], score_rate)
17
+ target = wav_to_spectrogram(audios[0], score_rate)
18
+ return cal_LSD(est, target)
19
+
20
+ def wav_to_spectrogram(wav, rate):
21
+ hop_length = int(rate / 100)
22
+ n_fft = int(2048 / (48000 / rate))
23
+ spec = np.abs(librosa.stft(wav, hop_length=hop_length, n_fft=n_fft))
24
+ spec = np.transpose(spec, (1, 0))
25
+ return spec
26
+
27
+ def cal_LSD(est, target):
28
+ log_ratio = np.log10(target**2 / ((est + EPS) ** 2) + EPS) ** 2
29
+ lsd_ = np.mean(np.mean(log_ratio, axis=1) ** 0.5, axis=0)
30
+ return lsd_
scores/mcd.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from basis import ScoreBasis
2
+ import librosa
3
+ import math
4
+ import numpy as np
5
+ import pyworld
6
+ import pysptk
7
+ from fastdtw import fastdtw
8
+ from scipy.spatial.distance import euclidean
9
+ #from scores.helper import calculate_mcd
10
+ #from pymcd.mcd import Calculate_MCD
11
+ #refer to : https://github.com/chenqi008/pymcd/blob/main/pymcd/mcd.py
12
+ class MCD(ScoreBasis):
13
+ def __init__(self):
14
+ super(MCD, self).__init__(name='MCD')
15
+ self.intrusive = False
16
+ # three different modes "plain", "dtw" and "dtw_sl" for the above three MCD metrics
17
+ self.mcd_toolbox = Calculate_MCD(MCD_mode="plain")
18
+
19
+ def windowed_scoring(self, audios, score_rate):
20
+ if len(audios) != 2:
21
+ raise ValueError('MCD needs a reference and a test signals.')
22
+ return self.mcd_toolbox.calculate_mcd(audios[1], audios[0], score_rate)
23
+
24
+ # ================================================= #
25
+ # calculate the Mel-Cepstral Distortion (MCD) value #
26
+ # ================================================= #
27
+ #refer to : https://github.com/chenqi008/pymcd/blob/main/pymcd/mcd.py
28
+ class Calculate_MCD(object):
29
+ """docstring for Calculate_MCD"""
30
+ def __init__(self, MCD_mode):
31
+ super(Calculate_MCD, self).__init__()
32
+ self.MCD_mode = MCD_mode
33
+ #self.SAMPLING_RATE = 22050
34
+ self.FRAME_PERIOD = 5.0
35
+ self.log_spec_dB_const = 10.0 / math.log(10.0) * math.sqrt(2.0) # 6.141851463713754
36
+
37
+ def load_wav(self, wav_file, sample_rate):
38
+ """
39
+ Load a wav file with librosa.
40
+ :param wav_file: path to wav file
41
+ :param sr: sampling rate
42
+ :return: audio time series numpy array
43
+ """
44
+ wav, _ = librosa.load(wav_file, sr=sample_rate, mono=True)
45
+ return wav
46
+
47
+ # distance metric
48
+ def log_spec_dB_dist(self, x, y):
49
+ # log_spec_dB_const = 10.0 / math.log(10.0) * math.sqrt(2.0)
50
+ diff = x - y
51
+ return self.log_spec_dB_const * math.sqrt(np.inner(diff, diff))
52
+
53
+ # calculate distance (metric)
54
+ # def calculate_mcd_distance(self, x, y, distance, path):
55
+ def calculate_mcd_distance(self, x, y, path):
56
+ '''
57
+ param path: pairs between x and y
58
+ '''
59
+ pathx = list(map(lambda l: l[0], path))
60
+ pathy = list(map(lambda l: l[1], path))
61
+ x, y = x[pathx], y[pathy]
62
+ frames_tot = x.shape[0] # length of pairs
63
+
64
+ z = x - y
65
+ min_cost_tot = np.sqrt((z * z).sum(-1)).sum()
66
+
67
+ return frames_tot, min_cost_tot
68
+
69
+ # extract acoustic features
70
+ # alpha = 0.65 # commonly used at 22050 Hz
71
+ def wav2mcep_numpy(self, loaded_wav, score_rate=22050, alpha=0.65, fft_size=512):
72
+
73
+ # Use WORLD vocoder to spectral envelope
74
+ _, sp, _ = pyworld.wav2world(loaded_wav.astype(np.double), fs=score_rate,
75
+ frame_period=self.FRAME_PERIOD, fft_size=fft_size)
76
+ # Extract MCEP features
77
+ mcep = pysptk.sptk.mcep(sp, order=13, alpha=alpha, maxiter=0,
78
+ etype=1, eps=1.0E-8, min_det=0.0, itype=3)
79
+
80
+ return mcep
81
+
82
+ # calculate the Mel-Cepstral Distortion (MCD) value
83
+ #def average_mcd(self, ref_audio_file, syn_audio_file, cost_function, MCD_mode):
84
+ def average_mcd(self, loaded_ref_wav, loaded_syn_wav, cost_function, MCD_mode, score_rate):
85
+ """
86
+ Calculate the average MCD.
87
+ :param ref_mcep_files: list of strings, paths to MCEP target reference files
88
+ :param synth_mcep_files: list of strings, paths to MCEP converted synthesised files
89
+ :param cost_function: distance metric used
90
+ :param plain: if plain=True, use Dynamic Time Warping (dtw)
91
+ :returns: average MCD, total frames processed
92
+ """
93
+ # load wav from given wav file
94
+ #loaded_ref_wav = self.load_wav(ref_audio_file, sample_rate=self.SAMPLING_RATE)
95
+ #loaded_syn_wav = self.load_wav(syn_audio_file, sample_rate=self.SAMPLING_RATE)
96
+
97
+ if MCD_mode == "plain":
98
+ # pad 0
99
+ if len(loaded_ref_wav)<len(loaded_syn_wav):
100
+ loaded_ref_wav = np.pad(loaded_ref_wav, (0, len(loaded_syn_wav)-len(loaded_ref_wav)))
101
+ else:
102
+ loaded_syn_wav = np.pad(loaded_syn_wav, (0, len(loaded_ref_wav)-len(loaded_syn_wav)))
103
+
104
+ # extract MCEP features (vectors): 2D matrix (num x mcep_size)
105
+ ref_mcep_vec = self.wav2mcep_numpy(loaded_ref_wav, score_rate)
106
+ syn_mcep_vec = self.wav2mcep_numpy(loaded_syn_wav, score_rate)
107
+
108
+ if MCD_mode == "plain":
109
+ # print("Calculate plain MCD ...")
110
+ path = []
111
+ # for i in range(num_temp):
112
+ for i in range(len(ref_mcep_vec)):
113
+ path.append((i, i))
114
+ elif MCD_mode == "dtw":
115
+ # print("Calculate MCD-dtw ...")
116
+ _, path = fastdtw(ref_mcep_vec[:, 1:], syn_mcep_vec[:, 1:], dist=euclidean)
117
+ elif MCD_mode == "dtw_sl":
118
+ # print("Calculate MCD-dtw-sl ...")
119
+ cof = len(ref_mcep_vec)/len(syn_mcep_vec) if len(ref_mcep_vec)>len(syn_mcep_vec) else len(syn_mcep_vec)/len(ref_mcep_vec)
120
+ _, path = fastdtw(ref_mcep_vec[:, 1:], syn_mcep_vec[:, 1:], dist=euclidean)
121
+
122
+ frames_tot, min_cost_tot = self.calculate_mcd_distance(ref_mcep_vec, syn_mcep_vec, path)
123
+
124
+ if MCD_mode == "dtw_sl":
125
+ mean_mcd = cof * self.log_spec_dB_const * min_cost_tot / frames_tot
126
+ else:
127
+ mean_mcd = self.log_spec_dB_const * min_cost_tot / frames_tot
128
+
129
+ return mean_mcd
130
+
131
+ # calculate mcd
132
+ def calculate_mcd(self, reference_audio, synthesized_audio, score_rate):
133
+ # extract acoustic features
134
+ mean_mcd = self.average_mcd(reference_audio, synthesized_audio, self.log_spec_dB_dist, self.MCD_mode, score_rate)
135
+
136
+ return mean_mcd
scores/mosnet/__init__.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ def load(window, hop=None):
2
+ import tensorflow as tf
3
+ from .model import MOSNet
4
+ tf.debugging.set_log_device_placement(False)
5
+ # set memory growth
6
+ gpus = tf.config.experimental.list_physical_devices('GPU')
7
+ if gpus:
8
+ try:
9
+ # Currently, memory growth needs to be the same across GPUs
10
+ for gpu in gpus:
11
+ tf.config.experimental.set_memory_growth(gpu, True)
12
+
13
+ logical_gpus = tf.config.experimental.list_logical_devices('GPU')
14
+ print(len(gpus), "Physical GPUs,",
15
+ len(logical_gpus), "Logical GPUs")
16
+ except RuntimeError as e:
17
+ # Memory growth must be set before GPUs have been initialized
18
+ print(e)
19
+
20
+ mosnet = MOSNet(window, hop)
21
+ return mosnet
scores/mosnet/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (789 Bytes). View file
 
scores/mosnet/cnn_blstm.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78b75e7d76ee6074ea7d57dcffa56d0c90be9d3d8dedc2217e25e259423cb756
3
+ size 14248464