Spaces:
Running
Running
File size: 4,010 Bytes
936f6fa |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
from tensorflow import keras
from tensorflow.keras import Model, layers
from tensorflow.keras.layers import Dense, Dropout, Conv2D
from tensorflow.keras.layers import LSTM, TimeDistributed, Bidirectional
from tensorflow.keras.constraints import max_norm
import librosa
import scipy
import numpy as np
import os
from ... import Metric
# prevent TF warnings
# NOTE(review): this assignment runs after the tensorflow imports at the top
# of the file; TF_CPP_MIN_LOG_LEVEL generally has to be set before TensorFlow
# is first imported to silence its native start-up messages -- confirm, and
# consider moving this above the imports.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
class MOSNet(Metric):
    """MOSNet metric backed by a pre-trained CNN-BLSTM Keras model.

    The constructor builds the network and loads its weights from the file
    ``cnn_blstm.h5`` located next to this module. ``test_window`` computes a
    magnitude spectrogram of the first signal it is given and returns the
    model's time-averaged output for it.
    """

    def __init__(self, window, hop=None):
        """Set the metric constants, build the network and load its weights.

        Args:
            window: forwarded unchanged to ``Metric.__init__``.
            hop: forwarded unchanged to ``Metric.__init__``.
        """
        super().__init__(name='MOSNet', window=window, hop=hop)

        # constants (presumably read by the Metric base class -- TODO confirm)
        self.fixed_rate = 16000
        self.mono = True
        self.absolute = True

        # STFT parameters used by test_window and to size the network input
        self.FFT_SIZE = 512
        self.SGRAM_DIM = self.FFT_SIZE // 2 + 1
        self.HOP_LENGTH = 256
        self.WIN_LENGTH = 512

        self.model = self._build_model()

        # weights are in the directory of this file
        pre_trained_dir = os.path.dirname(__file__)
        # load pre-trained weights. CNN_BLSTM is reported as best
        self.model.load_weights(os.path.join(pre_trained_dir, 'cnn_blstm.h5'))

    def _build_model(self):
        """Return the CNN-BLSTM network, without any weights loaded.

        The model takes a batch of spectrograms shaped ``(T, SGRAM_DIM)`` and
        has two outputs, in this order: the frame scores averaged over time
        (layer ``'avg'``) and the per-frame scores (layer ``'frame'``).
        """
        n_bins = self.SGRAM_DIM
        _input = keras.Input(shape=(None, n_bins))
        # add a trailing channel axis for the 2-D convolutions
        features = layers.Reshape((-1, n_bins, 1))(_input)

        # CNN: four stages of three 3x3 convolutions each. The last
        # convolution of every stage uses a stride of 3 on the second
        # (spectrogram-bin) axis. Layers are created in the same order as
        # before so that the pre-trained weights map onto the same layers.
        n_filters = None
        for n_filters in (16, 32, 64, 128):
            for strides in ((1, 1), (1, 1), (1, 3)):
                features = Conv2D(n_filters, (3, 3), strides=strides,
                                  activation='relu',
                                  padding='same')(features)
            # 'same' padding with stride 3 leaves ceil(n / 3) bins
            n_bins = -(-n_bins // 3)

        # merge the remaining bins and channels: 4 * 128 for SGRAM_DIM == 257
        features = layers.Reshape((-1, n_bins * n_filters))(features)

        # BLSTM
        blstm = Bidirectional(
            LSTM(128, return_sequences=True, dropout=0.3,
                 recurrent_dropout=0.3,
                 recurrent_constraint=max_norm(0.00001)),
            merge_mode='concat')(features)

        # DNN
        flatten = TimeDistributed(layers.Flatten())(blstm)
        dense = TimeDistributed(Dense(128, activation='relu'))(flatten)
        dense = Dropout(0.3)(dense)

        frame_score = TimeDistributed(Dense(1), name='frame')(dense)
        average_score = layers.GlobalAveragePooling1D(name='avg')(frame_score)

        return Model(outputs=[average_score, frame_score], inputs=_input)

    def test_window(self, audios, rate):
        """Run the model on the first signal in ``audios``.

        Args:
            audios: indexable collection of signals; only ``audios[0]`` is
                used.
            rate: not used by this method (presumably the signal is already
                at ``fixed_rate`` when it gets here -- TODO confirm).

        Returns:
            A dict with the single key ``'mosnet'`` whose value is the first
            output of ``model.predict`` (the ``'avg'`` output) for a batch
            containing this one spectrogram.
        """
        # scipy.signal.hamming was removed from the scipy.signal namespace in
        # SciPy 1.13; the window functions live in scipy.signal.windows.
        # Imported here so the module-level imports stay untouched.
        from scipy.signal.windows import hamming

        # stft. D: (1+n_fft//2, T)
        # The window is passed as a callable (not the string 'hamming') so
        # librosa calls it with its default arguments, as the original did.
        linear = librosa.stft(
            y=np.asfortranarray(audios[0]),
            n_fft=self.FFT_SIZE,
            hop_length=self.HOP_LENGTH,
            win_length=self.WIN_LENGTH,
            window=hamming,
        )

        # magnitude spectrogram
        mag = np.abs(linear)  # (1+n_fft/2, T)

        # shape in (T, 1+n_fft/2)
        mag = np.transpose(mag.astype(np.float32))

        # now call the actual MOSnet on a batch of one
        scores = self.model.predict(mag[None, ...], verbose=0, batch_size=1)
        return {'mosnet': scores[0]}
|