"""Gradio demo: MelGAN-based spectrogram inversion.

Loads a pretrained Keras model from the Hugging Face Hub, converts an
uploaded audio file to a log-mel spectrogram, runs the model on it and plots
the linear-frequency spectrograms of the original and the synthesized audio.
"""

from huggingface_hub import from_pretrained_keras
import gradio as gr
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_io as tfio
from tensorflow.keras import layers


class MelSpec(layers.Layer):
    """Keras layer that turns a raw waveform into a dB-scaled mel spectrogram.

    Args:
        frame_length: STFT window length in samples.
        frame_step: STFT hop size in samples.
        fft_length: FFT size; ``None`` lets ``tf.signal.stft`` pick the
            smallest power of two that encloses ``frame_length``.
        sampling_rate: Sample rate (Hz) used to build the mel filterbank.
        num_mel_channels: Number of mel bins produced.
        freq_min: Lower edge (Hz) of the mel filterbank.
        freq_max: Upper edge (Hz) of the mel filterbank.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(
        self,
        frame_length=1024,
        frame_step=256,
        fft_length=None,
        sampling_rate=22050,
        num_mel_channels=80,
        freq_min=125,
        freq_max=7600,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.frame_length = frame_length
        self.frame_step = frame_step
        self.fft_length = fft_length
        self.sampling_rate = sampling_rate
        self.num_mel_channels = num_mel_channels
        self.freq_min = freq_min
        self.freq_max = freq_max
        self.mel_filterbank = tf.signal.linear_to_mel_weight_matrix(
            num_mel_bins=self.num_mel_channels,
            num_spectrogram_bins=self._num_spectrogram_bins(),
            sample_rate=self.sampling_rate,
            lower_edge_hertz=self.freq_min,
            upper_edge_hertz=self.freq_max,
        )

    def _num_spectrogram_bins(self):
        """Return the number of frequency bins the STFT in ``call`` emits.

        The STFT yields ``fft_length // 2 + 1`` bins, not
        ``frame_length // 2 + 1``; the two only coincide when ``fft_length``
        equals ``frame_length``. Sizing the filterbank from the FFT length
        keeps the matmul in ``call`` shape-compatible for every setting.
        """
        if self.fft_length is not None:
            fft_length = int(self.fft_length)
        else:
            # Mirror the tf.signal.stft default: smallest power of two that
            # is >= frame_length.
            fft_length = 1 << (int(self.frame_length) - 1).bit_length()
        return fft_length // 2 + 1

    def call(self, audio):
        """Compute the dB-scaled mel spectrogram of ``audio``.

        The trailing axis of ``audio`` is squeezed away before the STFT, so
        it must have size 1.
        """
        stft = tf.signal.stft(
            tf.squeeze(audio, -1),
            self.frame_length,
            self.frame_step,
            self.fft_length,
            pad_end=True,
        )
        # Power spectrum (squared magnitude) projected onto the mel
        # filterbank, then converted to the dB scale.
        power = tf.square(tf.abs(stft))
        mel = tf.matmul(power, self.mel_filterbank)
        return tfio.audio.dbscale(mel, top_db=80)

    def get_config(self):
        """Return the layer configuration, including the constructor arguments."""
        config = super().get_config()
        config.update(
            {
                "frame_length": self.frame_length,
                "frame_step": self.frame_step,
                "fft_length": self.fft_length,
                "sampling_rate": self.sampling_rate,
                "num_mel_channels": self.num_mel_channels,
                "freq_min": self.freq_min,
                "freq_max": self.freq_max,
            }
        )
        return config


model = from_pretrained_keras("keras-io/MelGAN-spectrogram-inversion")

# The layer has no per-request state, so build it (and its filterbank) once
# instead of on every inference call.
_MEL_SPEC = MelSpec()


def inference(audio, model):
    """Run ``model`` on the mel spectrogram of the audio file at ``audio``.

    Args:
        audio: Path of the audio file to load.
        model: Object whose ``predict`` accepts a batch of mel spectrograms.

    Returns:
        A ``(waveform, prediction, sr)`` tuple: the loaded samples, the
        squeezed model output, and the sample rate reported by librosa.
    """
    # 22050 Hz is also librosa's default; passing it explicitly keeps the
    # loader tied to the rate the mel filterbank was built for.
    waveform, sr = librosa.load(audio, sr=_MEL_SPEC.sampling_rate)
    samples = tf.expand_dims(waveform, axis=-1)
    mel = _MEL_SPEC(samples)
    batch = tf.expand_dims(mel, axis=0)  # add a batch axis of size 1
    pred = model.predict(batch, batch_size=1, verbose=0)
    return waveform, pred.squeeze(), sr


def _plot_spectrogram(signal, sr, ax, title):
    """Draw the dB-scaled linear-frequency spectrogram of ``signal`` on ``ax``."""
    spec_db = librosa.amplitude_to_db(np.abs(librosa.stft(signal)), ref=np.max)
    librosa.display.specshow(spec_db, y_axis='linear', x_axis='time', sr=sr, ax=ax)
    ax.set(title=title)
    ax.label_outer()


def predict(audio):
    """Build the comparison figure shown by the Gradio interface.

    Args:
        audio: Path of the uploaded audio file.

    Returns:
        A matplotlib figure with two stacked spectrograms: the original
        audio on top and the model output below.
    """
    original, synthesized, sr = inference(audio, model)

    fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(10, 8), dpi=120)
    _plot_spectrogram(original, sr, axes[0], 'Spectrogram of Original sample audio')
    _plot_spectrogram(synthesized, sr, axes[1], 'Spectrogram of synthesis sample audio ')

    # Drop the figure from pyplot's global registry so a long-running server
    # does not accumulate one open figure per request; the Figure object
    # itself stays valid for whoever renders it. Return it directly rather
    # than via plt.gcf(), which depends on global "current figure" state.
    plt.close(fig)
    return fig


inputs = [
    gr.Audio(source="upload", label='Upload audio file', type="filepath"),
]

examples = ["sample_1.wav"]

demo = gr.Interface(
    fn=predict,
    title="MelGAN-based spectrogram inversion",
    description="Inversion of audio from mel-spectrograms using the MelGAN architecture and feature matching",
    inputs=inputs,
    examples=examples,
    outputs=gr.Plot(),
    cache_examples=False,
    article="Author: Vu Minh Chien. Based on the keras example from Darshan Deshpande",
)
demo.launch(debug=False, enable_queue=True)