from huggingface_hub import from_pretrained_keras
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_io as tfio
import gradio as gr
import librosa
import librosa.display
import matplotlib.pyplot as plt
class MelSpec(layers.Layer):
    """Keras layer that turns raw audio into a dB-scaled mel spectrogram.

    Args:
        frame_length: STFT window length, in samples.
        frame_step: Hop between successive STFT windows, in samples.
        fft_length: FFT size forwarded to ``tf.signal.stft``. When ``None``,
            TensorFlow uses the smallest power of two enclosing
            ``frame_length``.
        sampling_rate: Sample rate in Hz used to build the mel filterbank.
        num_mel_channels: Number of mel bins produced per frame.
        freq_min: Lower edge of the mel filterbank, in Hz.
        freq_max: Upper edge of the mel filterbank, in Hz.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(
        self,
        frame_length=1024,
        frame_step=256,
        fft_length=None,
        sampling_rate=22050,
        num_mel_channels=80,
        freq_min=125,
        freq_max=7600,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.frame_length = frame_length
        self.frame_step = frame_step
        self.fft_length = fft_length
        self.sampling_rate = sampling_rate
        self.num_mel_channels = num_mel_channels
        self.freq_min = freq_min
        self.freq_max = freq_max

        # The STFT yields ``fft_length // 2 + 1`` frequency bins, not
        # ``frame_length // 2 + 1``. Mirror the default tf.signal.stft applies
        # when fft_length is None (smallest enclosing power of two) so the
        # filterbank always matches the STFT output in ``call``.
        if fft_length is None:
            effective_fft_length = 1 << (int(frame_length) - 1).bit_length()
        else:
            effective_fft_length = fft_length

        self.mel_filterbank = tf.signal.linear_to_mel_weight_matrix(
            num_mel_bins=self.num_mel_channels,
            num_spectrogram_bins=effective_fft_length // 2 + 1,
            sample_rate=self.sampling_rate,
            lower_edge_hertz=self.freq_min,
            upper_edge_hertz=self.freq_max,
        )

    def call(self, audio):
        """Compute the dB-scaled mel spectrogram of ``audio``.

        Args:
            audio: Tensor of audio samples whose last axis has size 1; that
                axis is squeezed away before the STFT.

        Returns:
            The mel power spectrogram converted to decibels with
            ``top_db=80``.
        """
        stft = tf.signal.stft(
            tf.squeeze(audio, -1),
            self.frame_length,
            self.frame_step,
            self.fft_length,
            pad_end=True,
        )
        # Magnitude of the complex STFT output.
        magnitude = tf.abs(stft)
        # Project the power spectrum (squared magnitude) onto the mel
        # filterbank, then express it on a decibel scale.
        mel = tf.matmul(tf.square(magnitude), self.mel_filterbank)
        log_mel_spec = tfio.audio.dbscale(mel, top_db=80)
        return log_mel_spec

    def get_config(self):
        """Return the base layer config extended with this layer's arguments."""
        config = super().get_config()
        config.update(
            {
                "frame_length": self.frame_length,
                "frame_step": self.frame_step,
                "fft_length": self.fft_length,
                "sampling_rate": self.sampling_rate,
                "num_mel_channels": self.num_mel_channels,
                "freq_min": self.freq_min,
                "freq_max": self.freq_max,
            }
        )
        return config
# Pretrained model loaded via huggingface_hub from the
# "keras-io/MelGAN-spectrogram-inversion" repo. It is loaded once at import
# time and handed to inference() by predict().
model = from_pretrained_keras("keras-io/MelGAN-spectrogram-inversion")
def inference(audio, model):
    """Run ``model`` on the mel spectrogram of an audio file.

    Args:
        audio: Path (or other source accepted by ``librosa.load``) of the
            audio to process; it is loaded with librosa's default settings.
        model: Object exposing ``predict``; it receives the mel spectrogram
            with a leading batch axis of size one.

    Returns:
        A tuple ``(waveform, prediction, sample_rate)``: the loaded samples,
        the model output with size-1 axes squeezed out, and the sample rate
        reported by ``librosa.load``.
    """
    waveform, sample_rate = librosa.load(audio)

    # MelSpec squeezes the last axis, so give the samples a trailing axis
    # of size one before passing them through the layer.
    mel_spectrogram = MelSpec()(tf.expand_dims(waveform, axis=-1))

    # Prepend a batch axis; a single example is predicted per call.
    batch = tf.expand_dims(mel_spectrogram, axis=0)
    prediction = model.predict(batch, batch_size=1, verbose=0)

    return waveform, prediction.squeeze(), sample_rate
def predict(audio):
    """Plot spectrograms of an audio file and of the model's reconstruction.

    Args:
        audio: Audio file path; forwarded unchanged to ``inference``.

    Returns:
        The matplotlib figure with two stacked, x-axis-sharing panels: the
        spectrogram of the loaded audio on top and the spectrogram of the
        model output below.
    """
    x, x_pred, sr = inference(audio, model)
    fig, ax = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(10, 8), dpi=120)

    # One (signal, title) pair per panel, top to bottom.
    panels = (
        (x, 'Spectrogram of Original sample audio'),
        (x_pred, 'Spectrogram of synthesis sample audio '),
    )
    for axis, (signal, title) in zip(ax, panels):
        # dB-scaled STFT magnitude, referenced to the signal's own peak.
        D = librosa.amplitude_to_db(np.abs(librosa.stft(signal)), ref=np.max)
        librosa.display.specshow(D, y_axis='linear', x_axis='time',
                                 sr=sr, ax=axis)
        axis.set(title=title)
        axis.label_outer()

    # Return the figure created above rather than plt.gcf(): the "current
    # figure" is global pyplot state, so with queued/concurrent requests it
    # may belong to a different call.
    # NOTE(review): the figure is never closed, so pyplot keeps it registered;
    # consider closing it after rendering if memory growth is observed.
    return fig
# Gradio UI wiring: one uploaded audio file in, one matplotlib plot out.
inputs = [gr.Audio(source="upload", label="Upload audio file", type="filepath")]

# Example files offered in the UI; paths are relative to the working directory.
examples = ["sample_1.wav", "sample_2.wav"]

demo = gr.Interface(
    fn=predict,
    inputs=inputs,
    outputs=gr.Plot(),
    examples=examples,
    title="MelGAN-based spectrogram inversion",
    description="Inversion of audio from mel-spectrograms using the MelGAN architecture and feature matching",
    article="Author: Vu Minh Chien. Based on the keras example from Darshan Deshpande",
)

# Start serving the interface as soon as the script is executed.
demo.launch(debug=False, enable_queue=True)