import math

import mido
import numpy as np
import pumpp
from scipy.ndimage import gaussian_filter1d

from cq2m_models import mask_voas_cnn_model, late_deep_cnn_model

############################################################
def downsample_bins(voice):
    """Collapse a (T, 360) salience map (5 bins per semitone) into a
    (T, 69) one-hot piano roll, keeping the strongest semitone per frame."""
    # Split the 360 bins into the five interleaved sub-bins of each semitone.
    voice_0 = np.array(voice.T[0::5]).T
    voice_1 = np.array(voice.T[1::5]).T
    voice_2 = np.array(voice.T[2::5]).T
    voice_3 = np.array(voice.T[3::5]).T
    voice_4 = np.array(voice.T[4::5]).T
    # Trim each stack to the same 69 semitones (the slicings are offset by one bin).
    voice_0 = voice_0.T[1:70].T
    voice_1 = voice_1.T[1:70].T
    voice_2 = voice_2.T[1:70].T
    voice_3 = voice_3.T[0:69].T
    voice_4 = voice_4.T[0:69].T
    voice_sums = voice_0 + voice_1 + voice_2 + voice_3 + voice_4
    # One-hot the strongest semitone per frame; bin 0 doubles as "unvoiced".
    voice_argmax = np.argmax(voice_sums, axis=1)
    threshold = np.zeros(voice_sums.shape)
    threshold[np.arange(voice_argmax.size), voice_argmax] = 1
    threshold[:, 0] = 0
    return threshold
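# Minimal shape check for downsample_bins (random input, illustrative only):
#   salience = np.random.rand(360, 100)    # 360 bins x 100 frames
#   roll = downsample_bins(salience.T)     # -> (100, 69) one-hot roll
#   assert roll.shape == (100, 69)
#   assert (roll.sum(axis=1) <= 1).all()   # at most one active pitch per frame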
############################################################
def create_midi(pr, write_path='./midi_track.mid', ticks_per_beat=58,
                tempo=90, save_to_file=True, program=53, channel=0):
    def pr_to_list(pr):
        # Convert a piano roll into (pitch, velocity, time) events, where
        # time is the number of ticks elapsed since the previous event.
        T, N = pr.shape
        t_last = 0
        pr_tm1 = np.zeros(N)
        list_event = []
        for t in range(T):
            pr_t = pr[t]
            mask = (pr_t != pr_tm1)
            if mask.any():
                for n in range(N):
                    if mask[n]:
                        # 69-bin rolls start at MIDI note 25; 360-bin salience
                        # maps carry 5 bins per semitone starting at note 24.
                        if N <= 72:
                            pitch = 25 + n
                        else:
                            pitch = 24 + round(n / 5)
                        # Binarize: activations below ~0.4 become note-offs.
                        if int(pr_t[n] * 127) >= 50:
                            velocity = 127
                        else:
                            velocity = 0
                        t_event = t - t_last
                        t_last = t
                        list_event.append((pitch, velocity, t_event))
            pr_tm1 = pr_t
        # Close the track with a final silent event.
        list_event.append((0, 0, T - t_last))
        return list_event
    # Tempo
    microseconds_per_beat = mido.bpm2tempo(tempo)
    # Write the piano roll into a MIDI file
    mid = mido.MidiFile()
    mid.ticks_per_beat = ticks_per_beat
    # Add a new track with the instrument name to the MIDI file
    track = mid.add_track("Voice Aah")
    # Transform the matrix into a list of (pitch, velocity, time) events
    events = pr_to_list(pr)
    # Tempo
    track.append(mido.MetaMessage('set_tempo', tempo=microseconds_per_beat))
    track.append(mido.MetaMessage('channel_prefix', channel=channel))
    # Add the program_change
    # (Choir Aahs = 53, Voice Oohs = 54, Synth Choir = 55)
    track.append(mido.Message('program_change', program=program, channel=channel))
    # Track which notes are currently on so that a note that is on, has its
    # intensity modified, then goes off is shut down exactly once, e.g.:
    #   (60, 20, 0)
    #   (60, 40, 10)
    #   (60, 0, 15)
    notes_on_list = []
    # Write the events into the MIDI file
    for event in events:
        pitch, velocity, time = event
        if velocity == 0:
            track.append(mido.Message('note_off', note=pitch, velocity=0, time=time, channel=channel))
            if pitch in notes_on_list:
                notes_on_list.remove(pitch)
        else:
            if pitch in notes_on_list:
                # Re-triggered note: close it first, then restart at delta 0.
                track.append(mido.Message('note_off', note=pitch, velocity=0, time=time, channel=channel))
                notes_on_list.remove(pitch)
                time = 0
            track.append(mido.Message('note_on', note=pitch, velocity=velocity, time=time, channel=channel))
            notes_on_list.append(pitch)
    if save_to_file:
        mid.save(write_path)
    return mid
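# Toy example (hypothetical values): a 100-frame, 69-pitch roll holding a
# single note. With N <= 72, column n maps to MIDI note 25 + n, so column 40
# becomes F4 (MIDI 65):
#   roll = np.zeros((100, 69))
#   roll[10:50, 40] = 1.0
#   create_midi(roll, write_path='./toy.mid')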
############################################################
def song_to_midi(sop, alto, ten, bass):
    """Merge the four (360, T) voice predictions into one multi-track MIDI
    file, one channel and program per voice, and write it to ./result.mid."""
    down_sop = downsample_bins(sop.T)
    down_alto = downsample_bins(alto.T)
    down_ten = downsample_bins(ten.T)
    down_bass = downsample_bins(bass.T)
    mid_sop = create_midi(down_sop, save_to_file=False, program=52, channel=0)
    mid_alto = create_midi(down_alto, save_to_file=False, program=53, channel=1)
    mid_ten = create_midi(down_ten, save_to_file=False, program=49, channel=2)
    mid_bass = create_midi(down_bass, save_to_file=False, program=50, channel=3)
    mid_mix = mido.MidiFile()
    mid_mix.ticks_per_beat = mid_sop.ticks_per_beat
    mid_mix.tracks = mid_sop.tracks + mid_alto.tracks + mid_ten.tracks + mid_bass.tracks
    mid_mix.save('./result.mid')
    return mid_mix
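# Usage sketch (inputs as produced by get_va_prediction below):
#   mid = song_to_midi(s_pred, a_pred, t_pred, b_pred)   # writes ./result.mid
# Keeping each voice on its own track and channel lets a General MIDI synth
# render the SATB parts with distinct timbres.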
############################################################
def prediction_postproc(input_array, argmax_and_threshold=True, gaussian_blur=True):
    # Flatten the batch of (360, 256) windows back into one (360, T) matrix.
    prediction = np.moveaxis(input_array, 0, 1).reshape(360, -1)
    if argmax_and_threshold:
        # Keep only the strongest bin per frame; bins above 357 count as
        # unvoiced and are mapped to bin 0.
        prediction = np.argmax(prediction, axis=0)
        prediction = np.array([i if i <= 357 else 0 for i in prediction])
        threshold = np.zeros((360, prediction.shape[0]))
        threshold[prediction, np.arange(prediction.size)] = 1
        prediction = threshold
    if gaussian_blur:
        # Smooth along the frequency axis, then rescale to [0, 1].
        prediction = np.array(gaussian_filter1d(prediction, 1, axis=0, mode='wrap'))
        prediction = (prediction - np.min(prediction)) / (np.max(prediction) - np.min(prediction))
    return prediction
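# Shape sketch (random input, hypothetical batch of 4 windows):
#   raw = np.random.rand(4, 360, 256)
#   out = prediction_postproc(raw)    # -> (360, 1024), one-hot then blurred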
############################################################
def get_hcqt_params():
    bins_per_octave = 60
    n_octaves = 6
    over_sample = 5
    harmonics = [1, 2, 3, 4, 5]
    sr = 22050
    fmin = 32.7
    hop_length = 256
    return bins_per_octave, n_octaves, harmonics, sr, fmin, hop_length, over_sample
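# Derived quantities, for reference: 60 bins/octave x 6 octaves = 360 CQT bins,
# i.e. 5x oversampling of 72 semitones above fmin = 32.7 Hz (C1); the frame
# rate is sr / hop_length = 22050 / 256 ≈ 86.1 frames per second.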
############################################################
def create_pump_object():
    (bins_per_octave, n_octaves, harmonics,
     sr, f_min, hop_length, over_sample) = get_hcqt_params()
    p_phdif = pumpp.feature.HCQTPhaseDiff(name='dphase', sr=sr, hop_length=hop_length,
                                          fmin=f_min, n_octaves=n_octaves,
                                          over_sample=over_sample, harmonics=harmonics,
                                          log=True)
    pump = pumpp.Pump(p_phdif)
    return pump
############################################################
def compute_pump_features(pump, audio_fpath):
    data = pump(audio_f=audio_fpath)
    return data
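# Feature-extraction sketch (hypothetical path; pumpp loads and resamples
# the audio to sr = 22050 internally):
#   pump = create_pump_object()
#   data = compute_pump_features(pump, 'mixture.wav')
#   data['dphase/mag']      # log-magnitude HCQT, expected (1, n_frames, 360, 5)
#   data['dphase/dphase']   # phase differentials, same shape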
############################################################
def get_mpe_prediction(model, audio_file=None):
    """Run multiple-pitch estimation on an audio file and return a
    (360, n_frames) salience matrix.
    Parts of this function are adapted from deepsalience.
    """
    split_value = 2500
    if audio_file is not None:
        pump = create_pump_object()
        features = compute_pump_features(pump, audio_file)
        input_hcqt = features['dphase/mag'][0]
        input_dphase = features['dphase/dphase'][0]
    else:
        raise ValueError("An audio_file must be specified")
    # Rearrange to (1, freq, harmonics, time) so the time axis can be sliced.
    input_hcqt = input_hcqt.transpose(1, 2, 0)[np.newaxis, :, :, :]
    input_dphase = input_dphase.transpose(1, 2, 0)[np.newaxis, :, :, :]
    n_t = input_hcqt.shape[3]
    t_slices = list(np.arange(0, n_t, split_value))
    output_list = []
    # Predict in chunks of split_value frames to bound memory use.
    for t in t_slices:
        p = model.predict([np.transpose(input_hcqt[:, :, :, t:t+split_value], (0, 1, 3, 2)),
                           np.transpose(input_dphase[:, :, :, t:t+split_value], (0, 1, 3, 2))]
                          )[0, :, :]
        output_list.append(p)
    predicted_output = np.hstack(output_list).astype(np.float32)
    return predicted_output
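# Inference sketch (assumes a trained model from cq2m_models and an audio
# file on disk; names are placeholders):
#   mpe_model = late_deep_cnn_model()
#   salience = get_mpe_prediction(mpe_model, 'mixture.wav')   # -> (360, n_frames)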
############################################################
def get_va_prediction(model, f0_matrix):
    # Zero-pad the (360, T) salience matrix so T is a multiple of 256, then
    # split it into (360, 256, 1) windows for the voice-assignment model.
    splits = f0_matrix.shape[1] // 256
    splits_diff = 256 - (f0_matrix.shape[1] - splits * 256)
    fill = np.zeros((360, splits_diff))
    mix_filled = np.concatenate((np.copy(f0_matrix), fill), axis=1)
    mix_filled = np.reshape(mix_filled, (360, -1, 256, 1)).transpose((1, 0, 2, 3))
    # Predict in batches of 24 windows.
    batches = math.ceil(mix_filled.shape[0] / 24)
    s_pred_result = np.zeros((0, 360, 256))
    a_pred_result = np.zeros((0, 360, 256))
    t_pred_result = np.zeros((0, 360, 256))
    b_pred_result = np.zeros((0, 360, 256))
    for i in range(batches):
        s_pred, a_pred, t_pred, b_pred = model.predict(mix_filled[i*24:(i+1)*24])
        s_pred_result = np.append(s_pred_result, s_pred, axis=0)
        a_pred_result = np.append(a_pred_result, a_pred, axis=0)
        t_pred_result = np.append(t_pred_result, t_pred, axis=0)
        b_pred_result = np.append(b_pred_result, b_pred, axis=0)
    # Post-process each voice and trim the padding back off.
    s_pred_result = prediction_postproc(s_pred_result)[:, :f0_matrix.shape[1]]
    a_pred_result = prediction_postproc(a_pred_result)[:, :f0_matrix.shape[1]]
    t_pred_result = prediction_postproc(t_pred_result)[:, :f0_matrix.shape[1]]
    b_pred_result = prediction_postproc(b_pred_result)[:, :f0_matrix.shape[1]]
    return s_pred_result, a_pred_result, t_pred_result, b_pred_result
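# Padding arithmetic, worked example: for T = 1000 frames, splits = 3 and
# splits_diff = 256 - (1000 - 768) = 24, so the matrix is padded to 1024
# frames and split into four windows; the 24 padded frames are trimmed
# again after postprocessing.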
############################################################
def cq2m(audiofile, mpe=None, va=None):
    """Full pipeline: audio file -> multi-pitch salience -> four-voice
    assignment -> multi-track MIDI (also written to ./result.mid)."""
    # Build the models lazily so importing this module does not load weights.
    if mpe is None:
        mpe = late_deep_cnn_model()
    if va is None:
        va = mask_voas_cnn_model()
    mpe_pred = get_mpe_prediction(mpe, audiofile)
    s_pred, a_pred, t_pred, b_pred = get_va_prediction(va, mpe_pred)
    midi = song_to_midi(s_pred, a_pred, t_pred, b_pred)
    return midi
############################################################
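if __name__ == '__main__':
    # End-to-end usage sketch (hypothetical file name): transcribe a
    # four-part vocal recording into ./result.mid.
    cq2m('choir_mixture.wav')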