"""cq2m: transcribe four-part (SATB) vocal audio to a multi-track MIDI file.

Pipeline: HCQT features (pumpp) -> multi-pitch estimation CNN ->
voice-assignment CNN -> per-voice piano rolls -> MIDI (mido).
"""

import math

import mido
import numpy as np
import pumpp
from scipy.ndimage import gaussian_filter1d

from cq2m_models import late_deep_cnn_model, mask_voas_cnn_model


def downsample_bins(voice):
    """Collapse a (time, 360) salience map with 5 bins per semitone into a
    binary (time, 69) semitone-level piano roll via a per-frame argmax."""
    # Split the 360 bins into the five per-semitone sub-bin offsets.
    voice_0 = np.array(voice.T[0::5]).T
    voice_1 = np.array(voice.T[1::5]).T
    voice_2 = np.array(voice.T[2::5]).T
    voice_3 = np.array(voice.T[3::5]).T
    voice_4 = np.array(voice.T[4::5]).T

    # Shift the slices against each other so that each of the 69 output
    # columns sums the 5-bin neighbourhood centred on a semitone.
    voice_0 = voice_0.T[1:70].T
    voice_1 = voice_1.T[1:70].T
    voice_2 = voice_2.T[1:70].T
    voice_3 = voice_3.T[0:69].T
    voice_4 = voice_4.T[0:69].T

    voice_sums = voice_0 + voice_1 + voice_2 + voice_3 + voice_4

    # One-hot the strongest semitone in every frame; column 0 is treated
    # as "no voice" and zeroed out.
    voice_argmax = np.argmax(voice_sums, axis=1)
    threshold = np.zeros(voice_sums.shape)
    threshold[np.arange(voice_argmax.size), voice_argmax] = 1
    threshold[:, 0] = 0

    return threshold


def create_midi(pr, write_path='./midi_track.mid', ticks_per_beat=58,
                tempo=90, save_to_file=True, program=53, channel=0):
    """Turn a (time, pitch) piano roll into a single-track MIDI file."""

    def pr_to_list(pr):
        # Convert the piano roll into a list of (pitch, velocity, delta-time)
        # events by scanning frames and emitting an event whenever a bin
        # changes relative to the previous frame.
        T, N = pr.shape
        t_last = 0
        pr_tm1 = np.zeros(N)
        list_event = []
        for t in range(T):
            pr_t = pr[t]
            mask = (pr_t != pr_tm1)
            if mask.any():
                for n in range(N):
                    if mask[n]:
                        # 69-bin rolls are already semitone-spaced; 360-bin
                        # rolls carry 5 bins per semitone.
                        if N <= 72:
                            pitch = 25 + n
                        else:
                            pitch = 24 + round(n / 5)
                        # Binarize velocity: anything strong enough becomes
                        # full velocity, everything else a note-off.
                        if int(pr_t[n] * 127) >= 50:
                            velocity = 127
                        else:
                            velocity = 0
                        t_event = t - t_last
                        t_last = t
                        list_event.append((pitch, velocity, t_event))
            pr_tm1 = pr_t
        # Closing event so the track spans the full duration.
        list_event.append((0, 0, T - t_last))
        return list_event

    microseconds_per_beat = mido.bpm2tempo(tempo)

    mid = mido.MidiFile()
    mid.ticks_per_beat = ticks_per_beat

    track = mid.add_track("Voice Aah")

    events = pr_to_list(pr)

    track.append(mido.MetaMessage('set_tempo', tempo=microseconds_per_beat))
    track.append(mido.MetaMessage('channel_prefix', channel=channel))
    track.append(mido.Message('program_change', program=program, channel=channel))

    # Track which pitches are currently sounding so that a retriggered note
    # is closed before its new note_on is written.
    notes_on_list = []

    for pitch, velocity, time in events:
        if velocity == 0:
            track.append(mido.Message('note_off', note=pitch, velocity=0,
                                      time=time, channel=channel))
            if pitch in notes_on_list:
                notes_on_list.remove(pitch)
        else:
            if pitch in notes_on_list:
                # Retrigger: close the sounding note first, then start the
                # new one with zero delta time.
                track.append(mido.Message('note_off', note=pitch, velocity=0,
                                          time=time, channel=channel))
                notes_on_list.remove(pitch)
                time = 0
            track.append(mido.Message('note_on', note=pitch, velocity=velocity,
                                      time=time, channel=channel))
            notes_on_list.append(pitch)

    if save_to_file:
        mid.save(write_path)
    return mid


def song_to_midi(sop, alto, ten, bass):
    """Merge four (360, time) voice predictions into one four-track MIDI file
    and return its path."""
    savepath = './output.mid'

    # Collapse each voice to a semitone-level binary piano roll.
    down_sop = downsample_bins(sop.T)
    down_alto = downsample_bins(alto.T)
    down_ten = downsample_bins(ten.T)
    down_bass = downsample_bins(bass.T)

    # One single-track file per voice, each with its own program and channel.
    mid_sop = create_midi(down_sop, save_to_file=False, program=52, channel=0)
    mid_alto = create_midi(down_alto, save_to_file=False, program=53, channel=1)
    mid_ten = create_midi(down_ten, save_to_file=False, program=49, channel=2)
    mid_bass = create_midi(down_bass, save_to_file=False, program=50, channel=3)

    # Combine the four tracks into a single multi-track file.
    mid_mix = mido.MidiFile()
    mid_mix.ticks_per_beat = mid_sop.ticks_per_beat
    mid_mix.tracks = mid_sop.tracks + mid_alto.tracks + mid_ten.tracks + mid_bass.tracks
    mid_mix.save(savepath)

    return savepath


def prediction_postproc(input_array, argmax_and_threshold=True, gaussian_blur=True):
    """Flatten batched (batch, 360, 256) predictions into a (360, time) map,
    optionally one-hot the per-frame argmax and smooth along frequency."""
    prediction = np.moveaxis(input_array, 0, 1).reshape(360, -1)
    if argmax_and_threshold:
        # Keep only the strongest bin per frame; bins above 357 are mapped
        # to bin 0 and treated as silence.
        prediction = np.argmax(prediction, axis=0)
        prediction = np.array([i if i <= 357 else 0 for i in prediction])
        threshold = np.zeros((360, prediction.shape[0]))
        threshold[prediction, np.arange(prediction.size)] = 1
        prediction = threshold
    if gaussian_blur:
        # Blur along the frequency axis and rescale to [0, 1].
        prediction = gaussian_filter1d(prediction, 1, axis=0, mode='wrap')
        prediction = (prediction - np.min(prediction)) / (np.max(prediction) - np.min(prediction))
    return prediction


def get_hcqt_params():
    bins_per_octave = 60
    n_octaves = 6
    over_sample = 5
    harmonics = [1, 2, 3, 4, 5]
    sr = 22050
    fmin = 32.7
    hop_length = 256

    return bins_per_octave, n_octaves, harmonics, sr, fmin, hop_length, over_sample


def create_pump_object():
    """Build the pumpp extractor for the HCQT magnitude/phase-difference input."""
    (bins_per_octave, n_octaves, harmonics,
     sr, f_min, hop_length, over_sample) = get_hcqt_params()

    p_phdif = pumpp.feature.HCQTPhaseDiff(name='dphase', sr=sr,
                                          hop_length=hop_length, fmin=f_min,
                                          n_octaves=n_octaves,
                                          over_sample=over_sample,
                                          harmonics=harmonics, log=True)

    return pumpp.Pump(p_phdif)


def compute_pump_features(pump, audio_fpath):
    return pump(audio_f=audio_fpath)


def get_mpe_prediction(model, audio_file=None):
    """Run the multi-pitch-estimation model on an audio file and return the
    full (360, time) output. Parts of this function are adapted from
    deepsalience.
    """
    split_value = 2500

    if audio_file is not None:
        pump = create_pump_object()
        features = compute_pump_features(pump, audio_file)
        input_hcqt = features['dphase/mag'][0]
        input_dphase = features['dphase/dphase'][0]
    else:
        raise ValueError("An audio_file must be specified")

    input_hcqt = input_hcqt.transpose(1, 2, 0)[np.newaxis, :, :, :]
    input_dphase = input_dphase.transpose(1, 2, 0)[np.newaxis, :, :, :]

    # Predict in chunks of split_value frames to bound memory use, then
    # stitch the chunks back together along the time axis.
    n_t = input_hcqt.shape[3]
    t_slices = list(range(0, n_t, split_value))
    output_list = []

    for t in t_slices:
        p = model.predict([np.transpose(input_hcqt[:, :, :, t:t+split_value], (0, 1, 3, 2)),
                           np.transpose(input_dphase[:, :, :, t:t+split_value], (0, 1, 3, 2))]
                          )[0, :, :]
        output_list.append(p)

    return np.hstack(output_list).astype(np.float32)


def get_va_prediction(model, f0_matrix):
    """Split a (360, time) multi-pitch matrix into 256-frame windows, run the
    voice-assignment model in batches, and return (360, time) predictions for
    soprano, alto, tenor and bass."""
    # Zero-pad the time axis up to a multiple of 256 frames.
    splits = f0_matrix.shape[1] // 256
    splits_diff = 256 - (f0_matrix.shape[1] - splits * 256)
    fill = np.zeros((360, splits_diff))
    mix_filled = np.concatenate((np.copy(f0_matrix), fill), axis=1)
    # Reshape into a stack of (360, 256, 1) windows.
    mix_filled = np.reshape(mix_filled, (360, -1, 256, 1)).transpose((1, 0, 2, 3))
    batches = math.ceil(mix_filled.shape[0] / 24)

    s_pred_result = np.zeros((0, 360, 256))
    a_pred_result = np.zeros((0, 360, 256))
    t_pred_result = np.zeros((0, 360, 256))
    b_pred_result = np.zeros((0, 360, 256))

    # Predict at most 24 windows at a time.
    for i in range(batches):
        s_pred, a_pred, t_pred, b_pred = model.predict(mix_filled[i*24:(i+1)*24])
        s_pred_result = np.append(s_pred_result, s_pred, axis=0)
        a_pred_result = np.append(a_pred_result, a_pred, axis=0)
        t_pred_result = np.append(t_pred_result, t_pred, axis=0)
        b_pred_result = np.append(b_pred_result, b_pred, axis=0)

    # Post-process each voice and trim the zero padding back off.
    s_pred_result = prediction_postproc(s_pred_result)[:, :f0_matrix.shape[1]]
    a_pred_result = prediction_postproc(a_pred_result)[:, :f0_matrix.shape[1]]
    t_pred_result = prediction_postproc(t_pred_result)[:, :f0_matrix.shape[1]]
    b_pred_result = prediction_postproc(b_pred_result)[:, :f0_matrix.shape[1]]

    return s_pred_result, a_pred_result, t_pred_result, b_pred_result


def cq2m(audiofile, mpe=None, va=None):
    """Run the full pipeline on an audio file and return the path of the
    resulting MIDI file. The models default to late_deep_cnn_model() and
    mask_voas_cnn_model(), built lazily on first call rather than as
    import-time default arguments."""
    if mpe is None:
        mpe = late_deep_cnn_model()
    if va is None:
        va = mask_voas_cnn_model()
    mpe_pred = get_mpe_prediction(mpe, audiofile)
    s_pred, a_pred, t_pred, b_pred = get_va_prediction(va, mpe_pred)
    return song_to_midi(s_pred, a_pred, t_pred, b_pred)
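

# A minimal usage sketch: run the full pipeline on a recording and report
# where the MIDI file was written. "satb_mix.wav" is a hypothetical example
# path, not a file shipped with this repository.
if __name__ == "__main__":
    midi_path = cq2m("satb_mix.wav")
    print(f"MIDI file written to {midi_path}")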