import math

import mido
import numpy as np
import pumpp
from scipy.ndimage import gaussian_filter1d

from cq2m_models import mask_voas_cnn_model, late_deep_cnn_model

############################################################

def downsample_bins(voice):
    """Downsample a (T, 360) pianoroll (5 bins per semitone) to a
    (T, 69) one-hot semitone roll, keeping the strongest bin per frame."""
    # Split the 360 frequency bins into the 5 sub-semitone offsets.
    voice_0 = np.array(voice.T[0::5]).T
    voice_1 = np.array(voice.T[1::5]).T
    voice_2 = np.array(voice.T[2::5]).T
    voice_3 = np.array(voice.T[3::5]).T
    voice_4 = np.array(voice.T[4::5]).T

    # Align the five offset grids onto a common 69-semitone range.
    voice_0 = voice_0.T[1:70].T
    voice_1 = voice_1.T[1:70].T
    voice_2 = voice_2.T[1:70].T
    voice_3 = voice_3.T[0:69].T
    voice_4 = voice_4.T[0:69].T

    voice_sums = voice_0 + voice_1 + voice_2 + voice_3 + voice_4

    # Keep only the strongest semitone per frame and zero out bin 0.
    voice_argmax = np.argmax(voice_sums, axis=1)
    threshold = np.zeros(voice_sums.shape)
    threshold[np.arange(voice_argmax.size), voice_argmax] = 1
    threshold[:, 0] = 0
    return threshold

############################################################

def create_midi(pr, write_path='./midi_track.mid', ticks_per_beat=58,
                tempo=90, save_to_file=True, program=53, channel=0):
    """Render a pianoroll as a single-track MIDI file."""

    def pr_to_list(pr):
        # List of (pitch, velocity, time) events; time is in ticks
        # since the previous event.
        T, N = pr.shape
        t_last = 0
        pr_tm1 = np.zeros(N)
        list_event = []
        for t in range(T):
            pr_t = pr[t]
            mask = (pr_t != pr_tm1)
            if mask.any():
                for n in range(N):
                    if mask[n]:
                        # 69-bin rolls are semitone resolution (MIDI 25+);
                        # 360-bin rolls have 5 bins per semitone.
                        if N <= 72:
                            pitch = 25 + n
                        else:
                            pitch = 24 + round(n / 5)
                        if int(pr_t[n] * 127) >= 50:
                            velocity = 127
                        else:
                            velocity = 0
                        # Time is incremented since the last event.
                        t_event = t - t_last
                        t_last = t
                        list_event.append((pitch, velocity, t_event))
            pr_tm1 = pr_t
        list_event.append((0, 0, T - t_last))
        return list_event

    # Tempo
    microseconds_per_beat = mido.bpm2tempo(tempo)
    # Write a pianoroll to a midi file
    mid = mido.MidiFile()
    mid.ticks_per_beat = ticks_per_beat
    # Add a new track with the instrument name to the midi file
    track = mid.add_track("Voice Aah")
    # Transform the matrix into a list of (pitch, velocity, time) events
    events = pr_to_list(pr)
    # Tempo
    track.append(mido.MetaMessage('set_tempo', tempo=microseconds_per_beat))
    track.append(mido.MetaMessage('channel_prefix', channel=channel))
    # Add the program_change
    # Choir Aahs = 53, Voice Oohs (or Doos) = 54, Synth Choir = 55
    track.append(mido.Message('program_change', program=program,
                              channel=channel))

    # This list is needed so that a note that goes on, changes intensity,
    # and then goes off is shut down exactly once, e.g.:
    #   (60, 20, 0)
    #   (60, 40, 10)
    #   (60, 0, 15)
    notes_on_list = []
    # Write the events to the midi file
    for pitch, velocity, time in events:
        if velocity == 0:
            track.append(mido.Message('note_off', note=pitch, velocity=0,
                                      time=time, channel=channel))
            if pitch in notes_on_list:
                notes_on_list.remove(pitch)
        else:
            if pitch in notes_on_list:
                # Retrigger: close the sounding note, then restart it
                # with zero delta time.
                track.append(mido.Message('note_off', note=pitch, velocity=0,
                                          time=time, channel=channel))
                notes_on_list.remove(pitch)
                time = 0
            track.append(mido.Message('note_on', note=pitch,
                                      velocity=velocity, time=time,
                                      channel=channel))
            notes_on_list.append(pitch)
    if save_to_file:
        mid.save(write_path)
    return mid

############################################################

def song_to_midi(sop, alto, ten, bass):
    """Render four (360, T) voice salience maps to a four-track MIDI file."""
    savepath = './output.mid'

    down_sop = downsample_bins(sop.T)
    down_alto = downsample_bins(alto.T)
    down_ten = downsample_bins(ten.T)
    down_bass = downsample_bins(bass.T)

    mid_sop = create_midi(down_sop, save_to_file=False, program=52, channel=0)
    mid_alto = create_midi(down_alto, save_to_file=False, program=53, channel=1)
    mid_ten = create_midi(down_ten, save_to_file=False, program=49, channel=2)
    mid_bass = create_midi(down_bass, save_to_file=False, program=50, channel=3)

    mid_mix = mido.MidiFile()
    mid_mix.ticks_per_beat = mid_sop.ticks_per_beat
    mid_mix.tracks = (mid_sop.tracks + mid_alto.tracks
                      + mid_ten.tracks + mid_bass.tracks)
    mid_mix.save(savepath)

    return savepath
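############################################################

def _demo_song_to_midi():
    """Illustrative smoke test, not part of the original pipeline.

    Fabricates four (360, T) one-hot salience maps of the kind
    get_va_prediction returns below and renders them to ./output.mid.
    The shapes and the one-hot layout are assumptions inferred from the
    functions above, not a documented interface.
    """
    rng = np.random.default_rng(0)
    frames = 1000
    parts = []
    for _ in range(4):
        part = np.zeros((360, frames))
        # One active bin per frame, avoiding the out-of-range bins.
        part[rng.integers(1, 358, size=frames), np.arange(frames)] = 1.0
        parts.append(part)
    return song_to_midi(*parts)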
############################################################

def prediction_postproc(input_array, argmax_and_threshold=True,
                        gaussian_blur=True):
    """Flatten batched (batch, 360, 256) model output to (360, T) and
    optionally binarize it (per-frame argmax) and blur it along the
    frequency axis."""
    prediction = np.moveaxis(input_array, 0, 1).reshape(360, -1)
    if argmax_and_threshold:
        prediction = np.argmax(prediction, axis=0)
        # Treat the topmost bins (> 357) as silence.
        prediction = np.where(prediction <= 357, prediction, 0)
        threshold = np.zeros((360, prediction.shape[0]))
        threshold[prediction, np.arange(prediction.size)] = 1
        prediction = threshold
    if gaussian_blur:
        prediction = gaussian_filter1d(prediction, 1, axis=0, mode='wrap')
        prediction = ((prediction - np.min(prediction))
                      / (np.max(prediction) - np.min(prediction)))
    return prediction

############################################################

def get_hcqt_params():
    bins_per_octave = 60
    n_octaves = 6
    over_sample = 5
    harmonics = [1, 2, 3, 4, 5]
    sr = 22050
    fmin = 32.7
    hop_length = 256
    return (bins_per_octave, n_octaves, harmonics, sr, fmin, hop_length,
            over_sample)

############################################################

def create_pump_object():
    (bins_per_octave, n_octaves, harmonics, sr, f_min, hop_length,
     over_sample) = get_hcqt_params()

    p_phdif = pumpp.feature.HCQTPhaseDiff(name='dphase', sr=sr,
                                          hop_length=hop_length,
                                          fmin=f_min, n_octaves=n_octaves,
                                          over_sample=over_sample,
                                          harmonics=harmonics, log=True)
    pump = pumpp.Pump(p_phdif)
    return pump

############################################################

def compute_pump_features(pump, audio_fpath):
    data = pump(audio_f=audio_fpath)
    return data

############################################################

def get_mpe_prediction(model, audio_file=None):
    """Generate multi-pitch salience from a model given an audio file.

    Parts of this function are adapted from deepsalience.
    """
    split_value = 2500

    if audio_file is not None:
        pump = create_pump_object()
        features = compute_pump_features(pump, audio_file)
        input_hcqt = features['dphase/mag'][0]
        input_dphase = features['dphase/dphase'][0]
    else:
        raise ValueError("An audio_file must be specified")

    input_hcqt = input_hcqt.transpose(1, 2, 0)[np.newaxis, :, :, :]
    input_dphase = input_dphase.transpose(1, 2, 0)[np.newaxis, :, :, :]

    n_t = input_hcqt.shape[3]
    t_slices = list(np.arange(0, n_t, split_value))
    output_list = []

    # Predict in slices of split_value frames to bound memory use.
    for t in t_slices:
        p = model.predict(
            [np.transpose(input_hcqt[:, :, :, t:t + split_value],
                          (0, 1, 3, 2)),
             np.transpose(input_dphase[:, :, :, t:t + split_value],
                          (0, 1, 3, 2))])[0, :, :]
        output_list.append(p)

    predicted_output = np.hstack(output_list).astype(np.float32)
    return predicted_output
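############################################################

def _describe_pump_features(audio_fpath):
    """Illustrative helper, not part of the original pipeline.

    Computes the HCQT features for one file and returns the shapes of
    the arrays get_mpe_prediction consumes. The (n_frames, 360, 5)
    shapes noted below are an assumption inferred from how
    get_mpe_prediction indexes the pump output, not documented pumpp
    behaviour.
    """
    pump = create_pump_object()
    features = compute_pump_features(pump, audio_fpath)
    mag = features['dphase/mag'][0]        # expected (n_frames, 360, 5)
    dphase = features['dphase/dphase'][0]  # expected (n_frames, 360, 5)
    return mag.shape, dphase.shape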
############################################################

def get_va_prediction(model, f0_matrix):
    """Split a (360, T) salience map into 256-frame chunks, run voice
    assignment in batches of 24 chunks, and return four (360, T) voice
    maps (soprano, alto, tenor, bass)."""
    # Zero-pad the time axis up to the next multiple of 256 frames.
    splits_diff = -f0_matrix.shape[1] % 256
    fill = np.zeros((360, splits_diff))
    mix_filled = np.concatenate((np.copy(f0_matrix), fill), axis=1)
    mix_filled = np.reshape(mix_filled,
                            (360, -1, 256, 1)).transpose((1, 0, 2, 3))

    batches = math.ceil(mix_filled.shape[0] / 24)

    s_pred_result = np.zeros((0, 360, 256))
    a_pred_result = np.zeros((0, 360, 256))
    t_pred_result = np.zeros((0, 360, 256))
    b_pred_result = np.zeros((0, 360, 256))

    for i in range(batches):
        s_pred, a_pred, t_pred, b_pred = model.predict(
            mix_filled[i * 24:(i + 1) * 24])
        s_pred_result = np.append(s_pred_result, s_pred, axis=0)
        a_pred_result = np.append(a_pred_result, a_pred, axis=0)
        t_pred_result = np.append(t_pred_result, t_pred, axis=0)
        b_pred_result = np.append(b_pred_result, b_pred, axis=0)

    # Post-process each voice and trim the padding off the time axis.
    s_pred_result = prediction_postproc(s_pred_result)[:, :f0_matrix.shape[1]]
    a_pred_result = prediction_postproc(a_pred_result)[:, :f0_matrix.shape[1]]
    t_pred_result = prediction_postproc(t_pred_result)[:, :f0_matrix.shape[1]]
    b_pred_result = prediction_postproc(b_pred_result)[:, :f0_matrix.shape[1]]

    return s_pred_result, a_pred_result, t_pred_result, b_pred_result

############################################################

def cq2m(audiofile, mpe=None, va=None):
    """Full pipeline: audio file -> multi-pitch estimation -> voice
    assignment -> four-part MIDI. Returns the path of the written file."""
    # Build the default models lazily rather than at import time.
    if mpe is None:
        mpe = late_deep_cnn_model()
    if va is None:
        va = mask_voas_cnn_model()

    mpe_pred = get_mpe_prediction(mpe, audiofile)
    s_pred, a_pred, t_pred, b_pred = get_va_prediction(va, mpe_pred)
    midi = song_to_midi(s_pred, a_pred, t_pred, b_pred)
    return midi

############################################################
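# Minimal command-line entry point: a sketch, not part of the original
# module. It assumes the default models from cq2m_models can be built on
# this machine and that the single argument is a path to an audio file.
if __name__ == "__main__":
    import sys

    if len(sys.argv) != 2:
        sys.exit(f"usage: {sys.argv[0]} <audio_file>")
    print("Wrote", cq2m(sys.argv[1]))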