import os
import math
import mido
import pumpp
import librosa
import numpy as np
import pandas as pd
from copy import deepcopy
from scipy.ndimage import gaussian_filter1d
from cqfe_models import mask_voas_cnn_v2_model, late_deep_cnn_model
# Per-voice salience thresholds (Soprano, Alto, Tenor, Bass) used during prediction post-processing
SATB_THRESHOLDS = [0.23, 0.17, 0.15, 0.17]
############################################################
freqscale = librosa.cqt_frequencies(n_bins=360, fmin=32.7, bins_per_octave=60)
def bin_to_freq(bin_idx):
    return freqscale[bin_idx]
vec_bin_to_freq = np.vectorize(bin_to_freq)
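# Illustrative sanity check (not part of the original pipeline): with fmin = 32.7 Hz and
# 60 bins per octave, frequencies follow fmin * 2**(bin/60), so bin 0 is 32.7 Hz (C1) and
# bin 60 is one octave up at ~65.4 Hz.
# assert np.isclose(bin_to_freq(60), 2 * 32.7, rtol=1e-3)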
############################################################
def downsample_bins(voice):
    """Collapse a (time, 360) salience map (5 bins per semitone) to 69 semitone bins and
    one-hot the per-frame argmax, treating bin 0 as unvoiced."""
voice_0 = np.array(voice.T[0::5]).T
voice_1 = np.array(voice.T[1::5]).T
voice_2 = np.array(voice.T[2::5]).T
voice_3 = np.array(voice.T[3::5]).T
voice_4 = np.array(voice.T[4::5]).T
voice_0 = voice_0.T[1:70].T
voice_1 = voice_1.T[1:70].T
voice_2 = voice_2.T[1:70].T
voice_3 = voice_3.T[0:69].T
voice_4 = voice_4.T[0:69].T
voice_sums = voice_0 + voice_1 + voice_2 + voice_3 + voice_4
voice_argmax = np.argmax(voice_sums, axis=1)
threshold = np.zeros(voice_sums.shape)
threshold[np.arange(voice_argmax.size), voice_argmax] = 1
threshold[:, 0] = 0
voice_sums = threshold
return voice_sums
############################################################
def bin_matrix_to_freq(matrix):
    """Convert four stacked (360, time) salience maps into a (4, time) matrix of per-frame F0s in Hz."""
s_freqs = vec_bin_to_freq(np.argmax(matrix[0], axis=0)).reshape(-1, 1)
a_freqs = vec_bin_to_freq(np.argmax(matrix[1], axis=0)).reshape(-1, 1)
t_freqs = vec_bin_to_freq(np.argmax(matrix[2], axis=0)).reshape(-1, 1)
b_freqs = vec_bin_to_freq(np.argmax(matrix[3], axis=0)).reshape(-1, 1)
freqs = np.concatenate((s_freqs, a_freqs, t_freqs, b_freqs), axis=1).T
return freqs
############################################################
def create_midi(freq, write_path='./midi_track.mid', ticks_per_beat=58,
                tempo=90, save_to_file=True, program=53, channel=0):
    """Render a per-frame F0 contour (Hz) into a single-track MIDI file and return it."""
    def freq_to_list(freq):
        # Each event is a (pitch, velocity, delta_time_in_ticks) tuple.
        T = freq.shape[0]
        # MIDI note number: 69 + 12*log2(f/440); e.g. 440 Hz -> 69 (A4).
        # The lowest CQT bin (32.7 Hz, the "no pitch" bin) rounds to MIDI 24 and is treated as silence.
        midi_freqs = np.round(69 + 12*np.log2(freq/440)).squeeze().astype('int')
        t_last = 0
        pitch_tm1 = 20
        list_event = []
        for t in range(T):
            pitch_t = midi_freqs[t]
            if pitch_t != pitch_tm1:
                velocity = 127
                if pitch_t == 24:
                    # Silence: close the previous note without starting a new one.
                    pitch_t = 0
                    velocity = 0
                t_event = t - t_last
                t_last = t
                list_event.append((pitch_tm1, 0, t_event))
                list_event.append((pitch_t, velocity, 0))
                pitch_tm1 = pitch_t
        list_event.append((pitch_tm1, 0, T - t_last))
        return list_event
    # Tempo
    microseconds_per_beat = mido.bpm2tempo(tempo)
    # Write the piano roll to a MIDI file
    mid = mido.MidiFile()
    mid.ticks_per_beat = ticks_per_beat
    # Add a new track with the instrument name to the MIDI file
    track = mid.add_track("Voice Aah")
    # Transform the F0 contour into a list of (pitch, velocity, delta_time) events
    events = freq_to_list(freq)
# Tempo
track.append(mido.MetaMessage('set_tempo', tempo=microseconds_per_beat))
track.append(mido.MetaMessage('channel_prefix', channel=channel))
    # Add the program_change message (mido programs are 0-indexed:
    # Choir Aahs = 52, Voice Oohs = 53, Synth Voice = 54)
track.append(mido.Message('program_change', program=program, channel=channel))
    # notes_on_list tracks which notes are currently sounding, so that a note whose
    # intensity changes is switched off exactly once before being re-triggered.
    # Example event sequence: (60, 20, 0), (60, 40, 10), (60, 0, 15)
notes_on_list = []
# Write events in the midi file
for event in events:
pitch, velocity, time = event
if velocity == 0:
# Get the channel
track.append(mido.Message('note_off', note=pitch, velocity=0, time=time, channel=channel))
if(pitch in notes_on_list):
notes_on_list.remove(pitch)
else:
if pitch in notes_on_list:
track.append(mido.Message('note_off', note=pitch, velocity=0, time=time, channel=channel))
notes_on_list.remove(pitch)
time = 0
track.append(mido.Message('note_on', note=pitch, velocity=velocity, time=time, channel=channel))
notes_on_list.append(pitch)
if save_to_file:
mid.save(write_path)
return mid
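# Minimal usage sketch (illustrative only; the frequency values and output path are
# placeholders): render a two-note contour, 50 frames of 220 Hz followed by 50 frames of 440 Hz.
# create_midi(np.array([220.0] * 50 + [440.0] * 50), write_path='./example.mid')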
############################################################
def song_to_midi(sop, alto, ten, bass):
    """Write the four voice predictions to a single multi-track MIDI file and return its path."""
    savepath = './output.mid'
bin_matrix = np.array([sop, alto, ten, bass])
freq_matrix = bin_matrix_to_freq(bin_matrix)
mid_sop = create_midi(freq_matrix[0], save_to_file=False, program=52, channel=0)
mid_alto = create_midi(freq_matrix[1], save_to_file=False, program=52, channel=1)
mid_ten = create_midi(freq_matrix[2], save_to_file=False, program=52, channel=2)
mid_bass = create_midi(freq_matrix[3], save_to_file=False, program=52, channel=3)
mid_mix = mido.MidiFile()
    mid_mix.ticks_per_beat = mid_sop.ticks_per_beat
mid_mix.tracks = mid_sop.tracks + mid_alto.tracks + mid_ten.tracks + mid_bass.tracks
mid_mix.save(savepath)
return savepath
############################################################
def song_to_dataframe(sop, alto, ten, bass):
    """Build a DataFrame of per-frame F0 values (Hz) for each voice against time in seconds."""
    # Frame period = hop_length / sr = 256 / 22050 ≈ 0.011609977 s
    timescale = np.arange(0, 0.011609977 * (sop.shape[1]), 0.011609977)[:sop.shape[1]]
s_argmax = vec_bin_to_freq(np.argmax(sop, axis=0))
a_argmax = vec_bin_to_freq(np.argmax(alto, axis=0))
t_argmax = vec_bin_to_freq(np.argmax(ten, axis=0))
b_argmax = vec_bin_to_freq(np.argmax(bass, axis=0))
data = np.array([timescale, s_argmax, a_argmax, t_argmax, b_argmax], dtype=np.float32).T
columns = ['Timestep', 'Soprano', 'Alto', 'Tenor', 'Bass']
df = pd.DataFrame(data, columns=columns)
return df
############################################################
def prediction_postproc(input_array, argmax_and_threshold=True,
                        gaussian_blur=True,
                        threshold_value=0):
    """Flatten batched (chunks, 360, 256) predictions to a (360, time) map, optionally keep only
    the per-frame argmax above threshold_value as a one-hot map, then blur and normalise."""
    prediction = np.moveaxis(input_array, 0, 1).reshape(360, -1)
    thres_reference = deepcopy(prediction)
if(argmax_and_threshold):
prediction = np.argmax(prediction, axis=0)
prediction = np.array([prediction[i] if thres_reference[prediction[i], i] >= threshold_value else 0 for i in np.arange(prediction.size)])
threshold = np.zeros((360, prediction.shape[0]))
threshold[prediction, np.arange(prediction.size)] = 1
prediction = threshold
if(gaussian_blur):
prediction = np.array(gaussian_filter1d(prediction, 1, axis=0, mode='wrap'))
prediction = (prediction - np.min(prediction))/(np.max(prediction)-np.min(prediction))
return prediction
############################################################
def get_hcqt_params():
bins_per_octave = 60
n_octaves = 6
over_sample = 5
harmonics = [1, 2, 3, 4, 5]
sr = 22050
fmin = 32.7
hop_length = 256
return bins_per_octave, n_octaves, harmonics, sr, fmin, hop_length, over_sample
############################################################
def create_pump_object():
(bins_per_octave, n_octaves, harmonics,
sr, f_min, hop_length, over_sample) = get_hcqt_params()
p_phdif = pumpp.feature.HCQTPhaseDiff(name='dphase', sr=sr, hop_length=hop_length,
fmin=f_min, n_octaves=n_octaves, over_sample=over_sample, harmonics=harmonics, log=True)
pump = pumpp.Pump(p_phdif)
return pump
############################################################
def compute_pump_features(pump, audio_fpath):
data = pump(audio_f=audio_fpath)
return data
############################################################
def get_mpe_prediction(model, audio_file=None):
"""Generate output from a model given an input numpy file.
Part of this function is part of deepsalience
"""
split_value = 4000
if audio_file is not None:
pump = create_pump_object()
features = compute_pump_features(pump, audio_file)
input_hcqt = features['dphase/mag'][0]
input_dphase = features['dphase/dphase'][0]
    else:
        raise ValueError("audio_file must be specified")
input_hcqt = input_hcqt.transpose(1, 2, 0)[np.newaxis, :, :, :]
input_dphase = input_dphase.transpose(1, 2, 0)[np.newaxis, :, :, :]
n_t = input_hcqt.shape[3]
t_slices = list(np.arange(0, n_t, split_value))
output_list = []
for t in t_slices:
p = model.predict([np.transpose(input_hcqt[:, :, :, t:t+split_value], (0, 1, 3, 2)),
np.transpose(input_dphase[:, :, :, t:t+split_value], (0, 1, 3, 2))]
)[0, :, :]
output_list.append(p)
predicted_output = np.hstack(output_list).astype(np.float32)
return predicted_output
############################################################
def get_va_prediction(model, f0_matrix):
    """Split a (360, time) multi-pitch salience map into 256-frame chunks, run the voice
    assignment model, and return post-processed (360, time) maps for S, A, T and B."""
    # Zero-pad the time axis so it reshapes into whole 256-frame chunks.
    splits = f0_matrix.shape[1]//256
    splits_diff = 256 - (f0_matrix.shape[1] - splits * 256)
    fill = np.zeros((360, splits_diff))
    mix_filled = np.concatenate((np.copy(f0_matrix), fill), axis=1)
    # Reshape the padded map into (n_chunks, 360, 256, 1) for the model.
    mix_filled = np.reshape(mix_filled, (360, -1, 256, 1)).transpose((1, 0, 2, 3))
    # Predict in batches of up to 24 chunks to bound memory usage.
    batches = math.ceil(mix_filled.shape[0]/24)
s_pred_result = np.zeros((0, 360, 256))
a_pred_result = np.zeros((0, 360, 256))
t_pred_result = np.zeros((0, 360, 256))
b_pred_result = np.zeros((0, 360, 256))
for i in range(batches):
s_pred, a_pred, t_pred, b_pred = model.predict(mix_filled[i*24:(i+1)*24])
s_pred_result = np.append(s_pred_result, s_pred, axis=0)
a_pred_result = np.append(a_pred_result, a_pred, axis=0)
t_pred_result = np.append(t_pred_result, t_pred, axis=0)
b_pred_result = np.append(b_pred_result, b_pred, axis=0)
s_pred_result = prediction_postproc(s_pred_result, threshold_value=SATB_THRESHOLDS[0])[:, :f0_matrix.shape[1]]
a_pred_result = prediction_postproc(a_pred_result, threshold_value=SATB_THRESHOLDS[1])[:, :f0_matrix.shape[1]]
t_pred_result = prediction_postproc(t_pred_result, threshold_value=SATB_THRESHOLDS[2])[:, :f0_matrix.shape[1]]
b_pred_result = prediction_postproc(b_pred_result, threshold_value=SATB_THRESHOLDS[3])[:, :f0_matrix.shape[1]]
return s_pred_result, a_pred_result, t_pred_result, b_pred_result
############################################################
def cqfe(audiofile, mpe=None, va=None):
    """Run the full CQFE pipeline on an audio file: multi-pitch estimation, voice assignment,
    and export to MIDI, CSV and HDF5. Returns the list of output paths and a summary plot."""
    # Build the default models lazily so that importing this module does not construct them.
    if mpe is None:
        mpe = late_deep_cnn_model()
    if va is None:
        va = mask_voas_cnn_v2_model()
    savepath_csv = './output.csv'
    savepath_hdf5 = './output.hdf5'
mpe_pred = get_mpe_prediction(mpe, audiofile)
s_pred, a_pred, t_pred, b_pred = get_va_prediction(va, mpe_pred)
output_midi = song_to_midi(s_pred, a_pred, t_pred, b_pred)
output_df = song_to_dataframe(s_pred, a_pred, t_pred, b_pred)
output_df.to_csv(savepath_csv, mode='w', header=True)
output_df.to_hdf(savepath_hdf5, key='F0', mode='w', complevel=9, complib='blosc', append=False, format='table')
    ax1 = output_df.plot.scatter(x='Timestep', y='Bass', s=1, color='#2f29e3', label='Bass')
    output_df.plot.scatter(x='Timestep', y='Tenor', s=1, color='#e36129', label='Tenor', ax=ax1)
    output_df.plot.scatter(x='Timestep', y='Alto', s=1, color='#29e35a', label='Alto', ax=ax1)
    output_df.plot.scatter(x='Timestep', y='Soprano', s=1, color='#d3d921', label='Soprano', ax=ax1)
ax1.set_xlabel('Time (s)')
ax1.set_ylabel('Freq (Hz)')
fig = ax1.get_figure()
fig.set_dpi(150)
return [output_midi, savepath_csv, savepath_hdf5], fig
############################################################
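# Minimal command-line sketch (illustrative; assumes an audio file path is passed as the
# first argument and that the default models defined in cqfe_models are available):
if __name__ == "__main__":
    import sys
    output_files, figure = cqfe(sys.argv[1])
    print("Output files:", output_files)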