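"""Utilities for the CQFE pipeline: multiple-F0 estimation (MPE) followed by
SATB voice assignment (VA) on choral audio, with MIDI, CSV and HDF5 export.
The model architectures are imported from cqfe_models."""
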
import math
import mido
import pumpp
import librosa
import numpy as np
import pandas as pd
from copy import deepcopy
from scipy.ndimage import gaussian_filter1d
from cqfe_models import mask_voas_cnn_v2_model, late_deep_cnn_model

# Per-voice salience thresholds (soprano, alto, tenor, bass) used when
# converting voice-assignment activations into one-hot F0 tracks.
SATB_THRESHOLDS = [0.23, 0.17, 0.15, 0.17]

############################################################

# Frequency of each of the 360 CQT bins: 6 octaves from C1 (32.7 Hz)
# at 60 bins per octave (5x oversampled semitones).
freqscale = librosa.cqt_frequencies(n_bins=360, fmin=32.7, bins_per_octave=60)

def bin_to_freq(bin_idx):
    """Map a CQT bin index (0-359) to its frequency in Hz."""
    return freqscale[bin_idx]

vec_bin_to_freq = np.vectorize(bin_to_freq)

############################################################

def downsample_bins(voice):
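    """Collapse 360 oversampled bins (axis 1) down to 69 semitone bins.

    The five interleaved sub-grids are summed per semitone (the slice
    offsets below keep them aligned), then the sums are reduced to a
    per-frame one-hot argmax, with bin 0 ("no pitch") zeroed out.
    """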
    voice_0 = np.array(voice.T[0::5]).T
    voice_1 = np.array(voice.T[1::5]).T
    voice_2 = np.array(voice.T[2::5]).T
    voice_3 = np.array(voice.T[3::5]).T
    voice_4 = np.array(voice.T[4::5]).T

    voice_0 = voice_0.T[1:70].T
    voice_1 = voice_1.T[1:70].T
    voice_2 = voice_2.T[1:70].T
    voice_3 = voice_3.T[0:69].T
    voice_4 = voice_4.T[0:69].T

    voice_sums = voice_0 + voice_1 + voice_2 + voice_3 + voice_4
    voice_argmax = np.argmax(voice_sums, axis=1)
    threshold = np.zeros(voice_sums.shape)
    threshold[np.arange(voice_argmax.size), voice_argmax] = 1
    threshold[:, 0] = 0
    voice_sums = threshold

    return voice_sums

############################################################

def bin_matrix_to_freq(matrix):
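    """Convert four (360, T) one-hot voice matrices into a (4, T) array of
    per-frame F0 values in Hz, ordered soprano, alto, tenor, bass."""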
    s_freqs = vec_bin_to_freq(np.argmax(matrix[0], axis=0)).reshape(-1, 1)
    a_freqs = vec_bin_to_freq(np.argmax(matrix[1], axis=0)).reshape(-1, 1)
    t_freqs = vec_bin_to_freq(np.argmax(matrix[2], axis=0)).reshape(-1, 1)
    b_freqs = vec_bin_to_freq(np.argmax(matrix[3], axis=0)).reshape(-1, 1)
        
    freqs = np.concatenate((s_freqs, a_freqs, t_freqs, b_freqs), axis=1).T
    return freqs

############################################################

def create_midi(freq, write_path='./midi_track.mid', ticks_per_beat=58,
                tempo=90, save_to_file=True, program=53, channel=0):
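    """Write a single-voice F0 track (one Hz value per frame) to a MIDI file.

    Frame indices are used directly as MIDI tick times; the defaults
    (ticks_per_beat=58 at 90 bpm, i.e. 87 ticks/s) roughly match the
    analysis frame rate of 22050/256 ≈ 86.1 frames/s. CQT bin 0 converts
    to MIDI pitch 24 and is treated as silence.
    """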
    
    def freq_to_list(freq):
        # Encode the frame-wise F0 track as a list of (pitch, velocity, time)
        # events, where time is the tick offset since the previous event
        T = freq.shape[0]
        midi_freqs = np.round(69 + 12 * np.log2(freq / 440)).squeeze().astype('int')
        t_last = 0
        pitch_tm1 = 20  # sentinel below the vocal range, so the first frame always emits an event
        list_event = []
        for t in range(T):
            pitch_t = midi_freqs[t]
            if pitch_t != pitch_tm1:
                velocity = 127
                if pitch_t == 24:
                    # MIDI pitch 24 corresponds to CQT bin 0 (fmin = 32.7 Hz),
                    # which the models use to mark "no active pitch"
                    pitch_t = 0
                    velocity = 0
                t_event = t - t_last
                t_last = t
                list_event.append((pitch_tm1, 0, t_event))  # close the previous note
                list_event.append((pitch_t, velocity, 0))   # open the new one
            pitch_tm1 = pitch_t
        list_event.append((pitch_tm1, 0, T - t_last))       # close the final note
        return list_event
    # Tempo
    microseconds_per_beat = mido.bpm2tempo(tempo)

    # Write the piano roll into a MIDI file
    mid = mido.MidiFile()
    mid.ticks_per_beat = ticks_per_beat

    # Add a new track with the instrument name
    track = mid.add_track("Voice Aah")
    # Transform the F0 track into a list of (pitch, velocity, time) events
    events = freq_to_list(freq)
    # Tempo and channel metadata
    track.append(mido.MetaMessage('set_tempo', tempo=microseconds_per_beat))
    track.append(mido.MetaMessage('channel_prefix', channel=channel))
    # Program change. General MIDI: Choir Aahs = 53, Voice Oohs = 54,
    # Synth Choir = 55 (1-indexed; mido programs are 0-indexed).
    track.append(mido.Message('program_change', program=program, channel=channel))
    # Track currently sounding notes so that a note which is struck again
    # (on, intensity change, off) is shut down exactly once, e.g.:
    # (60, 20, 0)
    # (60, 40, 10)
    # (60, 0, 15)
    notes_on_list = []
    # Write the events into the track
    for event in events:
        pitch, velocity, time = event
        if velocity == 0:
            track.append(mido.Message('note_off', note=pitch, velocity=0, time=time, channel=channel))
            if pitch in notes_on_list:
                notes_on_list.remove(pitch)
        else:
            if pitch in notes_on_list:
                # Close the still-sounding instance before re-striking it
                track.append(mido.Message('note_off', note=pitch, velocity=0, time=time, channel=channel))
                notes_on_list.remove(pitch)
                time = 0
            track.append(mido.Message('note_on', note=pitch, velocity=velocity, time=time, channel=channel))
            notes_on_list.append(pitch)
    if save_to_file:
        mid.save(write_path)
    return mid

############################################################

def song_to_midi(sop, alto, ten, bass):
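    """Render the four voices as a four-track, four-channel MIDI file and
    return the path it was written to."""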

    savepath = './output.mid'

    bin_matrix = np.array([sop, alto, ten, bass])
    freq_matrix = bin_matrix_to_freq(bin_matrix)

    mid_sop = create_midi(freq_matrix[0], save_to_file=False, program=52, channel=0)
    mid_alto = create_midi(freq_matrix[1], save_to_file=False, program=52, channel=1)
    mid_ten = create_midi(freq_matrix[2], save_to_file=False, program=52, channel=2)
    mid_bass = create_midi(freq_matrix[3], save_to_file=False, program=52, channel=3)

    # Merge the four single-track files into one multi-track MIDI file
    mid_mix = mido.MidiFile()
    mid_mix.ticks_per_beat = mid_sop.ticks_per_beat
    mid_mix.tracks = mid_sop.tracks + mid_alto.tracks + mid_ten.tracks + mid_bass.tracks
    mid_mix.save(savepath)

    return savepath

############################################################

def song_to_dataframe(sop, alto, ten, bass):
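    """Tabulate per-frame time (s) against the F0 (Hz) of each voice."""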

    # One timestamp per frame: hop_length / sr = 256 / 22050 ≈ 0.011609977 s
    timescale = np.arange(0, 0.011609977 * sop.shape[1], 0.011609977)[:sop.shape[1]]

    s_argmax = vec_bin_to_freq(np.argmax(sop, axis=0))
    a_argmax = vec_bin_to_freq(np.argmax(alto, axis=0))
    t_argmax = vec_bin_to_freq(np.argmax(ten, axis=0))
    b_argmax = vec_bin_to_freq(np.argmax(bass, axis=0))

    data = np.array([timescale, s_argmax, a_argmax, t_argmax, b_argmax], dtype=np.float32).T
    columns = ['Timestep', 'Soprano', 'Alto', 'Tenor', 'Bass']

    df = pd.DataFrame(data, columns=columns)

    return df

############################################################

def prediction_postproc(input_array, argmax_and_threshold=True,
                        gaussian_blur=True,
                        threshold_value=0):
    # Stitch the (chunks, 360, 256) model output back into one (360, T) matrix
    prediction = np.moveaxis(input_array, 0, 1).reshape(360, -1)
    thres_reference = deepcopy(prediction)
    if argmax_and_threshold:
        # Keep the per-frame argmax bin only where its salience clears the
        # threshold; frames below it fall back to bin 0 ("no pitch")
        prediction = np.argmax(prediction, axis=0)
        prediction = np.array([prediction[i] if thres_reference[prediction[i], i] >= threshold_value else 0
                               for i in np.arange(prediction.size)])
        threshold = np.zeros((360, prediction.shape[0]))
        threshold[prediction, np.arange(prediction.size)] = 1
        prediction = threshold
    if gaussian_blur:
        # Blur along the frequency axis and rescale to [0, 1]
        prediction = gaussian_filter1d(prediction, 1, axis=0, mode='wrap')
        prediction = (prediction - np.min(prediction)) / (np.max(prediction) - np.min(prediction))
    return prediction

############################################################

def get_hcqt_params():
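    """HCQT analysis parameters shared by the feature extractor and the models."""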

    bins_per_octave = 60
    n_octaves = 6
    over_sample = 5
    harmonics = [1, 2, 3, 4, 5]
    sr = 22050
    fmin = 32.7
    hop_length = 256

    return bins_per_octave, n_octaves, harmonics, sr, fmin, hop_length, over_sample

############################################################

def create_pump_object():
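    """Build a pumpp pipeline that extracts the HCQT log-magnitude and
    phase-differential features consumed by the MPE model."""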

    (bins_per_octave, n_octaves, harmonics,
     sr, f_min, hop_length, over_sample) = get_hcqt_params()

    p_phdif = pumpp.feature.HCQTPhaseDiff(name='dphase', sr=sr, hop_length=hop_length,
                                   fmin=f_min, n_octaves=n_octaves, over_sample=over_sample, harmonics=harmonics, log=True)

    pump = pumpp.Pump(p_phdif)

    return pump

############################################################

def compute_pump_features(pump, audio_fpath):
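    """Run the pump feature extractor on one audio file."""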

    data = pump(audio_f=audio_fpath)

    return data

############################################################

def get_mpe_prediction(model, audio_file=None):
    """Generate a multiple-F0 salience matrix from a model and an audio file.

    Parts of this function are adapted from deepsalience.
    """

    split_value = 4000  # frames per chunk, to bound memory use during inference

    if audio_file is None:
        raise ValueError("An audio_file must be specified")

    pump = create_pump_object()
    features = compute_pump_features(pump, audio_file)
    input_hcqt = features['dphase/mag'][0]
    input_dphase = features['dphase/dphase'][0]

    # Move time to the last axis and add a batch dimension
    input_hcqt = input_hcqt.transpose(1, 2, 0)[np.newaxis, :, :, :]
    input_dphase = input_dphase.transpose(1, 2, 0)[np.newaxis, :, :, :]

    n_t = input_hcqt.shape[3]
    t_slices = list(np.arange(0, n_t, split_value))
    output_list = []

    # Run the model chunk by chunk along the time axis
    for t in t_slices:
        p = model.predict([np.transpose(input_hcqt[:, :, :, t:t+split_value], (0, 1, 3, 2)),
                           np.transpose(input_dphase[:, :, :, t:t+split_value], (0, 1, 3, 2))]
                          )[0, :, :]
        output_list.append(p)

    # Concatenate the per-chunk saliences into one (360, T) matrix
    predicted_output = np.hstack(output_list).astype(np.float32)

    return predicted_output

############################################################

def get_va_prediction(model, f0_matrix):
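    """Assign the frames of a (360, T) multiple-F0 salience matrix to SATB
    voices, returning one thresholded (360, T) one-hot track per voice."""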
    # Zero-pad the (360, T) salience so T is a multiple of 256, then cut it
    # into (360, 256, 1) chunks for the voice-assignment model
    splits = f0_matrix.shape[1] // 256
    splits_diff = 256 - (f0_matrix.shape[1] - splits * 256)
    fill = np.zeros((360, splits_diff))
    mix_filled = np.concatenate((np.copy(f0_matrix), fill), axis=1)
    mix_filled = np.reshape(mix_filled, (360, -1, 256, 1)).transpose((1, 0, 2, 3))
    batches = math.ceil(mix_filled.shape[0] / 24)  # up to 24 chunks per predict call

    s_pred_result = np.zeros((0, 360, 256))
    a_pred_result = np.zeros((0, 360, 256))
    t_pred_result = np.zeros((0, 360, 256))
    b_pred_result = np.zeros((0, 360, 256))

    for i in range(batches):
        s_pred, a_pred, t_pred, b_pred = model.predict(mix_filled[i*24:(i+1)*24])
        s_pred_result = np.append(s_pred_result, s_pred, axis=0)
        a_pred_result = np.append(a_pred_result, a_pred, axis=0)
        t_pred_result = np.append(t_pred_result, t_pred, axis=0)
        b_pred_result = np.append(b_pred_result, b_pred, axis=0)

    # Re-assemble each voice, apply its per-voice threshold, and trim the padding
    s_pred_result = prediction_postproc(s_pred_result, threshold_value=SATB_THRESHOLDS[0])[:, :f0_matrix.shape[1]]
    a_pred_result = prediction_postproc(a_pred_result, threshold_value=SATB_THRESHOLDS[1])[:, :f0_matrix.shape[1]]
    t_pred_result = prediction_postproc(t_pred_result, threshold_value=SATB_THRESHOLDS[2])[:, :f0_matrix.shape[1]]
    b_pred_result = prediction_postproc(b_pred_result, threshold_value=SATB_THRESHOLDS[3])[:, :f0_matrix.shape[1]]

    return s_pred_result, a_pred_result, t_pred_result, b_pred_result

############################################################

def cqfe(audiofile, mpe=None, va=None):
    """Full pipeline: estimate multiple F0s from an audio file, assign them
    to SATB voices, and export MIDI, CSV and HDF5 files plus a plot."""
    # Instantiate the default models lazily; building them as default
    # arguments would load both networks at import time
    if mpe is None:
        mpe = late_deep_cnn_model()
    if va is None:
        va = mask_voas_cnn_v2_model()

    savepath_csv = './output.csv'
    savepath_hdf5 = './output.hdf5'

    mpe_pred = get_mpe_prediction(mpe, audiofile)
    s_pred, a_pred, t_pred, b_pred = get_va_prediction(va, mpe_pred)

    output_midi = song_to_midi(s_pred, a_pred, t_pred, b_pred)

    output_df = song_to_dataframe(s_pred, a_pred, t_pred, b_pred)
    output_df.to_csv(savepath_csv, mode='w', header=True)
    output_df.to_hdf(savepath_hdf5, key='F0', mode='w', complevel=9, complib='blosc', append=False, format='table')
    # Scatter-plot all four voices on a single pair of axes
    ax1 = output_df.plot.scatter(x='Timestep', y='Bass', s=1, color='#2f29e3', label='Bass')
    output_df.plot.scatter(x='Timestep', y='Tenor', s=1, color='#e36129', label='Tenor', ax=ax1)
    output_df.plot.scatter(x='Timestep', y='Alto', s=1, color='#29e35a', label='Alto', ax=ax1)
    output_df.plot.scatter(x='Timestep', y='Soprano', s=1, color='#d3d921', label='Soprano', ax=ax1)
    ax1.set_xlabel('Time (s)')
    ax1.set_ylabel('Freq (Hz)')
    fig = ax1.get_figure()
    fig.set_dpi(150)

    return [output_midi, savepath_csv, savepath_hdf5], fig

############################################################
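
# Example usage (a sketch; 'choir_mix.wav' is a hypothetical input path,
# and the default models are loaded inside cqfe):
#
#     (midi_path, csv_path, hdf5_path), fig = cqfe('choir_mix.wav')
#     fig.savefig('./output.png')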