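"""Convert an audio recording of four-part vocal music to MIDI: a multiple-F0
estimation (MPE) model predicts a pitch-salience map, a voice-assignment (VA)
model splits it into soprano, alto, tenor and bass, and the result is written
out as a four-track MIDI file."""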
import os
import math
import mido
import pumpp
import numpy as np
from scipy.ndimage import gaussian_filter1d
from cq2m_models import mask_voas_cnn_model, late_deep_cnn_model

############################################################

def downsample_bins(voice):
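    """Collapse a (time, 360) salience map with 5 bins per semitone into a
    (time, 69) semitone-resolution roll, keeping only the strongest semitone
    per frame (bin 0 is treated as silence and zeroed out)."""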
    # Split the 360 bins into their 5 sub-semitone lanes.
    voice_0 = voice.T[0::5].T
    voice_1 = voice.T[1::5].T
    voice_2 = voice.T[2::5].T
    voice_3 = voice.T[3::5].T
    voice_4 = voice.T[4::5].T

    # Offset the lanes by one bin so each output bin collects the five
    # sub-bins centred on one semitone, then sum their salience.
    voice_0 = voice_0.T[1:70].T
    voice_1 = voice_1.T[1:70].T
    voice_2 = voice_2.T[1:70].T
    voice_3 = voice_3.T[0:69].T
    voice_4 = voice_4.T[0:69].T

    voice_sums = voice_0 + voice_1 + voice_2 + voice_3 + voice_4

    # Binarise: keep only the strongest semitone per frame and treat
    # bin 0 as silence.
    voice_argmax = np.argmax(voice_sums, axis=1)
    threshold = np.zeros(voice_sums.shape)
    threshold[np.arange(voice_argmax.size), voice_argmax] = 1
    threshold[:, 0] = 0

    return threshold

############################################################

def create_midi(pr, write_path='./midi_track.mid', ticks_per_beat=58,
                tempo=90, save_to_file=True, program=53, channel=0):
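    """Render a (time, pitches) piano roll as a single-track mido.MidiFile.
    One frame of the roll corresponds to one MIDI tick. Returns the MidiFile
    and, when save_to_file is True, also writes it to write_path."""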

    def pr_to_list(pr):
        # Convert the piano roll into a list of (pitch, velocity, delta_time)
        # events, emitting an event whenever a cell changes between frames.
        T, N = pr.shape
        t_last = 0
        pr_tm1 = np.zeros(N)
        list_event = []
        for t in range(T):
            pr_t = pr[t]
            mask = (pr_t != pr_tm1)
            if mask.any():
                for n in range(N):
                    if mask[n]:
                        # 69-bin rolls are already at semitone resolution;
                        # wider rolls carry 5 bins per semitone.
                        if N <= 72:
                            pitch = 25 + n
                        else:
                            pitch = 24 + round(n / 5)
                        # Binarise the velocity: full volume or note off.
                        if int(pr_t[n] * 127) >= 50:
                            velocity = 127
                        else:
                            velocity = 0
                        # Time is the tick increment since the last event.
                        t_event = t - t_last
                        t_last = t
                        list_event.append((pitch, velocity, t_event))
            pr_tm1 = pr_t
        # Closing dummy event so the track spans the full duration.
        list_event.append((0, 0, T - t_last))
        return list_event
    # Tempo in microseconds per beat, as MIDI expects
    microseconds_per_beat = mido.bpm2tempo(tempo)
    # Write the piano roll into a MIDI file
    mid = mido.MidiFile()
    mid.ticks_per_beat = ticks_per_beat

    # Add a new track with the instrument name to the MIDI file
    track = mid.add_track("Voice Aah")
    # Transform the matrix into a list of (pitch, velocity, time) events
    events = pr_to_list(pr)
    # Tempo
    track.append(mido.MetaMessage('set_tempo', tempo=microseconds_per_beat))
    track.append(mido.MetaMessage('channel_prefix', channel=channel))
    # Add the program change
    # GM voices: Choir Aahs = 53, Voice Oohs (or Doos) = 54, Synth Choir = 55
    track.append(mido.Message('program_change', program=program, channel=channel))

    # Track the currently sounding notes so that a note which is on, has its
    # intensity modified, and then goes off is shut down exactly once, e.g.:
    # (60, 20, 0)
    # (60, 40, 10)
    # (60, 0, 15)
    notes_on_list = []
    # Write the events into the MIDI track
    for event in events:
        pitch, velocity, time = event
        if velocity == 0:
            track.append(mido.Message('note_off', note=pitch, velocity=0, time=time, channel=channel))
            if pitch in notes_on_list:
                notes_on_list.remove(pitch)
        else:
            # If the note is already sounding, end it before re-triggering.
            if pitch in notes_on_list:
                track.append(mido.Message('note_off', note=pitch, velocity=0, time=time, channel=channel))
                notes_on_list.remove(pitch)
                time = 0
            track.append(mido.Message('note_on', note=pitch, velocity=velocity, time=time, channel=channel))
            notes_on_list.append(pitch)
    if save_to_file:
        mid.save(write_path)
    return mid

############################################################

def song_to_midi(sop, alto, ten, bass):
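    """Downsample four (360, time) voice predictions to semitone rolls and
    write them as a four-track MIDI file; returns the output path."""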

    savepath = './output.mid'

    # Transpose to (time, 360) and collapse to semitone resolution.
    down_sop = downsample_bins(sop.T)
    down_alto = downsample_bins(alto.T)
    down_ten = downsample_bins(ten.T)
    down_bass = downsample_bins(bass.T)

    # One General MIDI program and channel per voice.
    mid_sop = create_midi(down_sop, save_to_file=False, program=52, channel=0)
    mid_alto = create_midi(down_alto, save_to_file=False, program=53, channel=1)
    mid_ten = create_midi(down_ten, save_to_file=False, program=49, channel=2)
    mid_bass = create_midi(down_bass, save_to_file=False, program=50, channel=3)

    # Merge the four single-track files into one multi-track MIDI file.
    mid_mix = mido.MidiFile()
    mid_mix.ticks_per_beat = mid_sop.ticks_per_beat
    mid_mix.tracks = mid_sop.tracks + mid_alto.tracks + mid_ten.tracks + mid_bass.tracks
    mid_mix.save(savepath)

    return savepath

############################################################

def prediction_postproc(input_array, argmax_and_threshold=True, gaussian_blur=True):
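    """Stitch batched (batch, 360, 256) model output back into a (360, time)
    matrix, optionally one-hot it via a per-frame argmax, and optionally
    smooth along the frequency axis with a Gaussian before rescaling to
    [0, 1]."""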
    prediction = np.moveaxis(input_array, 0, 1).reshape(360, -1)
    if argmax_and_threshold:
        # One-hot the strongest bin per frame; bins above 357 map to 0.
        prediction = np.argmax(prediction, axis=0)
        prediction = np.array([i if i <= 357 else 0 for i in prediction])
        threshold = np.zeros((360, prediction.shape[0]))
        threshold[prediction, np.arange(prediction.size)] = 1
        prediction = threshold
    if gaussian_blur:
        # Smooth along the frequency axis, then rescale to [0, 1].
        prediction = gaussian_filter1d(prediction, 1, axis=0, mode='wrap')
        prediction = (prediction - np.min(prediction)) / (np.max(prediction) - np.min(prediction))
    return prediction

############################################################

def get_hcqt_params():
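    """Return the HCQT analysis parameters: 60 bins per octave (5 per
    semitone) over 6 octaves from fmin = 32.7 Hz (~C1), harmonics 1-5,
    22050 Hz sample rate and a 256-sample hop."""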

    bins_per_octave = 60
    n_octaves = 6
    over_sample = 5
    harmonics = [1, 2, 3, 4, 5]
    sr = 22050
    fmin = 32.7
    hop_length = 256

    return bins_per_octave, n_octaves, harmonics, sr, fmin, hop_length, over_sample

############################################################

def create_pump_object():
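    """Build a pumpp Pump that extracts the log-magnitude HCQT and its
    phase differentials under the feature name 'dphase'."""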

    (bins_per_octave, n_octaves, harmonics,
     sr, f_min, hop_length, over_sample) = get_hcqt_params()

    p_phdif = pumpp.feature.HCQTPhaseDiff(name='dphase', sr=sr,
                                          hop_length=hop_length, fmin=f_min,
                                          n_octaves=n_octaves,
                                          over_sample=over_sample,
                                          harmonics=harmonics, log=True)

    pump = pumpp.Pump(p_phdif)

    return pump

############################################################

def compute_pump_features(pump, audio_fpath):
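    """Run the pump on an audio file and return its feature dictionary."""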

    data = pump(audio_f=audio_fpath)

    return data

############################################################

def get_mpe_prediction(model, audio_file=None):
    """Generate output from a model given an input numpy file.
       Part of this function is part of deepsalience
    """

    # Process the input in chunks of this many frames to bound memory use.
    split_value = 2500

    if audio_file is None:
        raise ValueError("An audio_file must be specified")

    pump = create_pump_object()
    features = compute_pump_features(pump, audio_file)
    input_hcqt = features['dphase/mag'][0]
    input_dphase = features['dphase/dphase'][0]

    # Add a batch dimension and move time to the last axis for slicing.
    input_hcqt = input_hcqt.transpose(1, 2, 0)[np.newaxis, :, :, :]
    input_dphase = input_dphase.transpose(1, 2, 0)[np.newaxis, :, :, :]

    n_t = input_hcqt.shape[3]
    t_slices = list(np.arange(0, n_t, split_value))
    output_list = []

    # Predict chunk by chunk, then stitch the chunks back along time.
    for t in t_slices:
        p = model.predict([np.transpose(input_hcqt[:, :, :, t:t+split_value], (0, 1, 3, 2)),
                           np.transpose(input_dphase[:, :, :, t:t+split_value], (0, 1, 3, 2))]
                          )[0, :, :]
        output_list.append(p)

    predicted_output = np.hstack(output_list).astype(np.float32)
    return predicted_output

############################################################

def get_va_prediction(model, f0_matrix):
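    """Run the voice-assignment model over a (360, time) salience matrix in
    256-frame chunks and return post-processed (360, time) predictions for
    soprano, alto, tenor and bass."""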
    # Zero-pad the time axis so it divides evenly into 256-frame chunks.
    splits = f0_matrix.shape[1] // 256
    splits_diff = 256 - (f0_matrix.shape[1] - splits * 256)
    fill = np.zeros((360, splits_diff))
    mix_filled = np.concatenate((np.copy(f0_matrix), fill), axis=1)
    # Reshape into (n_chunks, 360, 256, 1) for the model.
    mix_filled = np.reshape(mix_filled, (360, -1, 256, 1)).transpose((1, 0, 2, 3))
    batches = math.ceil(mix_filled.shape[0] / 24)

    s_pred_result = np.zeros((0, 360, 256))
    a_pred_result = np.zeros((0, 360, 256))
    t_pred_result = np.zeros((0, 360, 256))
    b_pred_result = np.zeros((0, 360, 256))

    # Run the voice-assignment model in batches of up to 24 chunks.
    for i in range(batches):
        s_pred, a_pred, t_pred, b_pred = model.predict(mix_filled[i*24:(i+1)*24])
        s_pred_result = np.append(s_pred_result, s_pred, axis=0)
        a_pred_result = np.append(a_pred_result, a_pred, axis=0)
        t_pred_result = np.append(t_pred_result, t_pred, axis=0)
        b_pred_result = np.append(b_pred_result, b_pred, axis=0)

    # Post-process each voice and trim the zero-padding back off.
    s_pred_result = prediction_postproc(s_pred_result)[:, :f0_matrix.shape[1]]
    a_pred_result = prediction_postproc(a_pred_result)[:, :f0_matrix.shape[1]]
    t_pred_result = prediction_postproc(t_pred_result)[:, :f0_matrix.shape[1]]
    b_pred_result = prediction_postproc(b_pred_result)[:, :f0_matrix.shape[1]]

    return s_pred_result, a_pred_result, t_pred_result, b_pred_result

############################################################

def cq2m(audiofile, mpe=None, va=None):
    """Full pipeline: audio -> multiple-F0 salience -> voice assignment ->
    four-track MIDI. Returns the path of the written MIDI file."""
    # Build default models lazily: default arguments are evaluated once at
    # import time, which would construct both networks even when unused.
    mpe = mpe if mpe is not None else late_deep_cnn_model()
    va = va if va is not None else mask_voas_cnn_model()
    mpe_pred = get_mpe_prediction(mpe, audiofile)
    s_pred, a_pred, t_pred, b_pred = get_va_prediction(va, mpe_pred)
    return song_to_midi(s_pred, a_pred, t_pred, b_pred)

############################################################
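
# Minimal usage sketch. "input.wav" is a placeholder path; the default MPE
# and VA models are assumed to load their weights inside cq2m_models.
if __name__ == "__main__":
    midi_path = cq2m("input.wav")
    print("MIDI written to", midi_path)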