Spaces:

Xornotor
/

Choral-Quartets-F0-Extractor

Runtime error

File size: 11,792 Bytes

import os
import math
import mido
import pumpp
import librosa
import numpy as np
import pandas as pd
from copy import deepcopy
from scipy.ndimage import gaussian_filter1d
from cqfe_models import mask_voas_cnn_v2_model, late_deep_cnn_model

# Per-voice thresholds, indexed 0-3 in soprano, alto, tenor, bass order; each is
# passed as `threshold_value` to prediction_postproc() by get_va_prediction().
SATB_THRESHOLDS = [0.23, 0.17, 0.15, 0.17]

############################################################

# Lookup table with one frequency per CQT bin, as computed by
# librosa.cqt_frequencies: 360 bins at 60 bins per octave (six octaves),
# starting at fmin=32.7.
freqscale = librosa.cqt_frequencies(n_bins=360, fmin=32.7, bins_per_octave=60)

def bin_to_freq(bin):
    """Return the entry of ``freqscale`` at index ``bin``."""
    return freqscale[bin]

# Element-wise wrapper so a whole array of bin indices can be mapped in one call.
vec_bin_to_freq = np.vectorize(bin_to_freq)

############################################################

def downsample_bins(voice):
    """Reduce the bin axis of ``voice`` by a factor of five to one-hot rows.

    The second axis of ``voice`` is split into its five interleaved
    sub-grids (every fifth bin, starting at offsets 0 to 4).  Offsets 0-2
    keep entries 1..69 of their sub-grid and offsets 3-4 keep entries
    0..68, so output column ``j`` adds up the five neighbouring input bins
    ``5*j + 3`` .. ``5*j + 7``.  For a 360-bin input that gives 69 columns.

    In every row only the column with the largest sum is set to 1.  Column 0
    is cleared afterwards, so a row whose maximum lands in column 0 (which
    includes an all-zero row) comes back as all zeros.

    Returns:
        Array of zeros and ones with the shape of the summed sub-grids.
    """
    # (offset on the bin axis, first kept entry, one past the last kept entry)
    layout = ((0, 1, 70), (1, 1, 70), (2, 1, 70), (3, 0, 69), (4, 0, 69))

    combined = None
    for offset, start, stop in layout:
        piece = np.array(voice.T[offset::5])[start:stop].T
        # Accumulate left to right, in offset order.
        combined = piece if combined is None else combined + piece

    winners = np.argmax(combined, axis=1)
    one_hot = np.zeros(combined.shape)
    one_hot[np.arange(winners.size), winners] = 1
    one_hot[:, 0] = 0
    return one_hot

############################################################

def bin_matrix_to_freq(matrix):
    """Turn four per-voice matrices into one row of frequencies per voice.

    For each of ``matrix[0]`` .. ``matrix[3]`` the index of the maximum along
    axis 0 is taken for every column and mapped through ``vec_bin_to_freq``.

    Returns:
        Array with four rows, in the same order as the entries of ``matrix``,
        and one column per column of the input matrices.
    """
    columns = []
    for voice in range(4):
        peak_bins = np.argmax(matrix[voice], axis=0)
        columns.append(vec_bin_to_freq(peak_bins).reshape(-1, 1))

    # Side-by-side column vectors, flipped so that each voice is a row.
    return np.concatenate(columns, axis=1).T

############################################################

def create_midi(freq, write_path='./midi_track.mid', ticks_per_beat=58,
                tempo=90, save_to_file=True, program=53, channel=0):
    """Build a one-track MIDI file from a frame-wise frequency sequence.

    Each entry of ``freq`` is converted to a MIDI note number with
    ``round(69 + 12 * log2(f / 440))``.  A frame whose note number is 24
    (e.g. 32.7 Hz) is treated as a rest.  Consecutive frames with the same
    note are merged into a single note; one frame corresponds to one MIDI
    tick.

    Args:
        freq: array of per-frame frequencies; its first axis is the frame axis.
        write_path: destination used when ``save_to_file`` is true.
        ticks_per_beat: value assigned to the file's ``ticks_per_beat``.
        tempo: tempo in beats per minute, converted with ``mido.bpm2tempo``.
        save_to_file: when true the file is written to ``write_path``.
        program: program number sent in the ``program_change`` message.
        channel: channel used for every message of the track.

    Returns:
        The ``mido.MidiFile`` that was built.
    """

    def freq_to_list(freq):
        """Return the note events as a list of (pitch, velocity, time) tuples."""
        T = freq.shape[0]
        # atleast_1d keeps a single-frame input indexable: squeeze() alone
        # would collapse it to a 0-d array and midi_freqs[t] would raise.
        midi_freqs = np.atleast_1d(
            np.round(69 + 12*np.log2(freq/440)).squeeze().astype('int'))
        t_last = 0
        pitch_tm1 = 20
        list_event = []
        for t in range(T):
            pitch_t = midi_freqs[t]
            velocity = 127
            if pitch_t == 24:
                # Note 24 marks a rest; it is stored as pitch 0 / velocity 0.
                pitch_t = 0
                velocity = 0
            # The change test runs AFTER the rest mapping.  Comparing the raw
            # value 24 against the stored 0 would report a "change" on every
            # silent frame and emit two redundant events per frame; the
            # elapsed time is the same either way because it is carried by
            # the next event's delta.
            if pitch_t != pitch_tm1:
                t_event = t - t_last
                t_last = t
                list_event.append((pitch_tm1, 0, t_event))
                list_event.append((pitch_t, velocity, 0))
            pitch_tm1 = pitch_t
        # Close whatever is sounding (or resting) at the end of the sequence.
        list_event.append((pitch_tm1, 0, T - t_last))
        return list_event

    # Tempo
    microseconds_per_beat = mido.bpm2tempo(tempo)
    # Write the events into a midi file
    mid = mido.MidiFile()
    mid.ticks_per_beat = ticks_per_beat

    # Add a new track with the instrument name to the midi file
    track = mid.add_track("Voice Aah")
    # Transform the frequency sequence into a list of (pitch, velocity, time)
    events = freq_to_list(freq)
    # Tempo
    track.append(mido.MetaMessage('set_tempo', tempo=microseconds_per_beat))
    track.append(mido.MetaMessage('channel_prefix', channel=channel))
    # Add the program_change
    # Choir Aahs = 53, Voice Oohs (or Doos) = 54, Synth Choir = 55
    track.append(mido.Message('program_change', program=program, channel=channel))

    # Pitches that currently have a note_on without a matching note_off.
    # It lets a note that is re-triggered while still sounding be switched
    # off exactly once before it is switched on again.
    notes_on_list = []
    # Write events in the midi file
    for pitch, velocity, time in events:
        if velocity == 0:
            track.append(mido.Message('note_off', note=pitch, velocity=0, time=time, channel=channel))
            if pitch in notes_on_list:
                notes_on_list.remove(pitch)
        else:
            if pitch in notes_on_list:
                track.append(mido.Message('note_off', note=pitch, velocity=0, time=time, channel=channel))
                notes_on_list.remove(pitch)
                # The delta time was consumed by the note_off just written.
                time = 0
            track.append(mido.Message('note_on', note=pitch, velocity=velocity, time=time, channel=channel))
            notes_on_list.append(pitch)
    if save_to_file:
        mid.save(write_path)
    return mid

############################################################

def song_to_midi(sop, alto, ten, bass):
    """Write the four voices to ``./output.mid`` and return that path.

    The voices are converted to frequencies with ``bin_matrix_to_freq``; each
    one becomes its own track via ``create_midi`` (program 52, channels 0-3
    in soprano, alto, tenor, bass order).  The tracks are then gathered into
    a single MIDI file that reuses the first voice's ``ticks_per_beat``.
    """
    savepath = './output.mid'

    freq_matrix = bin_matrix_to_freq(np.array([sop, alto, ten, bass]))

    voice_files = [
        create_midi(freq_matrix[voice], save_to_file=False, program=52, channel=voice)
        for voice in range(4)
    ]

    mid_mix = mido.MidiFile()
    mid_mix.ticks_per_beat = voice_files[0].ticks_per_beat

    merged_tracks = []
    for voice_file in voice_files:
        merged_tracks = merged_tracks + voice_file.tracks
    mid_mix.tracks = merged_tracks

    mid_mix.save(savepath)
    return savepath

############################################################

def song_to_dataframe(sop, alto, ten, bass):
    """Tabulate the per-frame frequency of each voice against time.

    For every column of each voice matrix the index of the maximum along
    axis 0 is mapped through ``vec_bin_to_freq``.  The time axis advances by
    0.011609977 per frame (this matches 256 / 22050) and has one entry per
    column of ``sop``.

    Returns:
        ``pandas.DataFrame`` of float32 values with the columns
        ``Timestep``, ``Soprano``, ``Alto``, ``Tenor`` and ``Bass``.
    """
    frame_step = 0.011609977
    n_frames = sop.shape[1]
    # arange with a float step can yield one element too many, hence the trim.
    timescale = np.arange(0, frame_step * n_frames, frame_step)[:n_frames]

    voice_freqs = [
        vec_bin_to_freq(np.argmax(voice, axis=0))
        for voice in (sop, alto, ten, bass)
    ]

    table = np.array([timescale] + voice_freqs, dtype=np.float32).T
    header = ['Timestep', 'Soprano', 'Alto', 'Tenor', 'Bass']
    return pd.DataFrame(table, columns=header)

############################################################

def prediction_postproc(input_array, argmax_and_threshold=True,
                                     gaussian_blur=True,
                                     threshold_value=0):
    """Post-process a batch of predictions into a single 360-row matrix.

    ``input_array`` has its first two axes swapped and is flattened to
    ``(360, -1)``, i.e. the batch items are laid side by side along the
    second axis.

    Args:
        input_array: array whose second axis has 360 entries.
        argmax_and_threshold: when true, every column is replaced by a
            one-hot column at its maximum row; columns whose maximum value
            is below ``threshold_value`` are one-hot at row 0 instead.
        gaussian_blur: when true, the matrix is smoothed along axis 0 with a
            Gaussian (sigma 1, wrapping around the ends) and rescaled so its
            minimum is 0 and its maximum is 1.
        threshold_value: minimum value a column maximum needs to be kept.

    Returns:
        The processed ``(360, n_columns)`` array.  If the blurred matrix is
        constant, zeros are returned rather than dividing by a zero range.
    """
    prediction = np.moveaxis(input_array, 0, 1).reshape(360, -1)

    if argmax_and_threshold:
        columns = np.arange(prediction.shape[1])
        best_bin = np.argmax(prediction, axis=0)
        # Vectorised gather of each column's peak value; peaks below the
        # threshold fall back to row 0.  Staying in integer arrays also keeps
        # the index usable when there are no columns at all.
        confident = prediction[best_bin, columns] >= threshold_value
        best_bin = np.where(confident, best_bin, 0)
        one_hot = np.zeros((360, best_bin.shape[0]))
        one_hot[best_bin, columns] = 1
        prediction = one_hot

    if gaussian_blur:
        prediction = np.array(gaussian_filter1d(prediction, 1, axis=0, mode='wrap'))
        if prediction.size:
            lowest = np.min(prediction)
            value_range = np.max(prediction) - lowest
            if value_range == 0:
                # Constant input: min-max scaling is undefined (0 / 0).
                prediction = np.zeros(prediction.shape)
            else:
                prediction = (prediction - lowest) / value_range
    return prediction

############################################################

def get_hcqt_params():
    """Return the fixed HCQT analysis settings used by this module.

    Returns:
        Tuple ``(bins_per_octave, n_octaves, harmonics, sr, fmin,
        hop_length, over_sample)``.  ``harmonics`` is a new list on every
        call.
    """
    harmonics = list(range(1, 6))
    return (
        60,         # bins_per_octave
        6,          # n_octaves
        harmonics,
        22050,      # sr
        32.7,       # fmin
        256,        # hop_length
        5,          # over_sample
    )

############################################################

def create_pump_object():
    """Create a ``pumpp.Pump`` holding one ``HCQTPhaseDiff`` feature extractor.

    The extractor is named ``'dphase'`` and is configured from
    ``get_hcqt_params()`` with ``log=True``.  The ``bins_per_octave`` entry
    of the parameters is not passed on.
    """
    (_, n_octaves, harmonics,
     sr, f_min, hop_length, over_sample) = get_hcqt_params()

    phase_diff_op = pumpp.feature.HCQTPhaseDiff(
        name='dphase',
        sr=sr,
        hop_length=hop_length,
        fmin=f_min,
        n_octaves=n_octaves,
        over_sample=over_sample,
        harmonics=harmonics,
        log=True,
    )
    return pumpp.Pump(phase_diff_op)

############################################################

def compute_pump_features(pump, audio_fpath):
    """Call ``pump`` with ``audio_f=audio_fpath`` and return its result."""
    return pump(audio_f=audio_fpath)

############################################################

def get_mpe_prediction(model, audio_file=None):
    """Run ``model`` over the HCQT features of an audio file.

    The ``'dphase/mag'`` and ``'dphase/dphase'`` features of ``audio_file``
    are computed with the pump from ``create_pump_object()``.  They are fed
    to ``model.predict`` in windows of at most 4000 steps along the last
    feature axis, and the per-window outputs are joined with ``np.hstack``.
    Part of this function is part of deepsalience.

    Args:
        model: object exposing ``predict``; it receives a two-element list
            (magnitude window, phase-difference window).
        audio_file: path of the audio file to analyse.

    Returns:
        The joined prediction as a float32 array.

    Raises:
        ValueError: if ``audio_file`` is ``None``.
    """
    if audio_file is None:
        raise ValueError("One audio_file must be specified")

    window_len = 4000

    pump = create_pump_object()
    features = compute_pump_features(pump, audio_file)

    def to_model_layout(feature):
        # First item only, axes reordered to (1, 2, 0), plus a leading axis.
        return feature[0].transpose(1, 2, 0)[np.newaxis, :, :, :]

    magnitude = to_model_layout(features['dphase/mag'])
    phase_diff = to_model_layout(features['dphase/dphase'])

    n_steps = magnitude.shape[3]
    pieces = []
    for start in range(0, n_steps, window_len):
        window = slice(start, start + window_len)
        # The model takes the windowed axis in third position, so the last
        # two axes are swapped for the call.
        result = model.predict([
            np.transpose(magnitude[:, :, :, window], (0, 1, 3, 2)),
            np.transpose(phase_diff[:, :, :, window], (0, 1, 3, 2)),
        ])
        pieces.append(result[0, :, :])

    return np.hstack(pieces).astype(np.float32)

############################################################

def get_va_prediction(model, f0_matrix):
    """Split a 360-row matrix into four voices using ``model``.

    ``f0_matrix`` is zero-padded along axis 1, cut into chunks of 256
    columns and passed to ``model.predict`` 24 chunks at a time.  ``predict``
    must return four arrays (soprano, alto, tenor, bass).  The outputs of
    each voice are joined over all batches, run through
    ``prediction_postproc`` with that voice's entry of ``SATB_THRESHOLDS``,
    and trimmed back to the original number of columns.

    Args:
        model: object exposing ``predict``.
        f0_matrix: array with 360 rows.

    Returns:
        Tuple ``(soprano, alto, tenor, bass)`` of post-processed arrays.
    """
    n_frames = f0_matrix.shape[1]

    # Pad up to the next chunk boundary.  An input whose length is already a
    # multiple of 256 still receives one full chunk of zeros; the surplus
    # columns are trimmed again after post-processing.
    pad = 256 - n_frames % 256
    padded = np.concatenate((f0_matrix, np.zeros((360, pad))), axis=1)
    chunks = np.reshape(padded, (360, -1, 256, 1)).transpose((1, 0, 2, 3))

    # Gather the batch outputs and join them once per voice at the end;
    # growing an array with np.append inside the loop would recopy everything
    # collected so far on every iteration.  The empty float64 seed keeps the
    # joined result in float64.
    collected = [[np.zeros((0, 360, 256))] for _ in range(4)]
    for start in range(0, chunks.shape[0], 24):
        s_pred, a_pred, t_pred, b_pred = model.predict(chunks[start:start + 24])
        for voice_parts, pred in zip(collected, (s_pred, a_pred, t_pred, b_pred)):
            voice_parts.append(pred)

    results = []
    for voice_parts, threshold in zip(collected, SATB_THRESHOLDS):
        joined = np.concatenate(voice_parts, axis=0)
        processed = prediction_postproc(joined, threshold_value=threshold)
        results.append(processed[:, :n_frames])

    s_pred_result, a_pred_result, t_pred_result, b_pred_result = results
    return s_pred_result, a_pred_result, t_pred_result, b_pred_result

############################################################

def cqfe(audiofile, mpe=late_deep_cnn_model(), va=mask_voas_cnn_v2_model()):
    """Extract the four voices of ``audiofile`` and export the results.

    ``mpe`` is passed to ``get_mpe_prediction`` together with the audio file
    and its output is split into voices by ``get_va_prediction`` using
    ``va``.  Both default models are created once, when this function is
    defined, and reused by every call that does not pass its own.

    Files written: ``./output.mid`` (through ``song_to_midi``),
    ``./output.csv`` and ``./output.hdf5`` (key ``'F0'``).

    Returns:
        ``([midi_path, csv_path, hdf5_path], figure)`` where ``figure`` is a
        scatter plot of the four voices over time.
    """
    savepath_csv = './output.csv'
    savepath_hdf5 = './output.hdf5'

    voices = get_va_prediction(va, get_mpe_prediction(mpe, audiofile))

    output_midi = song_to_midi(*voices)

    output_df = song_to_dataframe(*voices)
    output_df.to_csv(savepath_csv, mode='w', header=True)
    output_df.to_hdf(savepath_hdf5, key='F0', mode='w', complevel=9, complib='blosc', append=False, format='table')

    # Draw all voices onto the same axes; the first call creates them.
    palette = (
        ('Bass', '#2f29e3'),
        ('Tenor', '#e36129'),
        ('Alto', '#29e35a'),
        ('Soprano', '#d3d921'),
    )
    axes = None
    for voice_name, colour in palette:
        drawn = output_df.plot.scatter(x='Timestep', y=voice_name, s=1,
                                       color=colour, label=voice_name, ax=axes)
        if axes is None:
            axes = drawn

    axes.set_xlabel('Time (s)')
    axes.set_ylabel('Freq (Hz)')
    fig = axes.get_figure()
    fig.set_dpi(150)

    return [output_midi, savepath_csv, savepath_hdf5], fig

############################################################