Initial files for application
Browse files- .gitignore +1 -0
- Checkpoints/.gitattributes +35 -0
- Checkpoints/mask_voas.keras +0 -0
- Checkpoints/mask_voas_v2.keras +0 -0
- app_test.ipynb +37 -0
- cq2m_models.py +321 -0
- cq2m_utils.py +257 -0
- pyproject.toml +23 -0
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
__pycache__/*
|
Checkpoints/.gitattributes
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
Checkpoints/mask_voas.keras
ADDED
Binary file (856 kB). View file
|
|
Checkpoints/mask_voas_v2.keras
ADDED
Binary file (661 kB). View file
|
|
app_test.ipynb
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": null,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"import gradio as gr\n",
|
10 |
+
"import cq2m_utils\n",
|
11 |
+
"#midi = cq2m_utils.cq2m(audiofile)"
|
12 |
+
]
|
13 |
+
}
|
14 |
+
],
|
15 |
+
"metadata": {
|
16 |
+
"kernelspec": {
|
17 |
+
"display_name": "tf",
|
18 |
+
"language": "python",
|
19 |
+
"name": "python3"
|
20 |
+
},
|
21 |
+
"language_info": {
|
22 |
+
"codemirror_mode": {
|
23 |
+
"name": "ipython",
|
24 |
+
"version": 3
|
25 |
+
},
|
26 |
+
"file_extension": ".py",
|
27 |
+
"mimetype": "text/x-python",
|
28 |
+
"name": "python",
|
29 |
+
"nbconvert_exporter": "python",
|
30 |
+
"pygments_lexer": "ipython3",
|
31 |
+
"version": "3.11.4"
|
32 |
+
},
|
33 |
+
"orig_nbformat": 4
|
34 |
+
},
|
35 |
+
"nbformat": 4,
|
36 |
+
"nbformat_minor": 2
|
37 |
+
}
|
cq2m_models.py
ADDED
@@ -0,0 +1,321 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import tensorflow as tf
|
2 |
+
from tensorflow.keras import Model
|
3 |
+
from tensorflow.keras.optimizers import Adam
|
4 |
+
from tensorflow.keras.losses import BinaryCrossentropy, Reduction
|
5 |
+
from tensorflow.keras.layers import Input, Resizing, Conv2D, BatchNormalization, Multiply, Lambda, Concatenate
|
6 |
+
import tensorflow.keras.backend as K
|
7 |
+
|
8 |
+
EPOCHS = 10
|
9 |
+
TRAINING_DTYPE = tf.float16
|
10 |
+
SPLIT_SIZE = 256
|
11 |
+
BATCH_SIZE = 24
|
12 |
+
LEARNING_RATE = 5e-3
|
13 |
+
RESIZING_FILTER = 'bilinear'
|
14 |
+
|
15 |
+
############################################################
|
16 |
+
|
17 |
+
def mask_voas_cnn_model(l_rate = LEARNING_RATE):
    """Build, compile and load the pre-trained MaskVoasCNN voice-assignment model.

    The network takes a (360, SPLIT_SIZE, 1) pitch-salience map, learns a
    low-resolution mask, multiplies it back onto the input, then splits into
    four convolutional branches — one per SATB voice.

    NOTE: the architecture and layer names must stay exactly as written,
    because pre-trained weights are loaded from ./Checkpoints/mask_voas.keras
    at the end of this function.

    Args:
        l_rate: learning rate for the Adam optimizer (default LEARNING_RATE).

    Returns:
        A compiled tf.keras Model with one input and four outputs
        [soprano, alto, tenor, bass], each of shape (batch, 360, SPLIT_SIZE).
    """
    x_in = Input(shape=(360, SPLIT_SIZE, 1))

    # Downscale 4x (freq 360->90, time SPLIT_SIZE -> SPLIT_SIZE/2) so the
    # mask is learned at a coarser resolution.
    x = Resizing(90, int(SPLIT_SIZE/2), RESIZING_FILTER,
                 name="downscale")(x_in)

    x = BatchNormalization()(x)

    x = Conv2D(filters=32, kernel_size=(3, 3), padding="same",
               activation="relu", name="conv1")(x)

    x = BatchNormalization()(x)

    x = Conv2D(filters=32, kernel_size=(3, 3), padding="same",
               activation="relu", name="conv2")(x)

    x = BatchNormalization()(x)

    # Tall (70, 3) kernels span a large frequency range to pick up
    # harmonically related energy.
    x = Conv2D(filters=16, kernel_size=(70, 3), padding="same",
               activation="relu", name="conv_harm_1")(x)

    x = BatchNormalization()(x)

    x = Conv2D(filters=16, kernel_size=(70, 3), padding="same",
               activation="relu", name="conv_harm_2")(x)

    x = BatchNormalization()(x)

    ## "masking" original input with trained data

    # Upscale back to input resolution and apply as a multiplicative mask.
    x = Resizing(360, SPLIT_SIZE, RESIZING_FILTER,
                 name="upscale")(x)

    x = Multiply(name="multiply_mask")([x, x_in])

    ## start four branches now

    ## branch 1
    x1a = Conv2D(filters=16, kernel_size=(3, 3), padding="same",
                 activation="relu", name="conv1a")(x)

    x1a = BatchNormalization()(x1a)

    x1b = Conv2D(filters=16, kernel_size=(3, 3), padding="same",
                 activation="relu", name="conv1b")(x1a)

    ## branch 2
    x2a = Conv2D(filters=16, kernel_size=(3, 3), padding="same",
                 activation="relu", name="conv2a")(x)

    x2a = BatchNormalization()(x2a)

    x2b = Conv2D(filters=16, kernel_size=(3, 3), padding="same",
                 activation="relu", name="conv2b")(x2a)

    ## branch 3
    x3a = Conv2D(filters=16, kernel_size=(3, 3), padding="same",
                 activation="relu", name="conv3a")(x)

    x3a = BatchNormalization()(x3a)

    x3b = Conv2D(filters=16, kernel_size=(3, 3), padding="same",
                 activation="relu", name="conv3b")(x3a)

    ## branch 4
    x4a = Conv2D(filters=16, kernel_size=(3, 3), padding="same",
                 activation="relu", name="conv4a")(x)

    x4a = BatchNormalization()(x4a)

    x4b = Conv2D(filters=16, kernel_size=(3, 3), padding="same",
                 activation="relu", name="conv4b")(x4a)

    # Per-voice 1x1 sigmoid heads; squeeze drops the trailing channel axis
    # so each output is (batch, 360, SPLIT_SIZE).
    y1 = Conv2D(filters=1, kernel_size=1, name='conv_soprano',
                padding='same', activation='sigmoid')(x1b)
    y1 = tf.squeeze(y1, axis=-1, name='sop')

    y2 = Conv2D(filters=1, kernel_size=1, name='conv_alto',
                padding='same', activation='sigmoid')(x2b)
    y2 = tf.squeeze(y2, axis=-1, name='alt')

    y3 = Conv2D(filters=1, kernel_size=1, name='conv_tenor',
                padding='same', activation='sigmoid')(x3b)
    y3 = tf.squeeze(y3, axis=-1, name='ten')

    y4 = Conv2D(filters=1, kernel_size=1, name='conv_bass',
                padding='same', activation='sigmoid')(x4b)
    y4 = tf.squeeze(y4, axis=-1, name='bas')

    out = [y1, y2, y3, y4]

    model = Model(inputs=x_in, outputs=out, name='MaskVoasCNN')

    model.compile(optimizer=Adam(learning_rate=l_rate),
                  loss=BinaryCrossentropy(reduction=Reduction.SUM_OVER_BATCH_SIZE))

    # Pre-trained weights; requires the checkpoint to be present at this
    # relative path (run from the repository root).
    model.load_weights('./Checkpoints/mask_voas.keras')

    return model
118 |
+
|
119 |
+
############################################################
|
120 |
+
|
121 |
+
def mask_voas_cnn_v2_model(l_rate = LEARNING_RATE):
    """Build, compile and load the pre-trained MaskVoasCNN v2 model.

    Variant of mask_voas_cnn_model with three differences: shorter (48, 3)
    harmonic kernels, an extra 1x1 sigmoid layer applied before the mask
    multiplication, and an extra BatchNormalization after masking.

    NOTE: the architecture and layer names must stay exactly as written,
    because pre-trained weights are loaded from
    ./Checkpoints/mask_voas_v2.keras at the end of this function.

    Args:
        l_rate: learning rate for the Adam optimizer (default LEARNING_RATE).

    Returns:
        A compiled tf.keras Model with one input and four outputs
        [soprano, alto, tenor, bass], each of shape (batch, 360, SPLIT_SIZE).
    """
    x_in = Input(shape=(360, SPLIT_SIZE, 1))

    # Learn the mask at 1/4 resolution, as in v1.
    x = Resizing(90, int(SPLIT_SIZE/2), RESIZING_FILTER,
                 name="downscale")(x_in)

    x = BatchNormalization()(x)

    x = Conv2D(filters=32, kernel_size=(3, 3), padding="same",
               activation="relu", name="conv1")(x)

    x = BatchNormalization()(x)

    x = Conv2D(filters=32, kernel_size=(3, 3), padding="same",
               activation="relu", name="conv2")(x)

    x = BatchNormalization()(x)

    # v2 uses (48, 3) harmonic kernels instead of v1's (70, 3).
    x = Conv2D(filters=16, kernel_size=(48, 3), padding="same",
               activation="relu", name="conv_harm_1")(x)

    x = BatchNormalization()(x)

    x = Conv2D(filters=16, kernel_size=(48, 3), padding="same",
               activation="relu", name="conv_harm_2")(x)

    x = BatchNormalization()(x)

    # Sigmoid squashes mask values into [0, 1] before it is applied.
    x = Conv2D(filters=16, kernel_size=1, padding="same",
               activation="sigmoid", name="conv_sigmoid_before_mask")(x)

    ## "masking" original input with trained data

    x = Resizing(360, SPLIT_SIZE, RESIZING_FILTER,
                 name="upscale")(x)

    x = Multiply(name="multiply_mask")([x, x_in])

    x = BatchNormalization()(x)

    ## start four branches now

    ## branch 1
    x1a = Conv2D(filters=16, kernel_size=(3, 3), padding="same",
                 activation="relu", name="conv1a")(x)

    x1a = BatchNormalization()(x1a)

    x1b = Conv2D(filters=16, kernel_size=(3, 3), padding="same",
                 activation="relu", name="conv1b")(x1a)

    ## branch 2
    x2a = Conv2D(filters=16, kernel_size=(3, 3), padding="same",
                 activation="relu", name="conv2a")(x)

    x2a = BatchNormalization()(x2a)

    x2b = Conv2D(filters=16, kernel_size=(3, 3), padding="same",
                 activation="relu", name="conv2b")(x2a)

    ## branch 3
    x3a = Conv2D(filters=16, kernel_size=(3, 3), padding="same",
                 activation="relu", name="conv3a")(x)

    x3a = BatchNormalization()(x3a)

    x3b = Conv2D(filters=16, kernel_size=(3, 3), padding="same",
                 activation="relu", name="conv3b")(x3a)

    ## branch 4
    x4a = Conv2D(filters=16, kernel_size=(3, 3), padding="same",
                 activation="relu", name="conv4a")(x)

    x4a = BatchNormalization()(x4a)

    x4b = Conv2D(filters=16, kernel_size=(3, 3), padding="same",
                 activation="relu", name="conv4b")(x4a)

    # Per-voice 1x1 sigmoid heads; squeeze drops the trailing channel axis.
    y1 = Conv2D(filters=1, kernel_size=1, name='conv_soprano',
                padding='same', activation='sigmoid')(x1b)
    y1 = tf.squeeze(y1, axis=-1, name='sop')

    y2 = Conv2D(filters=1, kernel_size=1, name='conv_alto',
                padding='same', activation='sigmoid')(x2b)
    y2 = tf.squeeze(y2, axis=-1, name='alt')

    y3 = Conv2D(filters=1, kernel_size=1, name='conv_tenor',
                padding='same', activation='sigmoid')(x3b)
    y3 = tf.squeeze(y3, axis=-1, name='ten')

    y4 = Conv2D(filters=1, kernel_size=1, name='conv_bass',
                padding='same', activation='sigmoid')(x4b)
    y4 = tf.squeeze(y4, axis=-1, name='bas')

    out = [y1, y2, y3, y4]

    model = Model(inputs=x_in, outputs=out, name='MaskVoasCNNv2')

    model.compile(optimizer=Adam(learning_rate=l_rate),
                  loss=BinaryCrossentropy(reduction=Reduction.SUM_OVER_BATCH_SIZE))

    # Pre-trained weights; requires the checkpoint to be present at this
    # relative path (run from the repository root).
    model.load_weights('./Checkpoints/mask_voas_v2.keras')

    return model
227 |
+
|
228 |
+
############################################################
|
229 |
+
|
230 |
+
def __base_model(x_in, let):
    """Shared convolutional trunk used by late_deep_cnn_model's two branches.

    Four 5x5 conv blocks followed by two tall (70, 3) "harmonic" conv blocks,
    each with BatchNormalization. The first parameter was renamed from
    ``input`` (which shadowed the builtin) to ``x_in``; both call sites pass
    it positionally, so callers are unaffected.

    Args:
        x_in: Keras tensor, the branch input.
        let: string suffix (e.g. 'a'/'b') appended to layer names so the two
            branches get unique layer names.

    Returns:
        Tuple (features, x_in): the final feature map and the (unchanged)
        branch input tensor.
    """
    b1 = BatchNormalization()(x_in)

    # conv1
    y1 = Conv2D(16, (5, 5), padding='same', activation='relu', name='conv1{}'.format(let))(b1)
    y1a = BatchNormalization()(y1)

    # conv2
    y2 = Conv2D(32, (5, 5), padding='same', activation='relu', name='conv2{}'.format(let))(y1a)
    y2a = BatchNormalization()(y2)

    # conv3
    y3 = Conv2D(32, (5, 5), padding='same', activation='relu', name='conv3{}'.format(let))(y2a)
    y3a = BatchNormalization()(y3)

    # conv4 layer
    y4 = Conv2D(32, (5, 5), padding='same', activation='relu', name='conv4{}'.format(let))(y3a)
    y4a = BatchNormalization()(y4)

    # conv5 layer, harm1: tall kernel spanning 70 frequency bins
    y5 = Conv2D(32, (70, 3), padding='same', activation='relu', name='harm1{}'.format(let))(y4a)
    y5a = BatchNormalization()(y5)

    # conv6 layer, harm2
    y6 = Conv2D(32, (70, 3), padding='same', activation='relu', name='harm2{}'.format(let))(y5a)
    y6a = BatchNormalization()(y6)

    return y6a, x_in
259 |
+
|
260 |
+
|
261 |
+
def late_deep_cnn_model():
    '''Late/Deep

    Build and compile the Late/Deep multi-pitch-estimation CNN.

    Two identical convolutional trunks (see __base_model) process the HCQT
    magnitudes and the HCQT phase differentials separately; their features
    are concatenated ("late" fusion) and reduced to a single pitch-salience
    map by further conv layers.

    Returns:
        A compiled tf.keras Model taking [hcqt, dphase] inputs (each
        (batch, freq, time, 5)) and producing a (batch, freq, time)
        salience map.

    NOTE(review): weights are loaded from './Checkpoints/exp3multif0.pkl',
    which is NOT among the committed checkpoint files (only
    mask_voas*.keras are) — confirm this file is provided separately
    before calling.
    '''

    input_shape_1 = (None, None, 5) # HCQT input shape
    input_shape_2 = (None, None, 5) # phase differentials input shape

    inputs1 = Input(shape=input_shape_1)
    inputs2 = Input(shape=input_shape_2)

    # Two weight-independent trunks; 'a'/'b' suffixes keep layer names unique.
    y6a, _ = __base_model(inputs1, 'a')
    y6b, _ = __base_model(inputs2, 'b')

    # concatenate features
    y6c = Concatenate()([y6a, y6b])

    # conv7 layer
    y7 = Conv2D(64, (3, 3), padding='same', activation='relu', name='conv7')(y6c)
    y7a = BatchNormalization()(y7)

    # conv8 layer
    y8 = Conv2D(64, (3, 3), padding='same', activation='relu', name='conv8')(y7a)
    y8a = BatchNormalization()(y8)

    # Full-height (360, 1) kernel aggregates across the whole frequency axis.
    y9 = Conv2D(8, (360, 1), padding='same', activation='relu', name='distribution')(y8a)
    y9a = BatchNormalization()(y9)

    # 1x1 sigmoid head; Lambda squeeze drops the channel axis.
    y10 = Conv2D(1, (1, 1), padding='same', activation='sigmoid', name='squishy')(y9a)
    predictions = Lambda(lambda x: K.squeeze(x, axis=3))(y10)

    model = Model(inputs=[inputs1, inputs2], outputs=predictions)

    model.compile(
        loss=__bkld, metrics=['mse', __soft_binary_accuracy],
        optimizer='adam'
    )

    model.load_weights('./Checkpoints/exp3multif0.pkl')

    return model
301 |
+
|
302 |
+
############################################################
|
303 |
+
|
304 |
+
def __bkld(y_true, y_pred):
    """Brian's KL Divergence implementation.

    Elementwise binary cross-entropy between probability maps, averaged
    twice over the trailing axis. Both tensors are clipped away from 0/1
    to keep the logs finite.
    """
    eps = K.epsilon()
    t = K.clip(y_true, eps, 1.0 - eps)
    p = K.clip(y_pred, eps, 1.0 - eps)
    pointwise = -1.0 * t * K.log(p) - (1.0 - t) * K.log(1.0 - p)
    return K.mean(K.mean(pointwise, axis=-1), axis=-1)
312 |
+
|
313 |
+
############################################################
|
314 |
+
|
315 |
+
def __soft_binary_accuracy(y_true, y_pred):
    """Binary accuracy that works when inputs are probabilities.

    Rounds both tensors to {0, 1}, compares elementwise, and averages
    twice over the trailing axis.
    """
    matches = K.equal(K.round(y_true), K.round(y_pred))
    return K.mean(K.mean(matches, axis=-1), axis=-1)
320 |
+
|
321 |
+
############################################################
|
cq2m_utils.py
ADDED
@@ -0,0 +1,257 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import math
|
3 |
+
import mido
|
4 |
+
import pumpp
|
5 |
+
import numpy as np
|
6 |
+
from scipy.ndimage import gaussian_filter1d
|
7 |
+
from cq2m_models import mask_voas_cnn_model, late_deep_cnn_model
|
8 |
+
|
9 |
+
############################################################
|
10 |
+
|
11 |
+
def downsample_bins(voice):
    """Collapse a 360-bin salience map (5 bins per semitone) to 69 semitone
    columns and one-hot the strongest column per frame.

    The five within-semitone phases are summed; phases 0-2 keep semitone
    rows 1..69 while phases 3-4 keep rows 0..68 (matching the original
    alignment). Column 0 is always forced to zero, so frames whose argmax
    lands there come out silent.

    Args:
        voice: array of shape (frames, 360).

    Returns:
        One-hot array of shape (frames, 69).
    """
    phases = [np.array(voice.T[k::5]).T for k in range(5)]
    aligned = [p.T[1:70].T for p in phases[:3]] + [p.T[0:69].T for p in phases[3:]]

    summed = aligned[0] + aligned[1] + aligned[2] + aligned[3] + aligned[4]
    best = np.argmax(summed, axis=1)

    onehot = np.zeros(summed.shape)
    onehot[np.arange(best.size), best] = 1
    onehot[:, 0] = 0

    return onehot
32 |
+
|
33 |
+
############################################################
|
34 |
+
|
35 |
+
def create_midi(pr, write_path='./midi_track.mid', ticks_per_beat=58,
                tempo=90, save_to_file=True, program=53, channel=0):
    """Convert a piano-roll matrix into a single-track MIDI file.

    Args:
        pr: piano roll of shape (T, N), values in [0, 1]. When N <= 72 each
            column is one semitone (pitch = 25 + n); otherwise the roll is
            assumed to have 5 bins per semitone (pitch = 24 + round(n/5)).
        write_path: output path used when save_to_file is True.
        ticks_per_beat: MIDI time resolution.
        tempo: beats per minute.
        save_to_file: when True, the file is written to write_path.
        program: General MIDI program (Choir Aahs = 53, Voice Oohs = 54,
            Synth Choir = 55).
        channel: MIDI channel for all events of this track.

    Returns:
        The mido.MidiFile object (saved to disk only if save_to_file).

    Note: the original body computed an unused ``range_step`` local on every
    frame; it has been removed (no behavioral change).
    """

    def pr_to_list(pr):
        # List event = (pitch, velocity, time); time is in ticks elapsed
        # since the previous event (MIDI delta time).
        T, N = pr.shape
        t_last = 0
        pr_tm1 = np.zeros(N)
        list_event = []
        for t in range(T):
            pr_t = pr[t]
            mask = (pr_t != pr_tm1)  # columns whose state changed this frame
            if (mask).any():
                for n in range(0, N):
                    if mask[n]:
                        if(N <= 72):
                            pitch = 25 + n
                        else:
                            pitch = 24 + round(n/5)
                        # Binarize velocity: strong enough -> full volume,
                        # otherwise treat as a note-off.
                        if int(pr_t[n] * 127) >= 50:
                            velocity = 127
                        else:
                            velocity = 0
                        # Time is incremented since last event
                        t_event = t - t_last
                        t_last = t
                        list_event.append((pitch, velocity, t_event))
            pr_tm1 = pr_t
        # Trailing sentinel keeps the final delta time in the track.
        list_event.append((0, 0, T - t_last))
        return list_event

    # Tempo
    microseconds_per_beat = mido.bpm2tempo(tempo)
    # Write a pianoroll in a midi file
    mid = mido.MidiFile()
    mid.ticks_per_beat = ticks_per_beat

    # Add a new track with the instrument name to the midi file
    track = mid.add_track("Voice Aah")
    # transform the matrix in a list of (pitch, velocity, time)
    events = pr_to_list(pr)
    # Tempo
    track.append(mido.MetaMessage('set_tempo', tempo=microseconds_per_beat))
    track.append(mido.MetaMessage('channel_prefix', channel=channel))
    # Add the program_change
    #Choir Aahs = 53, Voice Oohs (or Doos) = 54, Synch Choir = 55
    track.append(mido.Message('program_change', program=program, channel=channel))

    # This list is required to shut down
    # notes that are on, intensity modified, then off only 1 time
    # Example :
    # (60,20,0)
    # (60,40,10)
    # (60,0,15)
    notes_on_list = []
    # Write events in the midi file
    for event in events:
        pitch, velocity, time = event
        if velocity == 0:
            # Note-off: emit it and clear the bookkeeping entry if present.
            track.append(mido.Message('note_off', note=pitch, velocity=0, time=time, channel=channel))
            if(pitch in notes_on_list):
                notes_on_list.remove(pitch)
        else:
            # Re-triggered note: close the previous instance first so the
            # new note_on lands at the same tick (time reset to 0).
            if pitch in notes_on_list:
                track.append(mido.Message('note_off', note=pitch, velocity=0, time=time, channel=channel))
                notes_on_list.remove(pitch)
                time = 0
            track.append(mido.Message('note_on', note=pitch, velocity=velocity, time=time, channel=channel))
            notes_on_list.append(pitch)
    if save_to_file:
        mid.save(write_path)
    return mid
111 |
+
|
112 |
+
############################################################
|
113 |
+
|
114 |
+
def song_to_midi(sop, alto, ten, bass):
    """Render four SATB salience matrices (each (360, frames)) into one
    multi-track MIDI file saved at ./result.mid.

    Each voice is downsampled to semitone resolution, converted to its own
    track (distinct General MIDI program and channel), and the four tracks
    are merged into a single mido.MidiFile, which is returned.
    """
    voices = (sop, alto, ten, bass)
    programs = (52, 53, 49, 50)

    per_voice_mids = []
    for channel, (voice, program) in enumerate(zip(voices, programs)):
        semitone_roll = downsample_bins(voice.T)
        per_voice_mids.append(
            create_midi(semitone_roll, save_to_file=False,
                        program=program, channel=channel))

    mid_mix = mido.MidiFile()
    mid_mix.ticks_per_beat = per_voice_mids[0].ticks_per_beat
    mid_mix.tracks = (per_voice_mids[0].tracks + per_voice_mids[1].tracks
                      + per_voice_mids[2].tracks + per_voice_mids[3].tracks)
    mid_mix.save('./result.mid')

    return mid_mix
132 |
+
|
133 |
+
############################################################
|
134 |
+
|
135 |
+
def prediction_postproc(input_array, argmax_and_threshold=True, gaussian_blur=True):
    """Post-process stacked voice-assignment predictions into a (360, T) map.

    Args:
        input_array: array of shape (windows, 360, frames); windows are laid
            out back-to-back along the time axis.
        argmax_and_threshold: when True, keep only the strongest bin per
            frame as a one-hot column (bins above 357 are treated as
            "no voice" and mapped to bin 0).
        gaussian_blur: when True, blur along the frequency axis (sigma=1,
            wrap mode) and min-max normalize to [0, 1].

    Returns:
        Array of shape (360, windows * frames).

    Fix vs. original: min-max normalization divided by zero (producing NaNs)
    when the blurred map was constant, e.g. for an all-zero input with
    argmax_and_threshold=False; a guard now leaves such input unchanged
    (shifted by its min). Normal inputs behave identically.
    """
    prediction = np.moveaxis(input_array, 0, 1).reshape(360, -1)
    if argmax_and_threshold:
        prediction = np.argmax(prediction, axis=0)
        # Bins 358-359 are out of the usable range; treat as "no voice".
        prediction = np.array([i if i <= 357 else 0 for i in prediction])
        threshold = np.zeros((360, prediction.shape[0]))
        threshold[prediction, np.arange(prediction.size)] = 1
        prediction = threshold
    if gaussian_blur:
        prediction = np.array(gaussian_filter1d(prediction, 1, axis=0, mode='wrap'))
        lo = np.min(prediction)
        hi = np.max(prediction)
        if hi > lo:
            prediction = (prediction - lo) / (hi - lo)
        else:
            # Constant map: nothing to normalize; avoid 0/0 -> NaN.
            prediction = prediction - lo
    return prediction
147 |
+
|
148 |
+
############################################################
|
149 |
+
|
150 |
+
def get_hcqt_params():
    """Return the fixed HCQT feature-extraction configuration.

    Returns:
        Tuple (bins_per_octave, n_octaves, harmonics, sr, fmin, hop_length,
        over_sample): 60 bins/octave over 6 octaves, harmonics 1-5,
        22050 Hz sample rate, fmin of C1 (32.7 Hz), hop of 256 samples,
        5x oversampling per semitone.
    """
    return 60, 6, [1, 2, 3, 4, 5], 22050, 32.7, 256, 5
161 |
+
|
162 |
+
############################################################
|
163 |
+
|
164 |
+
def create_pump_object():
    """Build a pumpp Pump that extracts HCQT magnitudes plus phase
    differentials ('dphase' feature), configured via get_hcqt_params().
    """
    (bins_per_octave, n_octaves, harmonics,
     sr, f_min, hop_length, over_sample) = get_hcqt_params()

    phase_diff_extractor = pumpp.feature.HCQTPhaseDiff(
        name='dphase',
        sr=sr,
        hop_length=hop_length,
        fmin=f_min,
        n_octaves=n_octaves,
        over_sample=over_sample,
        harmonics=harmonics,
        log=True,
    )

    return pumpp.Pump(phase_diff_extractor)
175 |
+
|
176 |
+
############################################################
|
177 |
+
|
178 |
+
def compute_pump_features(pump, audio_fpath):
    """Run the pump's feature extraction on a single audio file.

    Args:
        pump: a callable pump object (see create_pump_object).
        audio_fpath: path to the audio file, forwarded as ``audio_f``.

    Returns:
        The pump's feature dictionary.
    """
    return pump(audio_f=audio_fpath)
183 |
+
|
184 |
+
############################################################
|
185 |
+
|
186 |
+
def get_mpe_prediction(model, audio_file=None):
    """Generate output from a model given an input numpy file.
    Part of this function is part of deepsalience

    Extracts HCQT magnitude and phase-differential features from the audio
    file, feeds them to the model in chunks of up to 2500 frames, and
    stacks the per-chunk salience maps back together along time.

    Raises:
        ValueError: if audio_file is None.
    """

    split_value = 2500

    if audio_file is None:
        raise ValueError("One audio_file must be specified")

    pump = create_pump_object()
    features = compute_pump_features(pump, audio_file)
    input_hcqt = features['dphase/mag'][0]
    input_dphase = features['dphase/dphase'][0]

    # (time, freq, harm) -> (1, freq, harm, time)
    input_hcqt = input_hcqt.transpose(1, 2, 0)[np.newaxis, :, :, :]
    input_dphase = input_dphase.transpose(1, 2, 0)[np.newaxis, :, :, :]

    n_t = input_hcqt.shape[3]
    chunk_outputs = []

    # Predict in time chunks to bound memory use; each chunk is reordered
    # to (batch, freq, time, harm) before being fed to the model.
    for start in range(0, n_t, split_value):
        stop = start + split_value
        hcqt_chunk = np.transpose(input_hcqt[:, :, :, start:stop], (0, 1, 3, 2))
        dphase_chunk = np.transpose(input_dphase[:, :, :, start:stop], (0, 1, 3, 2))
        chunk_outputs.append(model.predict([hcqt_chunk, dphase_chunk])[0, :, :])

    return np.hstack(chunk_outputs).astype(np.float32)
219 |
+
|
220 |
+
############################################################
|
221 |
+
|
222 |
+
def get_va_prediction(model, f0_matrix):
    """Run the voice-assignment model over a (360, T) multi-pitch matrix.

    The time axis is zero-padded and split into 256-frame windows, the
    model is invoked in batches of 24 windows, and each voice's windows
    are re-assembled and post-processed back to (360, T).

    Returns:
        Tuple (soprano, alto, tenor, bass) of (360, T) arrays.
    """
    n_frames = f0_matrix.shape[1]
    full_windows = n_frames // 256
    pad_width = 256 - (n_frames - full_windows * 256)

    padded = np.concatenate((np.copy(f0_matrix), np.zeros((360, pad_width))), axis=1)
    windows = np.reshape(padded, (360, -1, 256, 1)).transpose((1, 0, 2, 3))
    n_batches = math.ceil(windows.shape[0] / 24)

    # One accumulator per voice: soprano, alto, tenor, bass.
    voice_preds = [np.zeros((0, 360, 256)) for _ in range(4)]

    for b in range(n_batches):
        batch = windows[b * 24:(b + 1) * 24]
        for voice_idx, pred in enumerate(model.predict(batch)):
            voice_preds[voice_idx] = np.append(voice_preds[voice_idx], pred, axis=0)

    # Post-process each voice and trim the padding off the time axis.
    s_out, a_out, t_out, b_out = (
        prediction_postproc(p)[:, :n_frames] for p in voice_preds)

    return s_out, a_out, t_out, b_out
248 |
+
|
249 |
+
############################################################
|
250 |
+
|
251 |
+
def cq2m(audiofile, mpe=None, va=None):
    """Transcribe a choral-quartet audio file into a four-track MIDI file.

    Pipeline: multi-pitch estimation -> voice assignment -> MIDI rendering
    (song_to_midi also writes ./result.mid as a side effect).

    Args:
        audiofile: path to the input audio file.
        mpe: optional pre-built multi-pitch-estimation model; when None,
            late_deep_cnn_model() is built on first use.
        va: optional pre-built voice-assignment model; when None,
            mask_voas_cnn_model() is built on first use.

    Returns:
        The mido.MidiFile produced by song_to_midi().

    Fix vs. original: the models were built as *default argument values*
    (``mpe=late_deep_cnn_model()``), so both networks were constructed and
    their checkpoints loaded at import time — even if cq2m was never
    called, and failing the whole import if a checkpoint was missing.
    Defaults are now None and the models are created lazily.
    """
    if mpe is None:
        mpe = late_deep_cnn_model()
    if va is None:
        va = mask_voas_cnn_model()
    mpe_pred = get_mpe_prediction(mpe, audiofile)
    s_pred, a_pred, t_pred, b_pred = get_va_prediction(va, mpe_pred)
    midi = song_to_midi(s_pred, a_pred, t_pred, b_pred)
    return midi
256 |
+
|
257 |
+
############################################################
|
pyproject.toml
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[tool.poetry]
|
2 |
+
name = "choral-quartets-to-midi"
|
3 |
+
version = "0.1.0"
|
4 |
+
description = "An application that uses Multi-Pitch Estimation and Voice Assignment to transform .WAV files with Choral Quartets recordings into MIDI files, with a separate track for each voice. Based on Late/DeepCNN by Helena Cuesta and MaskVoasCNN by André Paiva."
|
5 |
+
authors = ["André Paiva (Xornotor) <[email protected]>"]
|
6 |
+
license = "cc"
|
7 |
+
readme = "README.md"
|
8 |
+
packages = [{include = "choral_quartets_to_midi"}]
|
9 |
+
|
10 |
+
[tool.poetry.dependencies]
|
11 |
+
python = "^3.11"
|
12 |
+
tensorflow = "2.13.0"
|
13 |
+
gradio = "3.37.0"
|
14 |
+
typing-extensions = "4.5.0"
|
15 |
+
mido = "1.2.10"
|
16 |
+
pumpp = "0.6.0"
|
17 |
+
numpy = "1.24.3"
|
18 |
+
scipy = "1.11.1"
|
19 |
+
|
20 |
+
|
21 |
+
[build-system]
|
22 |
+
requires = ["poetry-core"]
|
23 |
+
build-backend = "poetry.core.masonry.api"
|