kevinwang676 commited on
Commit
169904c
·
verified ·
1 Parent(s): e045432

Upload folder using huggingface_hub

Browse files
uvr5/__pycache__/vr.cpython-310.pyc ADDED
Binary file (4.2 kB). View file
 
uvr5/lib/__pycache__/utils.cpython-310.pyc ADDED
Binary file (2.98 kB). View file
 
uvr5/lib/lib_v5/__pycache__/layers_123821KB.cpython-310.pyc ADDED
Binary file (4.05 kB). View file
 
uvr5/lib/lib_v5/__pycache__/model_param_init.cpython-310.pyc ADDED
Binary file (1.63 kB). View file
 
uvr5/lib/lib_v5/__pycache__/nets_61968KB.cpython-310.pyc ADDED
Binary file (3.43 kB). View file
 
uvr5/lib/lib_v5/__pycache__/spec_utils.cpython-310.pyc ADDED
Binary file (13.6 kB). View file
 
uvr5/lib/lib_v5/layers_123821KB.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ from torch import nn
4
+
5
+ from . import spec_utils
6
+
7
+
8
class Conv2DBNActiv(nn.Module):
    """Conv2d -> BatchNorm2d -> activation, applied as one sequential unit.

    Args:
        nin: Number of input channels.
        nout: Number of output channels.
        ksize: Convolution kernel size.
        stride: Convolution stride.
        pad: Convolution padding.
        dilation: Convolution dilation.
        activ: Activation module class; it is instantiated with no arguments.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(Conv2DBNActiv, self).__init__()
        # The convolution has no bias term; BatchNorm2d follows it directly.
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
                nout,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                bias=False,
            ),
            nn.BatchNorm2d(nout),
            activ(),
        )

    def forward(self, x):
        """Return ``x`` passed through the conv / batch-norm / activation stack."""
        # Implemented as ``forward`` rather than by overriding ``__call__`` so
        # that nn.Module.__call__ keeps running registered hooks.
        return self.conv(x)
27
+
28
+
29
class SeperableConv2DBNActiv(nn.Module):
    """Depthwise conv -> 1x1 pointwise conv -> BatchNorm2d -> activation.

    Args:
        nin: Number of input channels (also the group count of the first conv).
        nout: Number of output channels produced by the 1x1 convolution.
        ksize: Kernel size of the depthwise convolution.
        stride: Stride of the depthwise convolution.
        pad: Padding of the depthwise convolution.
        dilation: Dilation of the depthwise convolution.
        activ: Activation module class; it is instantiated with no arguments.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(SeperableConv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            # groups=nin: every input channel is convolved on its own.
            nn.Conv2d(
                nin,
                nin,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                groups=nin,
                bias=False,
            ),
            # 1x1 convolution mixes the channels and maps nin -> nout.
            nn.Conv2d(nin, nout, kernel_size=1, bias=False),
            nn.BatchNorm2d(nout),
            activ(),
        )

    def forward(self, x):
        """Return ``x`` passed through the separable-conv stack."""
        # Implemented as ``forward`` rather than by overriding ``__call__`` so
        # that nn.Module.__call__ keeps running registered hooks.
        return self.conv(x)
50
+
51
+
52
class Encoder(nn.Module):
    """Two stacked Conv2DBNActiv layers; the first output doubles as a skip tensor.

    Args:
        nin: Number of input channels.
        nout: Number of output channels of both convolutions.
        ksize: Kernel size of both convolutions.
        stride: Stride of the second convolution (the first always uses 1).
        pad: Padding of both convolutions.
        activ: Activation module class handed to both layers.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
        super(Encoder, self).__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)

    def forward(self, x):
        """Return ``(h, skip)``: the output of conv2 and the output of conv1."""
        # Implemented as ``forward`` rather than by overriding ``__call__`` so
        # that nn.Module.__call__ keeps running registered hooks.
        skip = self.conv1(x)
        h = self.conv2(skip)

        return h, skip
63
+
64
+
65
class Decoder(nn.Module):
    """Upsample by 2, optionally concatenate a skip tensor, then convolve.

    Args:
        nin: Number of channels entering the convolution (after concatenation).
        nout: Number of output channels.
        ksize: Convolution kernel size.
        stride: Accepted for signature symmetry; the convolution always uses
            stride 1, so this value is not used.
        pad: Convolution padding.
        activ: Activation module class handed to the convolution layer.
        dropout: When truthy, apply ``Dropout2d(0.1)`` to the output.
    """

    def __init__(
        self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
    ):
        super(Decoder, self).__init__()
        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    def forward(self, x, skip=None):
        """Return the decoded tensor for ``x`` (and ``skip`` when given)."""
        # Implemented as ``forward`` rather than by overriding ``__call__`` so
        # that nn.Module.__call__ keeps running registered hooks.
        x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
        if skip is not None:
            # Crop the skip tensor along its last axis to match x, then stack
            # the two along the channel axis.
            skip = spec_utils.crop_center(skip, x)
            x = torch.cat([x, skip], dim=1)
        h = self.conv(x)

        if self.dropout is not None:
            h = self.dropout(h)

        return h
84
+
85
+
86
class ASPPModule(nn.Module):
    """Five parallel branches over the same input, concatenated and reduced.

    Branches: a pooled 1x1 conv (resized back to the input size), a plain 1x1
    conv, and three dilated separable convs. The concatenation (``nin * 5``
    channels) goes through a 1x1 conv to ``nout`` channels and Dropout2d(0.1).

    Args:
        nin: Number of input channels (each branch also outputs ``nin``).
        nout: Number of output channels of the bottleneck.
        dilations: Three dilation rates, one per separable-conv branch; each is
            also used as that branch's padding.
        activ: Activation module class handed to every conv layer.
    """

    def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
        super().__init__()
        rate_a = dilations[0]
        rate_b = dilations[1]
        rate_c = dilations[2]

        # Pool the third axis down to 1 while keeping the last axis unchanged.
        self.conv1 = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, None)),
            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
        )
        self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
        self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, rate_a, rate_a, activ=activ)
        self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, rate_b, rate_b, activ=activ)
        self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, rate_c, rate_c, activ=activ)
        self.bottleneck = nn.Sequential(
            Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ),
            nn.Dropout2d(0.1),
        )

    def forward(self, x):
        """Return the bottleneck output for the concatenated branch features."""
        height, width = x.size()[2], x.size()[3]
        pooled = F.interpolate(
            self.conv1(x), size=(height, width), mode="bilinear", align_corners=True
        )
        branches = (
            pooled,
            self.conv2(x),
            self.conv3(x),
            self.conv4(x),
            self.conv5(x),
        )
        return self.bottleneck(torch.cat(branches, dim=1))
uvr5/lib/lib_v5/model_param_init.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import pathlib
4
+
5
# Built-in two-band parameter set, used when no config file is supplied.
# Band keys are integers.
default_param = {
    "bins": 768,
    "unstable_bins": 9,  # training only
    "reduction_bins": 762,  # training only
    "sr": 44100,
    "pre_filter_start": 757,
    "pre_filter_stop": 768,
    "band": {
        1: {
            "sr": 11025,
            "hl": 128,
            "n_fft": 960,
            "crop_start": 0,
            "crop_stop": 245,
            "lpf_start": 61,  # inference only
            "res_type": "polyphase",
        },
        2: {
            "sr": 44100,
            "hl": 512,
            "n_fft": 1536,
            "crop_start": 24,
            "crop_stop": 547,
            "hpf_start": 81,  # inference only
            "res_type": "sinc_best",
        },
    },
}
34
+
35
+
36
def int_keys(d):
    """Build a dict from ``(key, value)`` pairs, turning all-digit keys into ints.

    Written to be used as a ``json`` ``object_pairs_hook``: ``d`` is an
    iterable of pairs whose keys are strings.
    """
    return {(int(key) if key.isdigit() else key): value for key, value in d}
43
+
44
+
45
class ModelParameters(object):
    """Load a model parameter dict and fill in the optional boolean flags.

    The source is chosen from the suffix of ``config_path``:

    * ``.pth``  - read ``param.json`` from inside the zip archive;
    * ``.json`` - read the file directly;
    * anything else (including the default ``""``) - use a private copy of
      ``default_param``.

    JSON objects are decoded with ``int_keys`` so all-digit keys become ints.
    The result is stored in ``self.param``.
    """

    def __init__(self, config_path=""):
        import copy

        suffix = pathlib.Path(config_path).suffix
        if suffix == ".pth":
            import zipfile

            with zipfile.ZipFile(config_path, "r") as archive:
                self.param = json.loads(
                    archive.read("param.json"), object_pairs_hook=int_keys
                )
        elif suffix == ".json":
            with open(config_path, "r") as f:
                self.param = json.loads(f.read(), object_pairs_hook=int_keys)
        else:
            # Copy so the flag defaults added below (and any later edits by
            # callers) never leak into the shared module-level default_param.
            self.param = copy.deepcopy(default_param)

        # Flags that a config may omit; absent ones are treated as False.
        for k in [
            "mid_side",
            "mid_side_b",
            "mid_side_b2",
            "stereo_w",
            "stereo_n",
            "reverse",
        ]:
            if k not in self.param:
                self.param[k] = False
uvr5/lib/lib_v5/modelparams/4band_v2.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 672,
3
+ "unstable_bins": 8,
4
+ "reduction_bins": 637,
5
+ "band": {
6
+ "1": {
7
+ "sr": 7350,
8
+ "hl": 80,
9
+ "n_fft": 640,
10
+ "crop_start": 0,
11
+ "crop_stop": 85,
12
+ "lpf_start": 25,
13
+ "lpf_stop": 53,
14
+ "res_type": "polyphase"
15
+ },
16
+ "2": {
17
+ "sr": 7350,
18
+ "hl": 80,
19
+ "n_fft": 320,
20
+ "crop_start": 4,
21
+ "crop_stop": 87,
22
+ "hpf_start": 25,
23
+ "hpf_stop": 12,
24
+ "lpf_start": 31,
25
+ "lpf_stop": 62,
26
+ "res_type": "polyphase"
27
+ },
28
+ "3": {
29
+ "sr": 14700,
30
+ "hl": 160,
31
+ "n_fft": 512,
32
+ "crop_start": 17,
33
+ "crop_stop": 216,
34
+ "hpf_start": 48,
35
+ "hpf_stop": 24,
36
+ "lpf_start": 139,
37
+ "lpf_stop": 210,
38
+ "res_type": "polyphase"
39
+ },
40
+ "4": {
41
+ "sr": 44100,
42
+ "hl": 480,
43
+ "n_fft": 960,
44
+ "crop_start": 78,
45
+ "crop_stop": 383,
46
+ "hpf_start": 130,
47
+ "hpf_stop": 86,
48
+ "res_type": "kaiser_fast"
49
+ }
50
+ },
51
+ "sr": 44100,
52
+ "pre_filter_start": 668,
53
+ "pre_filter_stop": 672
54
+ }
uvr5/lib/lib_v5/nets_61968KB.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ from torch import nn
4
+
5
+ from . import layers_123821KB as layers
6
+
7
+
8
class BaseASPPNet(nn.Module):
    """Four encoders, an ASPP module, and four decoders with skip connections.

    Args:
        nin: Number of input channels.
        ch: Base channel width; encoder widths are ch, 2*ch, 4*ch, 8*ch and the
            ASPP module outputs 16*ch channels.
        dilations: Dilation rates forwarded to ``layers.ASPPModule``.
    """

    def __init__(self, nin, ch, dilations=(4, 8, 16)):
        super(BaseASPPNet, self).__init__()
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)

        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)

        # Decoder input widths are the sum of the skip tensor's channels and
        # the channels arriving from the previous stage.
        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)

    def forward(self, x):
        """Return the decoder output (``ch`` channels) for input ``x``."""
        # Implemented as ``forward`` rather than by overriding ``__call__`` so
        # that nn.Module.__call__ keeps running registered hooks.
        h, e1 = self.enc1(x)
        h, e2 = self.enc2(h)
        h, e3 = self.enc3(h)
        h, e4 = self.enc4(h)

        h = self.aspp(h)

        h = self.dec4(h, e4)
        h = self.dec3(h, e3)
        h = self.dec2(h, e2)
        h = self.dec1(h, e1)

        return h
37
+
38
+
39
class CascadedASPPNet(nn.Module):
    """Three cascaded BaseASPPNet stages that predict a mask for the input.

    Stage 1 runs separate nets on the lower and upper halves of axis 2; stages
    2 and 3 run on the full range, each fed the input concatenated with the
    earlier stage outputs. The final mask is multiplied with the input.

    Args:
        n_fft: FFT size; the net processes the first ``n_fft // 2`` entries of
            axis 2 and pads its masks to ``n_fft // 2 + 1`` entries.
    """

    def __init__(self, n_fft):
        super().__init__()
        self.stg1_low_band_net = BaseASPPNet(2, 32)
        self.stg1_high_band_net = BaseASPPNet(2, 32)

        self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
        self.stg2_full_band_net = BaseASPPNet(16, 32)

        self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
        self.stg3_full_band_net = BaseASPPNet(32, 64)

        self.out = nn.Conv2d(64, 2, 1, bias=False)
        self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
        self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)

        half = n_fft // 2
        self.max_bin = half
        self.output_bin = half + 1

        # Number of entries predict() trims from each end of the last axis.
        self.offset = 128

    def _pad_to_output_bin(self, t):
        """Replicate-pad axis 2 of ``t`` at its end up to ``self.output_bin``."""
        missing = self.output_bin - t.size()[2]
        return F.pad(input=t, pad=(0, 0, 0, missing), mode="replicate")

    def forward(self, x, aggressiveness=None):
        """Return the masked input (plus two auxiliary outputs while training).

        Args:
            x: Input tensor; indexed as 4-D with channels on axis 1.
            aggressiveness: Optional dict with ``"split_bin"`` and ``"value"``;
                used only outside training mode to raise the mask to a power
                below / from ``split_bin`` on axis 2.
        """
        mix = x.detach()
        x = x.clone()[:, :, : self.max_bin]

        half_band = x.size()[2] // 2
        low_out = self.stg1_low_band_net(x[:, :, :half_band])
        high_out = self.stg1_high_band_net(x[:, :, half_band:])
        aux1 = torch.cat([low_out, high_out], dim=2)

        stage2_in = torch.cat([x, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(stage2_in))

        stage3_in = torch.cat([x, aux1, aux2], dim=1)
        h = self.stg3_full_band_net(self.stg3_bridge(stage3_in))

        mask = self._pad_to_output_bin(torch.sigmoid(self.out(h)))

        if self.training:
            aux1 = self._pad_to_output_bin(torch.sigmoid(self.aux1_out(aux1)))
            aux2 = self._pad_to_output_bin(torch.sigmoid(self.aux2_out(aux2)))
            return mask * mix, aux1 * mix, aux2 * mix

        if aggressiveness:
            split = aggressiveness["split_bin"]
            strength = aggressiveness["value"]
            # A milder exponent below the split, the full one from it upward.
            mask[:, :, :split] = torch.pow(mask[:, :, :split], 1 + strength / 3)
            mask[:, :, split:] = torch.pow(mask[:, :, split:], 1 + strength)

        return mask * mix

    def predict(self, x_mag, aggressiveness=None):
        """Run forward() and trim ``self.offset`` entries off both ends of axis 3."""
        h = self.forward(x_mag, aggressiveness)

        if self.offset > 0:
            h = h[:, :, :, self.offset : -self.offset]
            assert h.size()[3] > 0

        return h
uvr5/lib/lib_v5/spec_utils.py ADDED
@@ -0,0 +1,672 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ import json
3
+ import math
4
+ import os
5
+
6
+ import librosa
7
+ import numpy as np
8
+ import soundfile as sf
9
+ from tqdm import tqdm
10
+
11
+
12
def crop_center(h1, h2):
    """Center-crop ``h1`` along axis 3 so its length there matches ``h2``.

    Both arguments are indexed as 4-D objects exposing ``.size()``. ``h1`` is
    returned unchanged when the axis-3 lengths already agree.

    Raises:
        ValueError: If ``h1`` is shorter than ``h2`` along axis 3.
    """
    have = h1.size()[3]
    want = h2.size()[3]

    if have == want:
        return h1
    if have < want:
        raise ValueError("h1_shape[3] must be greater than h2_shape[3]")

    # Split the surplus evenly; an odd surplus leaves the extra entry at the end.
    begin = (have - want) // 2
    return h1[:, :, :, begin : begin + want]
28
+
29
+
30
def wave_to_spectrogram(
    wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False
):
    """Compute the STFT of a two-channel wave.

    The two channels are first transformed according to the first truthy flag,
    checked in the order ``reverse``, ``mid_side``, ``mid_side_b2``:

    * ``reverse``     - each channel is flipped;
    * ``mid_side``    - ``(w0 + w1) / 2`` and ``w0 - w1``;
    * ``mid_side_b2`` - ``w1 + 0.5 * w0`` and ``w0 - 0.5 * w1``;
    * otherwise       - the channels are used as given.

    Args:
        wave: Indexable with ``wave[0]`` and ``wave[1]`` as the two channels.
        hop_length: STFT hop length.
        n_fft: STFT window size.

    Returns:
        Fortran-ordered array stacking the two channel spectrograms.
    """
    if reverse:
        wave_left = np.flip(np.asfortranarray(wave[0]))
        wave_right = np.flip(np.asfortranarray(wave[1]))
    elif mid_side:
        wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2)
        wave_right = np.asfortranarray(np.subtract(wave[0], wave[1]))
    elif mid_side_b2:
        wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5))
        wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5))
    else:
        wave_left = np.asfortranarray(wave[0])
        wave_right = np.asfortranarray(wave[1])

    # n_fft is passed by keyword: librosa >= 0.10 rejects it as a positional
    # argument, while older releases accept the keyword form too.
    spec_left = librosa.stft(wave_left, n_fft=n_fft, hop_length=hop_length)
    spec_right = librosa.stft(wave_right, n_fft=n_fft, hop_length=hop_length)

    return np.asfortranarray([spec_left, spec_right])
52
+
53
+
54
def wave_to_spectrogram_mt(
    wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False
):
    """Compute the STFT of a two-channel wave, one channel in a worker thread.

    Channel preparation follows the first truthy flag, checked in the order
    ``reverse``, ``mid_side``, ``mid_side_b2``:

    * ``reverse``     - each channel is flipped;
    * ``mid_side``    - ``(w0 + w1) / 2`` and ``w0 - w1``;
    * ``mid_side_b2`` - ``w1 + 0.5 * w0`` and ``w0 - 0.5 * w1``;
    * otherwise       - the channels are used as given.

    Args:
        wave: Indexable with ``wave[0]`` and ``wave[1]`` as the two channels.
        hop_length: STFT hop length.
        n_fft: STFT window size.

    Returns:
        Fortran-ordered array stacking the two channel spectrograms.
    """
    from concurrent.futures import ThreadPoolExecutor

    if reverse:
        wave_left = np.flip(np.asfortranarray(wave[0]))
        wave_right = np.flip(np.asfortranarray(wave[1]))
    elif mid_side:
        wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2)
        wave_right = np.asfortranarray(np.subtract(wave[0], wave[1]))
    elif mid_side_b2:
        wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5))
        wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5))
    else:
        wave_left = np.asfortranarray(wave[0])
        wave_right = np.asfortranarray(wave[1])

    # The worker's result comes back through a future instead of a module
    # global: an exception in the worker is re-raised here (rather than a stale
    # value from an earlier call being reused), and concurrent calls cannot
    # overwrite each other's result.
    with ThreadPoolExecutor(max_workers=1) as pool:
        left_future = pool.submit(
            librosa.stft, y=wave_left, n_fft=n_fft, hop_length=hop_length
        )
        # n_fft by keyword: librosa >= 0.10 rejects it as a positional argument.
        spec_right = librosa.stft(wave_right, n_fft=n_fft, hop_length=hop_length)
        spec_left = left_future.result()

    return np.asfortranarray([spec_left, spec_right])
87
+
88
+
89
def combine_spectrograms(specs, mp):
    """Stack the cropped bin range of every band into one complex64 array.

    Args:
        specs: Mapping from band number (1..N) to an array indexed as
            ``[:, bins, frames]``.
        mp: Object whose ``param`` dict provides ``"bins"``, ``"band"``
            (per-band ``"crop_start"`` / ``"crop_stop"``), ``"pre_filter_start"``
            and ``"pre_filter_stop"``.

    Returns:
        Fortran-ordered array of shape ``(2, bins + 1, frames)``, where
        ``frames`` is the smallest axis-2 length among ``specs``.

    Raises:
        ValueError: If the cropped ranges add up to more than ``bins``.
    """
    params = mp.param
    bands = params["band"]
    band_count = len(bands)

    n_frames = min(specs[key].shape[2] for key in specs)
    combined = np.zeros(shape=(2, params["bins"] + 1, n_frames), dtype=np.complex64)

    # Lay the bands out back to back along the bin axis.
    cursor = 0
    for band in range(1, band_count + 1):
        lo = bands[band]["crop_start"]
        hi = bands[band]["crop_stop"]
        width = hi - lo
        combined[:, cursor : cursor + width, :n_frames] = specs[band][
            :, lo:hi, :n_frames
        ]
        cursor += width

    if cursor > params["bins"]:
        raise ValueError("Too much bins")

    # lowpass filter
    pre_start = params["pre_filter_start"]
    pre_stop = params["pre_filter_stop"]
    if pre_start > 0:
        if band_count == 1:
            combined = fft_lp_filter(combined, pre_start, pre_stop)
        else:
            # Each bin's gain depends on the gain applied to the previous bin.
            prev_gain = 1
            for b in range(pre_start + 1, pre_stop):
                gain = math.pow(10, -(b - pre_start) * (3.5 - prev_gain) / 20.0)
                prev_gain = gain
                combined[:, b, :] *= gain

    return np.asfortranarray(combined)
125
+
126
+
127
def spectrogram_to_image(spec, mode="magnitude"):
    """Convert a spectrogram to a uint8 image scaled to the range 0..255.

    Args:
        spec: Array; complex input is reduced with ``np.abs`` (magnitude mode)
            or ``np.angle`` (phase mode), real input is used as given.
        mode: ``"magnitude"`` (``log10(y**2 + 1e-8)`` is rendered) or
            ``"phase"``.

    Returns:
        uint8 array. For 3-D input the first axis is moved last and the
        per-pixel maximum over it is prepended as an extra channel.

    Raises:
        ValueError: If ``mode`` is not one of the two supported values.
    """
    if mode == "magnitude":
        y = np.abs(spec) if np.iscomplexobj(spec) else spec
        y = np.log10(y**2 + 1e-8)
    elif mode == "phase":
        if np.iscomplexobj(spec):
            y = np.angle(spec)
        else:
            # Copy: the in-place scaling below must not modify the caller's array.
            y = np.array(spec, copy=True)
    else:
        raise ValueError("mode must be 'magnitude' or 'phase', got {!r}".format(mode))

    y -= y.min()
    peak = y.max()
    if peak > 0:
        # A constant image has peak == 0 after the shift; skip the scaling
        # there instead of dividing by zero.
        y *= 255 / peak
    img = np.uint8(y)

    if y.ndim == 3:
        img = img.transpose(1, 2, 0)
        img = np.concatenate([np.max(img, axis=2, keepdims=True), img], axis=2)

    return img
149
+
150
+
151
def reduce_vocal_aggressively(X, y, softmask):
    """Lower the magnitude of ``y`` wherever the residual ``X - y`` is louder.

    Where ``|X - y| > |y|``, ``softmask * |X - y|`` is subtracted from ``|y|``
    (clipped at zero). The phase of ``y`` is kept.

    Returns:
        Complex array with the reduced magnitude and the phase of ``y``.
    """
    residual_mag = np.abs(X - y)
    target_mag = np.abs(y)

    residual_dominates = residual_mag > target_mag
    reduction = residual_mag * residual_dominates * softmask
    reduced_mag = np.clip(target_mag - reduction, 0, np.inf)

    phase = np.exp(1.0j * np.angle(y))
    return reduced_mag * phase
160
+
161
+
162
def mask_silence(mag, ref, thres=0.2, min_range=64, fade_size=32):
    """Add ``ref`` back into ``mag`` over long runs of low-level frames.

    A frame (index on axis 2) counts as low-level when the mean of ``ref`` over
    axes 0 and 1 is below ``thres``. For every run of consecutive low-level
    frames longer than ``min_range``, ``ref`` is added to a copy of ``mag``,
    with linear fades of ``fade_size`` frames at run edges that do not touch
    the start / end of the array.

    Args:
        mag: Array indexed as ``[:, :, frames]``; it is not modified.
        ref: Array of the same layout used for both detection and filling.
        thres: Threshold on the per-frame mean of ``ref``.
        min_range: Minimum run length (in frames) for a run to be processed.
        fade_size: Length of the fade-in / fade-out ramps.

    Returns:
        A modified copy of ``mag``.

    Raises:
        ValueError: If ``min_range < fade_size * 2``.
    """
    if min_range < fade_size * 2:
        raise ValueError("min_range must be >= fade_size * 2")

    mag = mag.copy()

    idx = np.where(ref.mean(axis=(0, 1)) < thres)[0]
    if idx.size == 0:
        # No frame is below the threshold: nothing to fill in. (Indexing
        # idx[0] below would otherwise raise IndexError.)
        return mag

    # Split the low-level frame indices into runs of consecutive frames.
    breaks = np.where(np.diff(idx) != 1)[0]
    starts = np.insert(idx[breaks + 1], 0, idx[0])
    ends = np.append(idx[breaks], idx[-1])
    uninformative = np.where(ends - starts > min_range)[0]
    if len(uninformative) > 0:
        starts = starts[uninformative]
        ends = ends[uninformative]
        old_e = None
        for s, e in zip(starts, ends):
            if old_e is not None and s - old_e < fade_size:
                s = old_e - fade_size * 2

            if s != 0:
                weight = np.linspace(0, 1, fade_size)
                mag[:, :, s : s + fade_size] += weight * ref[:, :, s : s + fade_size]
            else:
                # Run starts at frame 0: no fade-in; shift s so the full-weight
                # slice below begins at frame 0.
                s -= fade_size

            if e != mag.shape[2]:
                weight = np.linspace(1, 0, fade_size)
                mag[:, :, e - fade_size : e] += weight * ref[:, :, e - fade_size : e]
            else:
                e += fade_size

            mag[:, :, s + fade_size : e - fade_size] += ref[
                :, :, s + fade_size : e - fade_size
            ]
            old_e = e

    return mag
198
+
199
+
200
def align_wave_head_and_tail(a, b):
    """Trim two waves to their common length along axis 1.

    The common length is the smaller of ``a[0].size`` and ``b[0].size``.

    Returns:
        Tuple ``(a_trimmed, b_trimmed)``; axis 0 is left untouched.
    """
    n_samples = min(a[0].size, b[0].size)

    # Only axis 1 is trimmed. Slicing axis 0 by the sample count as well would
    # drop rows whenever the common length is smaller than the row count.
    return a[:, :n_samples], b[:, :n_samples]
204
+
205
+
206
def cache_or_load(mix_path, inst_path, mp):
    """Return the combined spectrograms of two audio files, cached on disk.

    Results are stored as ``.npy`` files under ``cache/mph<sha1>/``, where the
    SHA-1 is computed over the JSON dump of ``mp.param``; the file names are
    the base names of the two inputs. When both cache files exist they are
    loaded; otherwise the spectrograms are computed and saved.

    Args:
        mix_path: Path of the first audio file.
        inst_path: Path of the second audio file.
        mp: Object whose ``param`` dict describes the bands (``"sr"``,
            ``"hl"``, ``"n_fft"``, ``"res_type"``) and the
            ``"mid_side"`` / ``"mid_side_b2"`` / ``"reverse"`` flags.

    Returns:
        Tuple ``(X_spec_m, y_spec_m)`` of combined spectrograms.

    Raises:
        ValueError: If the two combined spectrograms differ in shape.
    """
    mix_basename = os.path.splitext(os.path.basename(mix_path))[0]
    inst_basename = os.path.splitext(os.path.basename(inst_path))[0]

    param_hash = hashlib.sha1(
        json.dumps(mp.param, sort_keys=True).encode("utf-8")
    ).hexdigest()
    # Both files share one directory, keyed by the parameter hash.
    cache_dir = os.path.join("cache", "mph{}".format(param_hash))
    os.makedirs(cache_dir, exist_ok=True)

    mix_cache_path = os.path.join(cache_dir, mix_basename + ".npy")
    inst_cache_path = os.path.join(cache_dir, inst_basename + ".npy")

    if os.path.exists(mix_cache_path) and os.path.exists(inst_cache_path):
        return np.load(mix_cache_path), np.load(inst_cache_path)

    X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
    bands_n = len(mp.param["band"])

    # Work from the highest band down: lower bands are resampled from the band
    # directly above them.
    for d in range(bands_n, 0, -1):
        bp = mp.param["band"][d]

        if d == bands_n:  # high-end band
            # Keyword arguments: librosa >= 0.10 made everything after the
            # first parameter keyword-only; older releases accept these too.
            X_wave[d], _ = librosa.load(
                mix_path,
                sr=bp["sr"],
                mono=False,
                dtype=np.float32,
                res_type=bp["res_type"],
            )
            y_wave[d], _ = librosa.load(
                inst_path,
                sr=bp["sr"],
                mono=False,
                dtype=np.float32,
                res_type=bp["res_type"],
            )
        else:  # lower bands
            upper_sr = mp.param["band"][d + 1]["sr"]
            X_wave[d] = librosa.resample(
                X_wave[d + 1],
                orig_sr=upper_sr,
                target_sr=bp["sr"],
                res_type=bp["res_type"],
            )
            y_wave[d] = librosa.resample(
                y_wave[d + 1],
                orig_sr=upper_sr,
                target_sr=bp["sr"],
                res_type=bp["res_type"],
            )

        X_wave[d], y_wave[d] = align_wave_head_and_tail(X_wave[d], y_wave[d])

        X_spec_s[d] = wave_to_spectrogram(
            X_wave[d],
            bp["hl"],
            bp["n_fft"],
            mp.param["mid_side"],
            mp.param["mid_side_b2"],
            mp.param["reverse"],
        )
        y_spec_s[d] = wave_to_spectrogram(
            y_wave[d],
            bp["hl"],
            bp["n_fft"],
            mp.param["mid_side"],
            mp.param["mid_side_b2"],
            mp.param["reverse"],
        )

    # Release the raw waves before building the combined spectrograms.
    del X_wave, y_wave

    X_spec_m = combine_spectrograms(X_spec_s, mp)
    y_spec_m = combine_spectrograms(y_spec_s, mp)

    if X_spec_m.shape != y_spec_m.shape:
        raise ValueError("The combined spectrograms are different: " + mix_path)

    np.save(mix_cache_path, X_spec_m)
    np.save(inst_cache_path, y_spec_m)

    return X_spec_m, y_spec_m
289
+
290
+
291
def spectrogram_to_wave(spec, hop_length, mid_side, mid_side_b2, reverse):
    """Inverse-STFT a two-channel spectrogram and recombine the channels.

    With ``a`` and ``b`` the inverse STFTs of ``spec[0]`` and ``spec[1]``, the
    result follows the first truthy flag, checked in the order ``reverse``,
    ``mid_side``, ``mid_side_b2``:

    * ``reverse``     - ``[flip(a), flip(b)]``;
    * ``mid_side``    - ``[a + b / 2, a - b / 2]``;
    * ``mid_side_b2`` - ``[b / 1.25 + 0.4 * a, a / 1.25 - 0.4 * b]``;
    * otherwise       - ``[a, b]``.

    Returns:
        Fortran-ordered array holding the two output channels.
    """
    first = librosa.istft(np.asfortranarray(spec[0]), hop_length=hop_length)
    second = librosa.istft(np.asfortranarray(spec[1]), hop_length=hop_length)

    if reverse:
        channels = [np.flip(first), np.flip(second)]
    elif mid_side:
        channels = [np.add(first, second / 2), np.subtract(first, second / 2)]
    elif mid_side_b2:
        channels = [
            np.add(second / 1.25, 0.4 * first),
            np.subtract(first / 1.25, 0.4 * second),
        ]
    else:
        channels = [first, second]

    return np.asfortranarray(channels)
313
+
314
+
315
def spectrogram_to_wave_mt(spec, hop_length, mid_side, reverse, mid_side_b2):
    """Inverse-STFT a two-channel spectrogram, one channel in a worker thread.

    NOTE: the positional order here is ``(mid_side, reverse, mid_side_b2)``,
    which differs from ``spectrogram_to_wave``; it is kept for compatibility.

    With ``a`` and ``b`` the inverse STFTs of ``spec[0]`` and ``spec[1]``, the
    result follows the first truthy flag, checked in the order ``reverse``,
    ``mid_side``, ``mid_side_b2``:

    * ``reverse``     - ``[flip(a), flip(b)]``;
    * ``mid_side``    - ``[a + b / 2, a - b / 2]``;
    * ``mid_side_b2`` - ``[b / 1.25 + 0.4 * a, a / 1.25 - 0.4 * b]``;
    * otherwise       - ``[a, b]``.

    Returns:
        Fortran-ordered array holding the two output channels.
    """
    from concurrent.futures import ThreadPoolExecutor

    spec_left = np.asfortranarray(spec[0])
    spec_right = np.asfortranarray(spec[1])

    # The worker's result comes back through a future instead of a module
    # global: an exception in the worker is re-raised here (rather than a stale
    # value from an earlier call being reused), and concurrent calls cannot
    # overwrite each other's result.
    with ThreadPoolExecutor(max_workers=1) as pool:
        left_future = pool.submit(
            librosa.istft, stft_matrix=spec_left, hop_length=hop_length
        )
        wave_right = librosa.istft(spec_right, hop_length=hop_length)
        wave_left = left_future.result()

    if reverse:
        return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)])
    elif mid_side:
        return np.asfortranarray(
            [np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]
        )
    elif mid_side_b2:
        return np.asfortranarray(
            [
                np.add(wave_right / 1.25, 0.4 * wave_left),
                np.subtract(wave_left / 1.25, 0.4 * wave_right),
            ]
        )
    else:
        return np.asfortranarray([wave_left, wave_right])
347
+
348
+
349
def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None):
    """Convert a combined multi-band spectrogram back into one waveform.

    Bands are processed from 1 upward. Each band's slice of ``spec_m`` is put
    back at its ``crop_start:crop_stop`` position in a full-size spectrogram,
    filtered, inverse-transformed, added to the wave accumulated so far, and
    (for every band except the last) resampled to the next band's sample rate.

    Args:
        spec_m: Combined spectrogram indexed as ``[:, bins, frames]``.
        mp: Object whose ``param`` dict provides the per-band settings and the
            ``"mid_side"`` / ``"mid_side_b2"`` / ``"reverse"`` flags.
        extra_bins_h: Optional count of bins to overwrite just below
            ``n_fft // 2`` in the last band.
        extra_bins: Source array for those bins; read only when
            ``extra_bins_h`` is truthy.

    Returns:
        The transpose of the accumulated wave array.
    """
    wave_band = {}  # never read in this function; kept from the original
    bands_n = len(mp.param["band"])
    offset = 0

    def band_wave(band_spec, band_param):
        """Inverse-transform one band using the model's channel flags."""
        return spectrogram_to_wave(
            band_spec,
            band_param["hl"],
            mp.param["mid_side"],
            mp.param["mid_side_b2"],
            mp.param["reverse"],
        )

    for d in range(1, bands_n + 1):
        bp = mp.param["band"][d]
        # Zero-initialised: bins outside crop_start:crop_stop that no filter
        # below clears must be silent, not whatever the allocator left behind
        # (np.ndarray(...) returns uninitialised memory).
        spec_s = np.zeros(
            shape=(2, bp["n_fft"] // 2 + 1, spec_m.shape[2]), dtype=complex
        )
        h = bp["crop_stop"] - bp["crop_start"]
        spec_s[:, bp["crop_start"] : bp["crop_stop"], :] = spec_m[
            :, offset : offset + h, :
        ]

        offset += h
        if d == bands_n:  # higher
            if extra_bins_h:  # if --high_end_process bypass
                max_bin = bp["n_fft"] // 2
                spec_s[:, max_bin - extra_bins_h : max_bin, :] = extra_bins[
                    :, :extra_bins_h, :
                ]
            if bp["hpf_start"] > 0:
                spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1)
            if bands_n == 1:
                wave = band_wave(spec_s, bp)
            else:
                wave = np.add(wave, band_wave(spec_s, bp))
        else:
            sr = mp.param["band"][d + 1]["sr"]
            if d == 1:  # lower
                spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"])
                # orig_sr / target_sr by keyword: librosa >= 0.10 rejects them
                # as positional arguments.
                wave = librosa.resample(
                    band_wave(spec_s, bp),
                    orig_sr=bp["sr"],
                    target_sr=sr,
                    res_type="sinc_fastest",
                )
            else:  # mid
                spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1)
                spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"])
                wave2 = np.add(wave, band_wave(spec_s, bp))
                wave = librosa.resample(
                    wave2, orig_sr=bp["sr"], target_sr=sr, res_type="scipy"
                )

    return wave.T
425
+
426
+
427
def fft_lp_filter(spec, bin_start, bin_stop):
    """Apply a linear low-pass ramp to ``spec`` along axis 1, in place.

    The gain steps down linearly over bins ``bin_start .. bin_stop - 1``
    (reaching 0 on the last of them); every bin from ``bin_stop`` upward is
    multiplied by 0.

    Returns:
        The same ``spec`` object, modified in place.
    """
    step = 1 / (bin_stop - bin_start)
    gain = 1.0
    for bin_idx in range(bin_start, bin_stop):
        gain -= step
        spec[:, bin_idx, :] = gain * spec[:, bin_idx, :]

    spec[:, bin_stop:, :] *= 0

    return spec
436
+
437
+
438
def fft_hp_filter(spec, bin_start, bin_stop):
    """Apply a linear high-pass ramp to ``spec`` along axis 1, in place.

    Walking downward from ``bin_start`` to ``bin_stop + 1``, the gain steps
    down linearly (reaching 0 on the last of them); bins ``0 .. bin_stop`` are
    multiplied by 0.

    Returns:
        The same ``spec`` object, modified in place.
    """
    step = 1 / (bin_start - bin_stop)
    gain = 1.0
    for bin_idx in range(bin_start, bin_stop, -1):
        gain -= step
        spec[:, bin_idx, :] = gain * spec[:, bin_idx, :]

    spec[:, 0 : bin_stop + 1, :] *= 0

    return spec
447
+
448
+
449
def mirroring(a, spec_m, input_high_end, mp):
    """Limit ``input_high_end`` by a mirrored copy of lower bins of ``spec_m``.

    The mirror is the magnitude of the ``input_high_end.shape[1]`` bins of
    ``spec_m`` that end 10 bins below ``mp.param["pre_filter_start"]``, flipped
    along axis 1.

    Args:
        a: ``"mirroring"`` - the mirror takes the phase of ``input_high_end``;
           ``"mirroring2"`` - the mirror is multiplied by
           ``input_high_end * 1.7``. Any other value returns ``None``.
        spec_m: Array indexed as ``[:, bins, :]``.
        input_high_end: Array compared elementwise against the mirror.
        mp: Object whose ``param`` dict provides ``"pre_filter_start"``.

    Returns:
        Elementwise, ``input_high_end`` where its magnitude does not exceed the
        candidate's magnitude, otherwise the candidate.
    """
    if a not in ("mirroring", "mirroring2"):
        return None

    stop = mp.param["pre_filter_start"] - 10
    start = stop - input_high_end.shape[1]
    mirror = np.flip(np.abs(spec_m[:, start:stop, :]), 1)

    if a == "mirroring":
        candidate = mirror * np.exp(1.0j * np.angle(input_high_end))
    else:
        candidate = np.multiply(mirror, input_high_end * 1.7)

    keep_input = np.abs(input_high_end) <= np.abs(candidate)
    return np.where(keep_input, input_high_end, candidate)
487
+
488
+
489
def ensembling(a, specs):
    """Merge several spectrograms elementwise by smallest or largest magnitude.

    Starting from ``specs[0]``, every further entry is trimmed to the common
    length along axis 2 and merged into the running result.

    Args:
        a: ``"min_mag"`` keeps the element with the smaller magnitude,
           ``"max_mag"`` the larger. Any other value only trims the result to
           the common length.
        specs: List, or dict keyed ``0..N-1``, of arrays indexed as
            ``[:, :, frames]``. It is not modified.

    Returns:
        The merged spectrogram; ``specs[0]`` itself when it is the only entry.

    Raises:
        ValueError: If ``specs`` is empty.
    """
    if len(specs) == 0:
        raise ValueError("ensembling requires at least one spectrogram")

    # Seeding before the loop means a single input is returned as-is instead of
    # failing on an unbound local.
    spec = specs[0]
    for i in range(1, len(specs)):
        other = specs[i]
        ln = min(spec.shape[2], other.shape[2])
        spec = spec[:, :, :ln]
        # Trim a local view only; the caller's container stays untouched.
        other = other[:, :, :ln]

        if a == "min_mag":
            spec = np.where(np.abs(other) <= np.abs(spec), other, spec)
        elif a == "max_mag":
            spec = np.where(np.abs(other) >= np.abs(spec), other, spec)

    return spec
504
+
505
+
506
def stft(wave, nfft, hl):
    """Compute the STFT of both channels of ``wave``.

    Args:
        wave: Indexable with ``wave[0]`` and ``wave[1]`` as the two channels.
        nfft: STFT window size.
        hl: STFT hop length.

    Returns:
        Fortran-ordered array stacking the two channel spectrograms.
    """
    wave_left = np.asfortranarray(wave[0])
    wave_right = np.asfortranarray(wave[1])
    # n_fft is passed by keyword: librosa >= 0.10 rejects it as a positional
    # argument, while older releases accept the keyword form too.
    spec_left = librosa.stft(wave_left, n_fft=nfft, hop_length=hl)
    spec_right = librosa.stft(wave_right, n_fft=nfft, hop_length=hl)
    spec = np.asfortranarray([spec_left, spec_right])

    return spec
514
+
515
+
516
def istft(spec, hl):
    """Compute the inverse STFT of both channels of ``spec``.

    Args:
        spec: Indexable with ``spec[0]`` and ``spec[1]`` as the two channel
            spectrograms.
        hl: Hop length used for the inverse transform.

    Returns:
        Fortran-ordered array stacking the two channel waves.
    """
    spec_left = np.asfortranarray(spec[0])
    spec_right = np.asfortranarray(spec[1])

    wave_left = librosa.istft(spec_left, hop_length=hl)
    wave_right = librosa.istft(spec_right, hop_length=hl)
    wave = np.asfortranarray([wave_left, wave_right])

    # The result was previously computed and then dropped (implicit None).
    return wave
523
+
524
+
525
if __name__ == "__main__":
    # Command-line tool: combine two or more audio files in the spectrogram
    # domain using one of several algorithms and write the result as WAV
    # (plus PNG spectrogram images for the invert algorithms).
    import argparse
    import subprocess
    import sys
    import time

    import cv2
    from model_param_init import ModelParameters

    p = argparse.ArgumentParser()
    p.add_argument(
        "--algorithm",
        "-a",
        type=str,
        choices=["invert", "invert_p", "min_mag", "max_mag", "deep", "align"],
        default="min_mag",
    )
    p.add_argument(
        "--model_params",
        "-m",
        type=str,
        default=os.path.join("modelparams", "1band_sr44100_hl512.json"),
    )
    p.add_argument("--output_name", "-o", type=str, default="output")
    p.add_argument("--vocals_only", "-v", action="store_true")
    p.add_argument("input", nargs="+")
    args = p.parse_args()

    start_time = time.time()

    if args.algorithm.startswith("invert") and len(args.input) != 2:
        raise ValueError("There should be two input files.")

    if not args.algorithm.startswith("invert") and len(args.input) < 2:
        raise ValueError("There must be at least two input files.")

    wave, specs = {}, {}
    mp = ModelParameters(args.model_params)
    bands_n = len(mp.param["band"])

    for i, input_path in enumerate(args.input):
        spec = {}

        # Highest band first: lower bands are resampled from the band above.
        for d in range(bands_n, 0, -1):
            bp = mp.param["band"][d]

            if d == bands_n:  # high-end band
                # Keyword arguments: librosa >= 0.10 made everything after the
                # first parameter keyword-only.
                wave[d], _ = librosa.load(
                    input_path,
                    sr=bp["sr"],
                    mono=False,
                    dtype=np.float32,
                    res_type=bp["res_type"],
                )

                if len(wave[d].shape) == 1:  # mono to stereo
                    wave[d] = np.array([wave[d], wave[d]])
            else:  # lower bands
                wave[d] = librosa.resample(
                    wave[d + 1],
                    orig_sr=mp.param["band"][d + 1]["sr"],
                    target_sr=bp["sr"],
                    res_type=bp["res_type"],
                )

            spec[d] = wave_to_spectrogram(
                wave[d],
                bp["hl"],
                bp["n_fft"],
                mp.param["mid_side"],
                mp.param["mid_side_b2"],
                mp.param["reverse"],
            )

        specs[i] = combine_spectrograms(spec, mp)

    del wave

    if args.algorithm == "deep":
        # Trim both inputs to a common length before comparing elementwise.
        ln = min([specs[0].shape[2], specs[1].shape[2]])
        specs[0] = specs[0][:, :, :ln]
        specs[1] = specs[1][:, :, :ln]
        # Compare against specs[1] (the second input's combined spectrogram),
        # not spec[1], which is one band of the last input processed.
        d_spec = np.where(np.abs(specs[0]) <= np.abs(specs[1]), specs[0], specs[1])
        v_spec = d_spec - specs[1]
        sf.write(
            os.path.join("{}.wav".format(args.output_name)),
            cmb_spectrogram_to_wave(v_spec, mp),
            mp.param["sr"],
        )

    if args.algorithm.startswith("invert"):
        ln = min([specs[0].shape[2], specs[1].shape[2]])
        specs[0] = specs[0][:, :, :ln]
        specs[1] = specs[1][:, :, :ln]

        if "invert_p" == args.algorithm:
            X_mag = np.abs(specs[0])
            y_mag = np.abs(specs[1])
            max_mag = np.where(X_mag >= y_mag, X_mag, y_mag)
            v_spec = specs[1] - max_mag * np.exp(1.0j * np.angle(specs[0]))
        else:
            specs[1] = reduce_vocal_aggressively(specs[0], specs[1], 0.2)
            v_spec = specs[0] - specs[1]

            if not args.vocals_only:
                X_mag = np.abs(specs[0])
                y_mag = np.abs(specs[1])
                v_mag = np.abs(v_spec)

                X_image = spectrogram_to_image(X_mag)
                y_image = spectrogram_to_image(y_mag)
                v_image = spectrogram_to_image(v_mag)

                cv2.imwrite("{}_X.png".format(args.output_name), X_image)
                cv2.imwrite("{}_y.png".format(args.output_name), y_image)
                cv2.imwrite("{}_v.png".format(args.output_name), v_image)

                sf.write(
                    "{}_X.wav".format(args.output_name),
                    cmb_spectrogram_to_wave(specs[0], mp),
                    mp.param["sr"],
                )
                sf.write(
                    "{}_y.wav".format(args.output_name),
                    cmb_spectrogram_to_wave(specs[1], mp),
                    mp.param["sr"],
                )

        sf.write(
            "{}_v.wav".format(args.output_name),
            cmb_spectrogram_to_wave(v_spec, mp),
            mp.param["sr"],
        )
    else:
        if not args.algorithm == "deep":
            # Create the output directory up front so the write cannot fail on
            # a missing folder.
            os.makedirs("ensembled", exist_ok=True)
            sf.write(
                os.path.join("ensembled", "{}.wav".format(args.output_name)),
                cmb_spectrogram_to_wave(ensembling(args.algorithm, specs), mp),
                mp.param["sr"],
            )

    if args.algorithm == "align":
        trackalignment = [
            {
                "file1": args.input[0],
                "file2": args.input[1],
            }
        ]

        for i, e in tqdm(enumerate(trackalignment), desc="Performing Alignment..."):
            # Argument list instead of a shell string: file names containing
            # quotes or shell metacharacters are passed through verbatim. As
            # with os.system, a failing helper does not raise here.
            subprocess.run(
                [sys.executable, "lib/align_tracks.py", e["file1"], e["file2"]],
                check=False,
            )

    # print('Total time: {0:.{1}f}s'.format(time.time() - start_time, 1))
uvr5/lib/name_params.json ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "equivalent" : [
3
+ {
4
+ "model_hash_name" : [
5
+ {
6
+ "hash_name": "47939caf0cfe52a0e81442b85b971dfd",
7
+ "model_params": "lib/lib_v5/modelparams/4band_44100.json",
8
+ "param_name": "4band_44100"
9
+ },
10
+ {
11
+ "hash_name": "4e4ecb9764c50a8c414fee6e10395bbe",
12
+ "model_params": "lib/lib_v5/modelparams/4band_v2.json",
13
+ "param_name": "4band_v2"
14
+ },
15
+ {
16
+ "hash_name": "ca106edd563e034bde0bdec4bb7a4b36",
17
+ "model_params": "lib/lib_v5/modelparams/4band_v2.json",
18
+ "param_name": "4band_v2"
19
+ },
20
+ {
21
+ "hash_name": "e60a1e84803ce4efc0a6551206cc4b71",
22
+ "model_params": "lib/lib_v5/modelparams/4band_44100.json",
23
+ "param_name": "4band_44100"
24
+ },
25
+ {
26
+ "hash_name": "a82f14e75892e55e994376edbf0c8435",
27
+ "model_params": "lib/lib_v5/modelparams/4band_44100.json",
28
+ "param_name": "4band_44100"
29
+ },
30
+ {
31
+ "hash_name": "6dd9eaa6f0420af9f1d403aaafa4cc06",
32
+ "model_params": "lib/lib_v5/modelparams/4band_v2_sn.json",
33
+ "param_name": "4band_v2_sn"
34
+ },
35
+ {
36
+ "hash_name": "08611fb99bd59eaa79ad27c58d137727",
37
+ "model_params": "lib/lib_v5/modelparams/4band_v2_sn.json",
38
+ "param_name": "4band_v2_sn"
39
+ },
40
+ {
41
+ "hash_name": "5c7bbca45a187e81abbbd351606164e5",
42
+ "model_params": "lib/lib_v5/modelparams/3band_44100_msb2.json",
43
+ "param_name": "3band_44100_msb2"
44
+ },
45
+ {
46
+ "hash_name": "d6b2cb685a058a091e5e7098192d3233",
47
+ "model_params": "lib/lib_v5/modelparams/3band_44100_msb2.json",
48
+ "param_name": "3band_44100_msb2"
49
+ },
50
+ {
51
+ "hash_name": "c1b9f38170a7c90e96f027992eb7c62b",
52
+ "model_params": "lib/lib_v5/modelparams/4band_44100.json",
53
+ "param_name": "4band_44100"
54
+ },
55
+ {
56
+ "hash_name": "c3448ec923fa0edf3d03a19e633faa53",
57
+ "model_params": "lib/lib_v5/modelparams/4band_44100.json",
58
+ "param_name": "4band_44100"
59
+ },
60
+ {
61
+ "hash_name": "68aa2c8093d0080704b200d140f59e54",
62
+ "model_params": "lib/lib_v5/modelparams/3band_44100.json",
63
+ "param_name": "3band_44100"
64
+ },
65
+ {
66
+ "hash_name": "fdc83be5b798e4bd29fe00fe6600e147",
67
+ "model_params": "lib/lib_v5/modelparams/3band_44100_mid.json",
68
+ "param_name": "3band_44100_mid.json"
69
+ },
70
+ {
71
+ "hash_name": "2ce34bc92fd57f55db16b7a4def3d745",
72
+ "model_params": "lib/lib_v5/modelparams/3band_44100_mid.json",
73
+ "param_name": "3band_44100_mid.json"
74
+ },
75
+ {
76
+ "hash_name": "52fdca89576f06cf4340b74a4730ee5f",
77
+ "model_params": "lib/lib_v5/modelparams/4band_44100.json",
78
+ "param_name": "4band_44100.json"
79
+ },
80
+ {
81
+ "hash_name": "41191165b05d38fc77f072fa9e8e8a30",
82
+ "model_params": "lib/lib_v5/modelparams/4band_44100.json",
83
+ "param_name": "4band_44100.json"
84
+ },
85
+ {
86
+ "hash_name": "89e83b511ad474592689e562d5b1f80e",
87
+ "model_params": "lib/lib_v5/modelparams/2band_32000.json",
88
+ "param_name": "2band_32000.json"
89
+ },
90
+ {
91
+ "hash_name": "0b954da81d453b716b114d6d7c95177f",
92
+ "model_params": "lib/lib_v5/modelparams/2band_32000.json",
93
+ "param_name": "2band_32000.json"
94
+ }
95
+
96
+ ],
97
+ "v4 Models": [
98
+ {
99
+ "hash_name": "6a00461c51c2920fd68937d4609ed6c8",
100
+ "model_params": "lib/lib_v5/modelparams/1band_sr16000_hl512.json",
101
+ "param_name": "1band_sr16000_hl512"
102
+ },
103
+ {
104
+ "hash_name": "0ab504864d20f1bd378fe9c81ef37140",
105
+ "model_params": "lib/lib_v5/modelparams/1band_sr32000_hl512.json",
106
+ "param_name": "1band_sr32000_hl512"
107
+ },
108
+ {
109
+ "hash_name": "7dd21065bf91c10f7fccb57d7d83b07f",
110
+ "model_params": "lib/lib_v5/modelparams/1band_sr32000_hl512.json",
111
+ "param_name": "1band_sr32000_hl512"
112
+ },
113
+ {
114
+ "hash_name": "80ab74d65e515caa3622728d2de07d23",
115
+ "model_params": "lib/lib_v5/modelparams/1band_sr32000_hl512.json",
116
+ "param_name": "1band_sr32000_hl512"
117
+ },
118
+ {
119
+ "hash_name": "edc115e7fc523245062200c00caa847f",
120
+ "model_params": "lib/lib_v5/modelparams/1band_sr33075_hl384.json",
121
+ "param_name": "1band_sr33075_hl384"
122
+ },
123
+ {
124
+ "hash_name": "28063e9f6ab5b341c5f6d3c67f2045b7",
125
+ "model_params": "lib/lib_v5/modelparams/1band_sr33075_hl384.json",
126
+ "param_name": "1band_sr33075_hl384"
127
+ },
128
+ {
129
+ "hash_name": "b58090534c52cbc3e9b5104bad666ef2",
130
+ "model_params": "lib/lib_v5/modelparams/1band_sr44100_hl512.json",
131
+ "param_name": "1band_sr44100_hl512"
132
+ },
133
+ {
134
+ "hash_name": "0cdab9947f1b0928705f518f3c78ea8f",
135
+ "model_params": "lib/lib_v5/modelparams/1band_sr44100_hl512.json",
136
+ "param_name": "1band_sr44100_hl512"
137
+ },
138
+ {
139
+ "hash_name": "ae702fed0238afb5346db8356fe25f13",
140
+ "model_params": "lib/lib_v5/modelparams/1band_sr44100_hl1024.json",
141
+ "param_name": "1band_sr44100_hl1024"
142
+ }
143
+ ]
144
+ }
145
+ ],
146
+ "User Models" : [
147
+ {
148
+ "1 Band": [
149
+ {
150
+ "hash_name": "1band_sr16000_hl512",
151
+ "model_params": "lib/lib_v5/modelparams/1band_sr16000_hl512.json",
152
+ "param_name": "1band_sr16000_hl512"
153
+ },
154
+ {
155
+ "hash_name": "1band_sr32000_hl512",
156
+ "model_params": "lib/lib_v5/modelparams/1band_sr32000_hl512.json",
157
+ "param_name": "1band_sr32000_hl512"
158
+ },
159
+ {
160
+ "hash_name": "1band_sr33075_hl384",
161
+ "model_params": "lib/lib_v5/modelparams/1band_sr33075_hl384.json",
162
+ "param_name": "1band_sr33075_hl384"
163
+ },
164
+ {
165
+ "hash_name": "1band_sr44100_hl256",
166
+ "model_params": "lib/lib_v5/modelparams/1band_sr44100_hl256.json",
167
+ "param_name": "1band_sr44100_hl256"
168
+ },
169
+ {
170
+ "hash_name": "1band_sr44100_hl512",
171
+ "model_params": "lib/lib_v5/modelparams/1band_sr44100_hl512.json",
172
+ "param_name": "1band_sr44100_hl512"
173
+ },
174
+ {
175
+ "hash_name": "1band_sr44100_hl1024",
176
+ "model_params": "lib/lib_v5/modelparams/1band_sr44100_hl1024.json",
177
+ "param_name": "1band_sr44100_hl1024"
178
+ }
179
+ ],
180
+ "2 Band": [
181
+ {
182
+ "hash_name": "2band_44100_lofi",
183
+ "model_params": "lib/lib_v5/modelparams/2band_44100_lofi.json",
184
+ "param_name": "2band_44100_lofi"
185
+ },
186
+ {
187
+ "hash_name": "2band_32000",
188
+ "model_params": "lib/lib_v5/modelparams/2band_32000.json",
189
+ "param_name": "2band_32000"
190
+ },
191
+ {
192
+ "hash_name": "2band_48000",
193
+ "model_params": "lib/lib_v5/modelparams/2band_48000.json",
194
+ "param_name": "2band_48000"
195
+ }
196
+ ],
197
+ "3 Band": [
198
+ {
199
+ "hash_name": "3band_44100",
200
+ "model_params": "lib/lib_v5/modelparams/3band_44100.json",
201
+ "param_name": "3band_44100"
202
+ },
203
+ {
204
+ "hash_name": "3band_44100_mid",
205
+ "model_params": "lib/lib_v5/modelparams/3band_44100_mid.json",
206
+ "param_name": "3band_44100_mid"
207
+ },
208
+ {
209
+ "hash_name": "3band_44100_msb2",
210
+ "model_params": "lib/lib_v5/modelparams/3band_44100_msb2.json",
211
+ "param_name": "3band_44100_msb2"
212
+ }
213
+ ],
214
+ "4 Band": [
215
+ {
216
+ "hash_name": "4band_44100",
217
+ "model_params": "lib/lib_v5/modelparams/4band_44100.json",
218
+ "param_name": "4band_44100"
219
+ },
220
+ {
221
+ "hash_name": "4band_44100_mid",
222
+ "model_params": "lib/lib_v5/modelparams/4band_44100_mid.json",
223
+ "param_name": "4band_44100_mid"
224
+ },
225
+ {
226
+ "hash_name": "4band_44100_msb",
227
+ "model_params": "lib/lib_v5/modelparams/4band_44100_msb.json",
228
+ "param_name": "4band_44100_msb"
229
+ },
230
+ {
231
+ "hash_name": "4band_44100_msb2",
232
+ "model_params": "lib/lib_v5/modelparams/4band_44100_msb2.json",
233
+ "param_name": "4band_44100_msb2"
234
+ },
235
+ {
236
+ "hash_name": "4band_44100_reverse",
237
+ "model_params": "lib/lib_v5/modelparams/4band_44100_reverse.json",
238
+ "param_name": "4band_44100_reverse"
239
+ },
240
+ {
241
+ "hash_name": "4band_44100_sw",
242
+ "model_params": "lib/lib_v5/modelparams/4band_44100_sw.json",
243
+ "param_name": "4band_44100_sw"
244
+ },
245
+ {
246
+ "hash_name": "4band_v2",
247
+ "model_params": "lib/lib_v5/modelparams/4band_v2.json",
248
+ "param_name": "4band_v2"
249
+ },
250
+ {
251
+ "hash_name": "4band_v2_sn",
252
+ "model_params": "lib/lib_v5/modelparams/4band_v2_sn.json",
253
+ "param_name": "4band_v2_sn"
254
+ },
255
+ {
256
+ "hash_name": "tmodelparam",
257
+ "model_params": "lib/lib_v5/modelparams/tmodelparam.json",
258
+ "param_name": "User Model Param Set"
259
+ }
260
+ ]
261
+ }
262
+ ]
263
+ }
uvr5/lib/utils.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+ import numpy as np
4
+ import torch
5
+ from tqdm import tqdm
6
+
7
+
8
def load_data(file_name: str = "./lib/name_params.json") -> dict:
    """Read a JSON file and return its decoded content.

    Args:
        file_name: Path of the JSON file. The default is relative to the
            current working directory.

    Returns:
        The object produced by ``json.load`` for that file.
    """
    # JSON is UTF-8 by specification; do not rely on the platform's locale
    # encoding when decoding it.
    with open(file_name, "r", encoding="utf-8") as f:
        return json.load(f)
13
+
14
+
15
def make_padding(width, cropsize, offset):
    """Compute left/right padding and the stride used for windowed inference.

    Args:
        width: Number of frames to cover.
        cropsize: Width of one window.
        offset: Number of frames reserved on each side of a window.

    Returns:
        A ``(left, right, roi_size)`` tuple where ``left`` equals ``offset``,
        ``roi_size`` is ``cropsize - 2 * offset`` (or ``cropsize`` when that
        difference is zero) and ``right`` is
        ``roi_size - (width % roi_size) + left``.

    Raises:
        ValueError: If ``offset`` is so large that the stride would be
            negative.
    """
    left = offset
    roi_size = cropsize - left * 2
    if roi_size == 0:
        roi_size = cropsize
    if roi_size < 0:
        # A negative stride cannot tile the input; fail here with a clear
        # message instead of returning values that break the caller later.
        raise ValueError(
            "offset ({}) is too large for cropsize ({})".format(offset, cropsize)
        )
    right = roi_size - (width % roi_size) + left

    return left, right, roi_size
23
+
24
+
25
def inference(X_spec, device, model, aggressiveness, data):
    """Run ``model.predict`` over a spectrogram in fixed-width windows.

    Args:
        X_spec: Complex spectrogram as a 3-D NumPy array; the last axis is
            the one that gets padded and windowed.
        device: Torch device each window is moved to before prediction.
        model: Object providing ``offset``, ``eval()``, ``state_dict()`` and
            ``predict(window, aggressiveness)``.
        aggressiveness: Passed unchanged to ``model.predict``.
        data: Configuration dict; the keys ``"window_size"`` and ``"tta"``
            are read.

    Returns:
        A tuple ``(pred, X_mag, X_phase)``: the predicted magnitude rescaled
        by the input peak, the input magnitude, and ``exp(1j * angle)`` of
        the input.
    """

    def _predict_windows(X_mag_pad, roi_size, n_window, is_half):
        # Predict each window separately and stitch the results back
        # together along the last axis.
        model.eval()
        preds = []
        with torch.no_grad():
            for i in tqdm(range(n_window)):
                start = i * roi_size
                window = X_mag_pad[None, :, :, start : start + data["window_size"]]
                window = torch.from_numpy(window)
                if is_half:
                    window = window.half()
                window = window.to(device)

                pred = model.predict(window, aggressiveness)
                preds.append(pred.detach().cpu().numpy()[0])
        return np.concatenate(preds, axis=2)

    X_mag = np.abs(X_spec)
    X_phase = np.angle(X_spec)

    coef = X_mag.max()
    if coef == 0:
        # All-zero input: dividing by the peak would turn every value into
        # NaN, so skip the normalisation instead.
        coef = 1.0
    X_mag_pre = X_mag / coef

    n_frame = X_mag_pre.shape[2]
    pad_l, pad_r, roi_size = make_padding(n_frame, data["window_size"], model.offset)
    n_window = int(np.ceil(n_frame / roi_size))

    X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")

    # The input is fed as float16 when the model's first state-dict tensor
    # is float16.
    first_tensor = next(iter(model.state_dict().values()), None)
    is_half = first_tensor is not None and first_tensor.dtype == torch.float16

    pred = _predict_windows(X_mag_pad, roi_size, n_window, is_half)
    pred = pred[:, :, :n_frame]

    if not data["tta"]:
        return pred * coef, X_mag, np.exp(1.0j * X_phase)

    # Second pass with the windows shifted by half a stride; the two
    # predictions are averaged.
    pad_l += roi_size // 2
    pad_r += roi_size // 2
    n_window += 1

    X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")

    pred_tta = _predict_windows(X_mag_pad, roi_size, n_window, is_half)
    pred_tta = pred_tta[:, :, roi_size // 2 :]
    pred_tta = pred_tta[:, :, :n_frame]

    return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.0j * X_phase)
100
+
101
+
102
def _get_name_params(model_path, model_hash):
    """Look up the parameter set registered for a model.

    The table returned by ``load_data()`` is scanned for an entry whose
    ``hash_name`` equals ``model_hash`` or occurs as a substring of
    ``model_path``. A match inside the ``"equivalent"`` group is returned
    immediately; for any other group the scan continues and the last match
    wins.

    Args:
        model_path: Model path (or name) searched for known ``hash_name``
            substrings.
        model_hash: Hash string compared for equality with ``hash_name``.

    Returns:
        A ``(param_name, model_params)`` tuple taken from the matching entry.

    Raises:
        ValueError: If no entry matches. (Previously this surfaced as an
            ``UnboundLocalError`` on the final ``return``.)
    """
    data = load_data()
    found = False
    param_name_auto = None
    model_params_auto = None

    for group_name, groups in data.items():
        for entries in groups[0].values():
            for entry in entries:
                hash_name = str(entry["hash_name"])
                if hash_name != model_hash and hash_name not in model_path:
                    continue
                found = True
                model_params_auto = entry["model_params"]
                param_name_auto = entry["param_name"]
                if group_name == "equivalent":
                    return param_name_auto, model_params_auto

    if not found:
        raise ValueError(
            "No model parameters registered for model {!r} (hash {!r})".format(
                model_path, model_hash
            )
        )
    return param_name_auto, model_params_auto
uvr5/uvr_model/UVR-HP2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39796caa5db18d7f9382d8ac997ac967bfd85f7761014bb807d2543cc844ef05
3
+ size 63454827
uvr5/uvr_model/UVR-HP5.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5908891829634926119720241e8573d97cbeb8277110a7512bdb0bd7563258ee
3
+ size 63454827
uvr5/uvr_model/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+
uvr5/vr.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os,sys
2
+ parent_directory = os.path.dirname(os.path.abspath(__file__))
3
+ import logging,pdb
4
+ logger = logging.getLogger(__name__)
5
+
6
+ import librosa
7
+ import numpy as np
8
+ import soundfile as sf
9
+ import torch
10
+ from uvr5.lib.lib_v5 import nets_61968KB as Nets
11
+ from uvr5.lib.lib_v5 import spec_utils
12
+ from uvr5.lib.lib_v5.model_param_init import ModelParameters
13
+ from uvr5.lib.utils import inference
14
+
15
+
16
class AudioPre:
    """Split an audio file into two stems with a UVR5 ``CascadedASPPNet`` model.

    The constructor loads the ``4band_v2`` model parameters and the checkpoint
    at ``model_path``; :meth:`_path_audio_` then runs the model on one file
    and writes the resulting stems to disk.
    """

    def __init__(self, agg, model_path, device, is_half, tta=False):
        """Load the model parameters and the network weights.

        Args:
            agg: Aggressiveness setting; it is divided by 100 before being
                handed to ``inference`` and is embedded in output file names.
            model_path: Checkpoint path passed to ``torch.load``.
            device: Torch device the model is moved to.
            is_half: When true the model is converted with ``.half()``.
            tta: Stored as ``data["tta"]`` and read by ``inference``.
        """
        self.model_path = model_path
        self.device = device
        self.data = {
            # Processing Options
            "postprocess": False,
            "tta": tta,
            # Constants
            "window_size": 512,
            "agg": agg,
            "high_end_process": "mirroring",
        }
        mp = ModelParameters(
            "%s/lib/lib_v5/modelparams/4band_v2.json" % parent_directory
        )
        model = Nets.CascadedASPPNet(mp.param["bins"] * 2)
        # Weights are always loaded on CPU first and moved afterwards.
        cpk = torch.load(model_path, map_location="cpu")
        model.load_state_dict(cpk)
        model.eval()
        if is_half:
            model = model.half().to(device)
        else:
            model = model.to(device)

        self.mp = mp
        self.model = model

    def _spec_to_wave(self, spec, input_high_end_h, input_high_end):
        """Convert a combined spectrogram to a waveform, mirroring the high end if enabled."""
        if self.data["high_end_process"].startswith("mirroring"):
            mirrored = spec_utils.mirroring(
                self.data["high_end_process"], spec, input_high_end, self.mp
            )
            return spec_utils.cmb_spectrogram_to_wave(
                spec, self.mp, input_high_end_h, mirrored
            )
        return spec_utils.cmb_spectrogram_to_wave(spec, self.mp)

    def _save_wave(self, wave, root, head, name, fmt):
        """Write ``wave`` under ``root``; formats other than wav/flac go through ffmpeg."""
        import subprocess

        samples = np.array(wave).astype("float32")
        sample_rate = self.mp.param["sr"]

        if fmt in ["wav", "flac"]:
            sf.write(
                os.path.join(
                    root, head + "{}_{}.{}".format(name, self.data["agg"], fmt)
                ),
                samples,
                sample_rate,
            )
            return

        # Other formats: write a temporary wav, convert it with ffmpeg and
        # delete the wav once the converted file exists.
        path = os.path.join(root, head + "{}_{}.wav".format(name, self.data["agg"]))
        sf.write(path, samples, sample_rate)
        if not os.path.exists(path):
            return

        opt_format_path = path[:-4] + ".%s" % fmt
        # An argument list (no shell) keeps paths with spaces or shell
        # metacharacters intact. The argument order is the same as in the
        # former shell command.
        # NOTE(review): "-q:a 2" follows the output file, so ffmpeg may
        # ignore it as a trailing option - confirm the intended position.
        try:
            subprocess.run(
                ["ffmpeg", "-i", path, "-vn", opt_format_path, "-q:a", "2", "-y"],
                check=False,
            )
        except OSError as exc:
            # Conversion stays best-effort: keep the wav if ffmpeg cannot run.
            logger.warning("ffmpeg could not be started for %s: %s", path, exc)
            return

        if os.path.exists(opt_format_path):
            try:
                os.remove(path)
            except OSError as exc:
                logger.warning("could not remove temporary file %s: %s", path, exc)

    def _path_audio_(
        self, music_file, ins_root=None, vocal_root=None, format="flac", is_hp3=False
    ):
        """Separate ``music_file`` and write the requested stems.

        Args:
            music_file: Path of the audio file to process.
            ins_root: Directory receiving the stem built from the model
                prediction, or ``None`` to skip it.
            vocal_root: Directory receiving the residual stem (input minus
                prediction), or ``None`` to skip it.
            format: Output extension. ``"wav"`` and ``"flac"`` are written
                directly; anything else is converted from a temporary wav
                with ffmpeg.
            is_hp3: When true, the two roots and the ``instrument_`` /
                ``vocal_`` file-name prefixes are swapped.

        Returns:
            The string ``"No save root."`` when both roots are ``None``,
            otherwise ``None``.
        """
        if ins_root is None and vocal_root is None:
            return "No save root."
        name = os.path.basename(music_file)
        if ins_root is not None:
            os.makedirs(ins_root, exist_ok=True)
        if vocal_root is not None:
            os.makedirs(vocal_root, exist_ok=True)

        X_wave, X_spec_s = {}, {}
        # Only assigned when high-end processing is enabled; they are only
        # read by the "mirroring" branch of _spec_to_wave.
        input_high_end_h = None
        input_high_end = None

        bands_n = len(self.mp.param["band"])
        for d in range(bands_n, 0, -1):
            bp = self.mp.param["band"][d]
            if d == bands_n:  # high-end band
                # In theory librosa may mis-read some audio files and ffmpeg
                # should be used for loading instead, but that was considered
                # too much trouble and dropped.
                # sr/mono are passed by keyword: newer librosa releases no
                # longer accept them positionally.
                X_wave[d], _ = librosa.core.load(
                    music_file,
                    sr=bp["sr"],
                    mono=False,
                    dtype=np.float32,
                    res_type=bp["res_type"],
                )
                if X_wave[d].ndim == 1:
                    # Mono input: duplicate the channel.
                    X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
            else:  # lower bands
                X_wave[d] = librosa.core.resample(
                    X_wave[d + 1],
                    orig_sr=self.mp.param["band"][d + 1]["sr"],
                    target_sr=bp["sr"],
                    res_type=bp["res_type"],
                )
            # Stft of wave source
            X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(
                X_wave[d],
                bp["hl"],
                bp["n_fft"],
                self.mp.param["mid_side"],
                self.mp.param["mid_side_b2"],
                self.mp.param["reverse"],
            )
            if d == bands_n and self.data["high_end_process"] != "none":
                input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + (
                    self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"]
                )
                input_high_end = X_spec_s[d][
                    :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, :
                ]

        X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
        aggresive_set = float(self.data["agg"] / 100)
        aggressiveness = {
            "value": aggresive_set,
            "split_bin": self.mp.param["band"][1]["crop_stop"],
        }
        with torch.no_grad():
            pred, X_mag, X_phase = inference(
                X_spec_m, self.device, self.model, aggressiveness, self.data
            )
        # Postprocess
        if self.data["postprocess"]:
            pred_inv = np.clip(X_mag - pred, 0, np.inf)
            pred = spec_utils.mask_silence(pred, pred_inv)
        y_spec_m = pred * X_phase
        v_spec_m = X_spec_m - y_spec_m

        if is_hp3:
            ins_root, vocal_root = vocal_root, ins_root

        if ins_root is not None:
            wav_instrument = self._spec_to_wave(
                y_spec_m, input_high_end_h, input_high_end
            )
            logger.info("%s instruments done", name)
            head = "vocal_" if is_hp3 else "instrument_"
            self._save_wave(wav_instrument, ins_root, head, name, format)

        if vocal_root is not None:
            head = "instrument_" if is_hp3 else "vocal_"
            wav_vocals = self._spec_to_wave(
                v_spec_m, input_high_end_h, input_high_end
            )
            logger.info("%s vocals done", name)
            self._save_wave(wav_vocals, vocal_root, head, name, format)
+ pass