|
import hashlib |
|
import json |
|
import logging |
|
import os |
|
import time |
|
from pathlib import Path |
|
import io |
|
import librosa |
|
import maad |
|
import numpy as np |
|
from inference import slicer |
|
import parselmouth |
|
import soundfile |
|
import torch |
|
import torchaudio |
|
|
|
|
|
import utils |
|
from models import SynthesizerTrn |
|
# Keep the third-party 'numba' and 'matplotlib' loggers at WARNING so their
# lower-severity messages do not flood the application log.
for _quiet_logger_name in ('numba', 'matplotlib'):
    logging.getLogger(_quiet_logger_name).setLevel(logging.WARNING)
|
|
|
|
|
def resize2d_f0(x, target_len):
    """Resample a 1-D F0 track to ``target_len`` points by linear interpolation.

    Values below 0.001 are treated as "no pitch": they are replaced by NaN
    before interpolating and every NaN in the result is turned back into 0
    by ``np.nan_to_num``.

    Args:
        x: Sequence or array of F0 values. It is copied, never modified.
        target_len: Number of samples in the returned track.

    Returns:
        A float ``numpy`` array with ``target_len`` entries.
    """
    # Force a float copy: with an integer dtype the NaN assignment below
    # raises "cannot convert float NaN to integer".
    source = np.array(x, dtype=float)
    source[source < 0.001] = np.nan

    n_src = len(source)
    # Positions (in source-index units) at which the output is sampled.
    query = np.arange(0, n_src * target_len, n_src) / target_len
    target = np.interp(query, np.arange(0, n_src), source)
    return np.nan_to_num(target)
|
|
|
|
|
def get_f0(x, p_len, f0_up_key=0):
    """Extract an F0 track with Praat and quantise it onto a 1..255 mel scale.

    Args:
        x: Waveform samples handed to ``parselmouth.Sound`` with a sampling
            frequency of 16000.
        p_len: Frame count the F0 track is zero-padded up to. A track that is
            already at least this long is left untouched (not truncated).
        f0_up_key: Transposition in semitones applied to the F0 values.

    Returns:
        Tuple ``(f0_coarse, f0)``: ``f0_coarse`` is an integer array with
        values in ``[1, 255]`` and ``f0`` is the padded, transposed F0 track.
    """
    time_step = 160 / 16000 * 1000  # analysis step in milliseconds
    f0_min = 50
    f0_max = 1100
    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
    f0_mel_max = 1127 * np.log(1 + f0_max / 700)

    f0 = parselmouth.Sound(x, 16000).to_pitch_ac(
        time_step=time_step / 1000, voicing_threshold=0.6,
        pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']

    # Zero-pad both ends so the track has p_len frames; the left side gets
    # the extra frame when the shortfall is odd.
    pad_size = (p_len - len(f0) + 1) // 2
    if pad_size > 0 or p_len - len(f0) - pad_size > 0:
        f0 = np.pad(
            f0, [[pad_size, p_len - len(f0) - pad_size]], mode='constant')

    f0 *= pow(2, f0_up_key / 12)
    f0_mel = 1127 * np.log(1 + f0 / 700)
    # Only frames with a positive mel value are rescaled; zero-valued frames
    # stay at 0 here and are lifted to bin 1 by the clamp below.
    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * \
        254 / (f0_mel_max - f0_mel_min) + 1
    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > 255] = 255
    # ``np.int`` was only an alias of the builtin ``int`` and has been removed
    # from NumPy (1.24+); the builtin gives the identical dtype.
    f0_coarse = np.rint(f0_mel).astype(int)
    return f0_coarse, f0
|
|
|
|
|
def clean_pitch(input_pitch):
    """Flatten a pitch track to all ones when it is almost entirely ones.

    If more than 90% of the entries equal 1, every remaining entry is
    overwritten with 1 as well. The array is modified in place and the same
    object is returned.
    """
    is_one = input_pitch == 1
    share_of_ones = np.sum(is_one) / len(input_pitch)
    if share_of_ones > 0.9:
        input_pitch[~is_one] = 1
    return input_pitch
|
|
|
|
|
def plt_pitch(input_pitch):
    """Return a float copy of ``input_pitch`` with every 1 replaced by NaN.

    The input array itself is left unchanged because ``astype`` produces a
    new array.
    """
    as_float = input_pitch.astype(float)
    ones_mask = np.equal(as_float, 1)
    as_float[ones_mask] = np.nan
    return as_float
|
|
|
|
|
def f0_to_pitch(ff):
    """Map a frequency (scalar or array) to ``69 + 12 * log2(ff / 440)``.

    With this formula 440 maps to 69 and each doubling of ``ff`` adds 12.
    """
    return 69 + 12 * np.log2(ff / 440)
|
|
|
|
|
def fill_a_to_b(a, b):
    """Grow list ``a`` in place to the length of ``b`` by repeating ``a[0]``.

    Nothing happens when ``a`` is already at least as long as ``b``.
    The function returns ``None``.
    """
    shortfall = len(b) - len(a)
    if shortfall <= 0:
        return
    filler = a[0]
    added = 0
    while added < shortfall:
        a.append(filler)
        added += 1
|
|
|
|
|
def mkdir(paths: list):
    """Create each directory in ``paths`` unless it already exists.

    Only the final path component is created (``os.mkdir``), so a missing
    parent directory still raises ``FileNotFoundError``.

    Args:
        paths: Directory paths to create.
    """
    for path in paths:
        # Attempt the creation directly instead of checking first: a separate
        # existence check can race with another process creating the path.
        try:
            os.mkdir(path)
        except FileExistsError:
            pass
|
|
|
|
|
class VitsSvc(object):
    """Voice-conversion front end combining a HuBERT unit encoder with a
    ``SynthesizerTrn`` model loaded from a checkpoint directory."""

    def __init__(self):
        """Pick CUDA when available and load the HuBERT model.

        The synthesizer, hyper-parameters and speaker table stay ``None``
        until ``loadCheckpoint`` is called.
        """
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.SVCVITS = None
        self.hps = None
        self.speakers = None
        self.hubert_soft = utils.get_hubert_model()

    def set_device(self, device):
        """Move the HuBERT model (and the synthesizer, if loaded) to ``device``."""
        self.device = torch.device(device)
        self.hubert_soft.to(self.device)
        if self.SVCVITS is not None:
            self.SVCVITS.to(self.device)

    def loadCheckpoint(self, path):
        """Load ``checkpoints/<path>/config.json`` and ``model.pth``.

        Builds the synthesizer from the config, loads its weights, switches
        it to eval mode on the current device and stores ``hps.spk`` as the
        speaker table.
        """
        self.hps = utils.get_hparams_from_file(
            f"checkpoints/{path}/config.json")
        self.SVCVITS = SynthesizerTrn(
            self.hps.data.filter_length // 2 + 1,
            self.hps.train.segment_size // self.hps.data.hop_length,
            **self.hps.model)
        utils.load_checkpoint(
            f"checkpoints/{path}/model.pth", self.SVCVITS, None)
        self.SVCVITS.eval().to(self.device)
        self.speakers = self.hps.spk

    def get_units(self, source, sr):
        """Return the HuBERT units for ``source``.

        A leading dimension is added to ``source`` before it is passed to
        ``hubert_soft.units``. ``sr`` is accepted for interface compatibility
        but is not used.
        """
        source = source.unsqueeze(0).to(self.device)
        with torch.inference_mode():
            units = self.hubert_soft.units(source)
            return units

    def get_unit_pitch(self, in_path, tran):
        """Load audio, resample it to 16 kHz and return ``(units, f0)``.

        Args:
            in_path: Anything ``torchaudio.load`` accepts.
            tran: Transposition in semitones, forwarded to ``get_f0``.

        Returns:
            Tuple of the HuBERT units as a numpy array and the F0 track, whose
            requested length is twice the number of unit frames.
        """
        source, sr = torchaudio.load(in_path)
        source = torchaudio.functional.resample(source, sr, 16000)
        # torchaudio returns (channels, samples); down-mix when there is more
        # than one channel. The channel axis is dim 0, not dim 1.
        if source.dim() == 2 and source.shape[0] > 1:
            source = torch.mean(source, dim=0).unsqueeze(0)
        soft = self.get_units(source, sr).squeeze(0).cpu().numpy()
        _, f0 = get_f0(source.cpu().numpy()[0], soft.shape[0] * 2, tran)
        return soft, f0

    def infer(self, speaker_id, tran, raw_path):
        """Convert one audio clip to the selected speaker.

        Args:
            speaker_id: Key into ``self.speakers``.
            tran: Transposition in semitones.
            raw_path: Audio source forwarded to ``get_unit_pitch``.

        Returns:
            Tuple ``(audio, audio.shape[-1])``.
        """
        speaker_id = self.speakers[speaker_id]
        sid = torch.LongTensor([int(speaker_id)]).to(self.device).unsqueeze(0)
        soft, pitch = self.get_unit_pitch(raw_path, tran)
        f0 = torch.FloatTensor(clean_pitch(pitch)).unsqueeze(0).to(self.device)
        stn_tst = torch.FloatTensor(soft)
        with torch.no_grad():
            x_tst = stn_tst.unsqueeze(0).to(self.device)
            # Each unit frame is repeated twice so the unit sequence has the
            # same frame count that was requested for the F0 track.
            x_tst = torch.repeat_interleave(
                x_tst, repeats=2, dim=1).transpose(1, 2)
            # NOTE(review): the indexed result is unpacked into two values;
            # confirm against SynthesizerTrn.infer that ``[0, 0]`` really
            # yields a pair here.
            audio, _ = self.SVCVITS.infer(x_tst, f0=f0, g=sid)[
                0, 0].data.float()
        return audio, audio.shape[-1]

    def inference(self, srcaudio, chara, tran, slice_db):
        """Run sliced voice conversion on an in-memory recording.

        Args:
            srcaudio: Tuple ``(sampling_rate, audio)`` where ``audio`` is a
                numpy array, either integer PCM or floating point.
            chara: Speaker key, forwarded to ``infer``.
            tran: Transposition in semitones.
            slice_db: Threshold passed to ``slicer.cut`` as ``db_thresh``.

        Returns:
            Tuple ``(self.hps.data.sampling_rate, int16 numpy array)``.
        """
        sampling_rate, audio = srcaudio
        if np.issubdtype(audio.dtype, np.integer):
            # Integer PCM: scale by the dtype's maximum value.
            audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
        else:
            # np.iinfo rejects float dtypes; float input is used unscaled.
            audio = audio.astype(np.float32)
        if len(audio.shape) > 1:
            audio = librosa.to_mono(audio.transpose(1, 0))
        if sampling_rate != 16000:
            audio = librosa.resample(
                audio, orig_sr=sampling_rate, target_sr=16000)
        # The slicer works on a file path, so the clip goes through a
        # scratch file in the current working directory.
        soundfile.write("tmpwav.wav", audio, 16000, format="wav")
        chunks = slicer.cut("tmpwav.wav", db_thresh=slice_db)
        audio_data, audio_sr = slicer.chunks2audio("tmpwav.wav", chunks)

        pieces = []
        for slice_tag, data in audio_data:
            if slice_tag:
                # Tagged slices are not converted; emit zeros covering the
                # same duration at the model's sampling rate.
                length = int(np.ceil(len(data) / audio_sr *
                                     self.hps.data.sampling_rate))
                pieces.append(np.zeros(length))
            else:
                raw_path = io.BytesIO()
                soundfile.write(raw_path, data, audio_sr, format="wav")
                raw_path.seek(0)
                out_audio, _ = self.infer(chara, tran, raw_path)
                pieces.append(out_audio.cpu().numpy())

        merged = np.concatenate(pieces) if pieces else np.zeros(0)
        # Clip before the cast: a sample of 1.0 scales to 32768, which does
        # not fit into int16 and would otherwise wrap around.
        audio = np.clip(merged * 32768.0, -32768, 32767).astype('int16')
        return (self.hps.data.sampling_rate, audio)
|
|