"""Synthesize speech with the exported BangDreamApi ONNX graphs.

Importing this module loads the hyper-parameters from
``onnx/BangDreamApi.json`` and builds one ``OnnxInferenceSession`` on the
CPU execution provider.  ``infer`` turns a text string into a WAV file.
"""
from onnx_modules.V230_OnnxInference import OnnxInferenceSession
import numpy as np
import torch
from scipy.io.wavfile import write
from text import cleaned_text_to_sequence, get_bert
from text.cleaner import clean_text
import utils
import commons

hps = utils.get_hparams_from_file('onnx/BangDreamApi.json')
device = 'cpu'

Session = OnnxInferenceSession(
    {
        "enc": "onnx/BangDreamApi/BangDreamApi_enc_p.onnx",
        "emb_g": "onnx/BangDreamApi/BangDreamApi_emb.onnx",
        "dp": "onnx/BangDreamApi/BangDreamApi_dp.onnx",
        "sdp": "onnx/BangDreamApi/BangDreamApi_sdp.onnx",
        "flow": "onnx/BangDreamApi/BangDreamApi_flow.onnx",
        "dec": "onnx/BangDreamApi/BangDreamApi_dec.onnx",
    },
    Providers=["CPUExecutionProvider"],
)


def get_text(text, language_str, hps, device, style_text=None, style_weight=0.7):
    """Convert raw text into the tensors consumed by the model.

    Args:
        text: Input sentence.
        language_str: One of ``"ZH"``, ``"JP"`` or ``"EN"``.
        hps: Hyper-parameters object.  Accepted for interface compatibility;
            it is not read by this function.
        device: Device identifier forwarded to ``get_bert``.
        style_text: Optional style reference text forwarded to ``get_bert``.
            An empty string is treated as ``None``.
        style_weight: Style weight forwarded to ``get_bert``.

    Returns:
        A tuple ``(bert, ja_bert, en_bert, phone, tone, language)``.  The
        BERT slot matching ``language_str`` holds the output of ``get_bert``;
        the other two slots hold ``torch.randn(1024, len(phone))``
        placeholders.  ``phone``, ``tone`` and ``language`` are
        ``torch.LongTensor`` instances.

    Raises:
        ValueError: If ``language_str`` is not ``"ZH"``, ``"JP"`` or ``"EN"``.
        AssertionError: If the BERT feature length does not match the
            phoneme sequence length.
    """
    if style_text == "":
        style_text = None

    norm_text, phone, tone, word2ph = clean_text(text, language_str)
    phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)

    # Interleave every sequence with 0 via commons.intersperse.
    phone = commons.intersperse(phone, 0)
    tone = commons.intersperse(tone, 0)
    language = commons.intersperse(language, 0)

    # Keep the word-to-phoneme counts in step with the interspersed
    # sequences: every count doubles and the first one gains one extra slot
    # (presumably for the leading 0 added by intersperse -- TODO confirm).
    word2ph = [count * 2 for count in word2ph]
    word2ph[0] += 1

    bert_ori = get_bert(
        norm_text, word2ph, language_str, device, style_text, style_weight
    )
    assert bert_ori.shape[-1] == len(phone), phone

    def placeholder():
        # Random features for the languages that are not being spoken.
        return torch.randn(1024, len(phone))

    # Tuple elements are evaluated left to right, so the order in which the
    # random placeholders are drawn is the same for each language as before.
    if language_str == "ZH":
        bert, ja_bert, en_bert = bert_ori, placeholder(), placeholder()
    elif language_str == "JP":
        bert, ja_bert, en_bert = placeholder(), bert_ori, placeholder()
    elif language_str == "EN":
        bert, ja_bert, en_bert = placeholder(), placeholder(), bert_ori
    else:
        raise ValueError("language_str should be ZH, JP or EN")

    assert bert.shape[-1] == len(
        phone
    ), f"Bert seq len {bert.shape[-1]} != {len(phone)}"

    phone = torch.LongTensor(phone)
    tone = torch.LongTensor(tone)
    language = torch.LongTensor(language)
    return bert, ja_bert, en_bert, phone, tone, language


def infer(
    text,
    sid,
    style_text=None,
    style_weight=0.7,
    sdp_ratio=0.5,
    noise_scale=0.6,
    noise_scale_w=0.667,
    length_scale=1,
    output_path='temp.wav',
    sampling_rate=44100,
):
    """Synthesize ``text`` with speaker ``sid`` and write it to a WAV file.

    The language is chosen automatically: ``'JP'`` when ``is_japanese(text)``
    is true, otherwise ``'ZH'``.

    Args:
        text: Sentence to synthesize.
        sid: Speaker name; looked up in ``hps.spk2id``.
        style_text: Optional style reference text, forwarded to ``get_text``.
        style_weight: Style weight, forwarded to ``get_text``.
        sdp_ratio: Forwarded to the session as ``sdp_ratio``.
        noise_scale: Forwarded to the session as ``sdp_noise_scale``.
        noise_scale_w: Forwarded to the session as ``seq_noise_scale``.
        length_scale: Forwarded to the session as ``length_scale``.
        output_path: Destination WAV file (defaults to ``'temp.wav'``).
        sampling_rate: Sample rate written to the WAV header (defaults to
            ``44100``).

    Raises:
        KeyError: If ``sid`` is not a key of ``hps.spk2id``.
    """
    language_str = 'JP' if is_japanese(text) else 'ZH'
    bert, ja_bert, en_bert, phones, _tone, _language = get_text(
        text,
        language_str,
        hps,
        device,
        style_text=style_text,
        style_weight=style_weight,
    )

    with torch.no_grad():
        x_tst = phones.unsqueeze(0).to(device).numpy()
        # NOTE(review): the tone and language ids computed by get_text are
        # discarded and all-zero arrays are fed to the model instead.  This
        # is kept as-is; confirm that it is intentional.
        language_ids = np.zeros_like(x_tst)
        tone_ids = np.zeros_like(x_tst)
        bert = bert.to(device).transpose(0, 1).numpy()
        ja_bert = ja_bert.to(device).transpose(0, 1).numpy()
        en_bert = en_bert.to(device).transpose(0, 1).numpy()
        speaker_id = np.array([hps.spk2id[sid]])

        # NOTE(review): noise_scale_w is passed as seq_noise_scale and
        # noise_scale as sdp_noise_scale; confirm this pairing against the
        # parameter meanings of OnnxInferenceSession.
        audio = Session(
            x_tst,
            tone_ids,
            language_ids,
            bert,
            ja_bert,
            en_bert,
            speaker_id,
            seed=114514,
            seq_noise_scale=noise_scale_w,
            sdp_noise_scale=noise_scale,
            length_scale=length_scale,
            sdp_ratio=sdp_ratio,
        )

    write(output_path, sampling_rate, audio)


def is_japanese(string):
    """Return True if ``string`` contains at least one kana character.

    The range checked spans the Unicode Hiragana (U+3040-U+309F) and
    Katakana (U+30A0-U+30FF) blocks, both ends inclusive.  Text written only
    in kanji is not detected, so ``infer`` treats it as Chinese.
    """
    return any(0x3040 <= ord(ch) <= 0x30FF for ch in string)


if __name__ == "__main__":
    infer("你好,我是说的道理", "パレオ")