"""HTTP text-to-speech endpoint backed by an ONNX Bert-VITS2 session.

The module loads the ONNX model once at import time and exposes a single
Flask route (``/``) that synthesises speech for a given speaker and text and
returns the result as WAV bytes.
"""
import io
import os
import tempfile
import uuid
from threading import Thread

import gradio as gr
import numpy as np
import torch
from flask import Flask, request, jsonify, render_template_string
from flask_cors import CORS
from scipy.io.wavfile import write

from onnx_modules.V230_OnnxInference import OnnxInferenceSession
from text import cleaned_text_to_sequence, get_bert
from text.cleaner import clean_text
import utils
import commons

hps = utils.get_hparams_from_file('onnx/BangDreamApi.json')
device = 'cpu'

# Sample rate (Hz) used for every WAV this module writes.
SAMPLE_RATE = 44100

# Band / school name -> member names.
BandList = {
    "PoppinParty": ["香澄", "有咲", "たえ", "りみ", "沙綾"],
    "Afterglow": ["蘭", "モカ", "ひまり", "巴", "つぐみ"],
    "HelloHappyWorld": ["こころ", "美咲", "薫", "花音", "はぐみ"],
    "PastelPalettes": ["彩", "日菜", "千聖", "イヴ", "麻弥"],
    "Roselia": ["友希那", "紗夜", "リサ", "燐子", "あこ"],
    "RaiseASuilen": ["レイヤ", "ロック", "ますき", "チュチュ", "パレオ"],
    "Morfonica": ["ましろ", "瑠唯", "つくし", "七深", "透子"],
    "MyGo": ["燈", "愛音", "そよ", "立希", "楽奈"],
    "AveMujica": ["祥子", "睦", "海鈴", "にゃむ", "初華"],
    "圣翔音乐学园": ["華戀", "光", "香子", "雙葉", "真晝", "純那", "克洛迪娜", "真矢", "奈奈"],
    "凛明馆女子学校": ["珠緒", "壘", "文", "悠悠子", "一愛"],
    "弗隆提亚艺术学校": ["艾露", "艾露露", "菈樂菲", "司", "靜羽"],
    "西克菲尔特音乐学院": ["晶", "未知留", "八千代", "栞", "美帆"],
}

Session = OnnxInferenceSession(
    {
        "enc": "onnx/BangDreamApi/BangDreamApi_enc_p.onnx",
        "emb_g": "onnx/BangDreamApi/BangDreamApi_emb.onnx",
        "dp": "onnx/BangDreamApi/BangDreamApi_dp.onnx",
        "sdp": "onnx/BangDreamApi/BangDreamApi_sdp.onnx",
        "flow": "onnx/BangDreamApi/BangDreamApi_flow.onnx",
        "dec": "onnx/BangDreamApi/BangDreamApi_dec.onnx",
    },
    Providers=["CPUExecutionProvider"],
)

# Text of the most recent synthesised request; used by the ``is_chat``
# de-duplication in ``tts``.  Defined at module level so the route also works
# when the module is imported by a WSGI server instead of run as a script.
last_text = ""


def get_text(text, language_str, hps, device, style_text=None, style_weight=0.7):
    """Turn raw text into the model's phone/tone/language/BERT inputs.

    Args:
        text: Input text to synthesise.
        language_str: One of ``"ZH"``, ``"JP"`` or ``"EN"``.
        hps: Hyper-parameters object. Accepted for interface compatibility;
            not read by this function.
        device: Device string forwarded to ``get_bert``.
        style_text: Optional style prompt; an empty string is treated as
            ``None``.
        style_weight: Weight forwarded to ``get_bert`` together with
            ``style_text``.

    Returns:
        Tuple ``(bert, ja_bert, en_bert, phone, tone, language)``. The BERT
        feature for ``language_str`` comes from ``get_bert``; the other two
        are random ``(1024, len(phone))`` placeholders. ``phone``, ``tone``
        and ``language`` are ``torch.LongTensor``s.

    Raises:
        ValueError: If ``language_str`` is not ``ZH``, ``JP`` or ``EN``.
    """
    if style_text == "":
        style_text = None

    norm_text, phone, tone, word2ph = clean_text(text, language_str)
    phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)

    # Blank (0) tokens are always interspersed into the three sequences.
    phone = commons.intersperse(phone, 0)
    tone = commons.intersperse(tone, 0)
    language = commons.intersperse(language, 0)
    # Keep the word->phone counts in step with the interspersed sequence:
    # double every count, then add one to the first entry.
    word2ph = [count * 2 for count in word2ph]
    word2ph[0] += 1

    bert_ori = get_bert(
        norm_text, word2ph, language_str, device, style_text, style_weight
    )
    del word2ph
    assert bert_ori.shape[-1] == len(phone), phone

    n_phones = len(phone)
    if language_str == "ZH":
        bert = bert_ori
        ja_bert = torch.randn(1024, n_phones)
        en_bert = torch.randn(1024, n_phones)
    elif language_str == "JP":
        bert = torch.randn(1024, n_phones)
        ja_bert = bert_ori
        en_bert = torch.randn(1024, n_phones)
    elif language_str == "EN":
        bert = torch.randn(1024, n_phones)
        ja_bert = torch.randn(1024, n_phones)
        en_bert = bert_ori
    else:
        raise ValueError("language_str should be ZH, JP or EN")

    assert bert.shape[-1] == len(
        phone
    ), f"Bert seq len {bert.shape[-1]} != {len(phone)}"

    phone = torch.LongTensor(phone)
    tone = torch.LongTensor(tone)
    language = torch.LongTensor(language)
    return bert, ja_bert, en_bert, phone, tone, language


def infer(
    text,
    sid,
    style_text=None,
    style_weight=0.7,
    sdp_ratio=0.5,
    noise_scale=0.6,
    noise_scale_w=0.667,
    length_scale=1,
    unique_filename='temp.wav',
):
    """Synthesise ``text`` with speaker ``sid`` and write it to a WAV file.

    The language is chosen automatically: ``JP`` when ``is_japanese(text)``
    is true, otherwise ``ZH``.

    Args:
        text: Text to synthesise.
        sid: Speaker name; must be a key of ``hps.spk2id``.
        style_text: Optional style prompt forwarded to ``get_text``.
        style_weight: Style weight forwarded to ``get_text``.
        sdp_ratio: Passed to the ONNX session as ``sdp_ratio``.
        noise_scale: Passed to the ONNX session as ``sdp_noise_scale``.
        noise_scale_w: Passed to the ONNX session as ``seq_noise_scale``.
        length_scale: Passed to the ONNX session as ``length_scale``.
        unique_filename: Path the WAV file is written to.

    Returns:
        Tuple ``(44100, audio)`` where ``audio`` is the session output
        converted with ``gr.processing_utils.convert_to_16_bit_wav``.
    """
    language = 'JP' if is_japanese(text) else 'ZH'
    bert, ja_bert, en_bert, phones, tone, language = get_text(
        text,
        language,
        hps,
        device,
        style_text=style_text,
        style_weight=style_weight,
    )
    with torch.no_grad():
        x_tst = phones.unsqueeze(0).to(device).numpy()
        # NOTE(review): the tone and language sequences computed by get_text
        # are discarded here and replaced by all-zero arrays. This looks
        # unintended, but is kept as-is because the model's behaviour with
        # real values cannot be verified from this file -- confirm.
        language = np.zeros_like(x_tst)
        tone = np.zeros_like(x_tst)
        bert = bert.to(device).transpose(0, 1).numpy()
        ja_bert = ja_bert.to(device).transpose(0, 1).numpy()
        en_bert = en_bert.to(device).transpose(0, 1).numpy()
        del phones
        sid = np.array([hps.spk2id[sid]])
        audio = Session(
            x_tst,
            tone,
            language,
            bert,
            ja_bert,
            en_bert,
            sid,
            seed=114514,
            seq_noise_scale=noise_scale_w,
            sdp_noise_scale=noise_scale,
            length_scale=length_scale,
            sdp_ratio=sdp_ratio,
        )
        del x_tst, tone, language, bert, ja_bert, en_bert, sid
        write(unique_filename, SAMPLE_RATE, audio)
        return (SAMPLE_RATE, gr.processing_utils.convert_to_16_bit_wav(audio))


def is_japanese(string):
    """Return True if ``string`` has a code point strictly between U+3040 and U+30FF."""
    return any(0x3040 < ord(ch) < 0x30FF for ch in string)


Flaskapp = Flask(__name__)
CORS(Flaskapp)


@Flaskapp.route('/')
def tts():
    """Synthesise speech from query parameters and return it as WAV bytes.

    Query parameters:
        speaker: Speaker name (required; must be a key of ``hps.spk2id``).
        text: Text to synthesise (required).
        sdp_ratio, noise_scale, noise_scale_w, length_scale, style_weight:
            Optional floats forwarded to ``infer``.
        style_text: Optional style prompt (default ``'happy'``).
        is_chat: ``'true'`` to answer a repeat of the previous text with one
            second of silence instead of re-synthesising it.

    Returns:
        The documentation page when ``speaker`` or ``text`` is missing, a
        400 JSON error for an invalid numeric parameter or an unknown
        speaker, otherwise ``(wav_bytes, 200, headers)``.
    """
    global last_text

    args = request.args
    speaker = args.get('speaker')
    try:
        sdp_ratio = float(args.get('sdp_ratio', 0.2))
        noise_scale = float(args.get('noise_scale', 0.6))
        noise_scale_w = float(args.get('noise_scale_w', 0.8))
        length_scale = float(args.get('length_scale', 1))
        style_weight = float(args.get('style_weight', 0.7))
    except ValueError:
        # A malformed number is a client error, not a server failure.
        return jsonify(error="numeric query parameters must be valid floats"), 400
    style_text = args.get('style_text', 'happy')
    text = args.get('text')
    is_chat = args.get('is_chat', 'false').lower() == 'true'

    if not speaker or not text:
        return render_template_string(""" TTS API Documentation """)

    if speaker not in hps.spk2id.keys():
        return jsonify(error=f"unknown speaker: {speaker}"), 400

    if is_chat and text == last_text:
        # Repeated chat message: answer with one second of silence, built in
        # memory so that no file on disk is shared between requests.
        label = 'blank.wav'
        buffer = io.BytesIO()
        write(buffer, SAMPLE_RATE, np.zeros(SAMPLE_RATE, dtype=np.int16))
        wav_bytes = buffer.getvalue()
    else:
        last_text = text
        label = 'temp.wav'
        # A per-request file name keeps concurrent requests from overwriting
        # or deleting each other's output.
        wav_path = os.path.join(tempfile.gettempdir(), f"{uuid.uuid4().hex}.wav")
        try:
            infer(
                text,
                sdp_ratio=sdp_ratio,
                noise_scale=noise_scale,
                noise_scale_w=noise_scale_w,
                length_scale=length_scale,
                sid=speaker,
                style_text=style_text,
                style_weight=style_weight,
                unique_filename=wav_path,
            )
            with open(wav_path, 'rb') as wav_file:
                wav_bytes = wav_file.read()
        finally:
            # Remove the file even when inference or reading fails.
            if os.path.exists(wav_path):
                os.remove(wav_path)

    headers = {
        'Content-Type': 'audio/wav',
        'Text': label,
    }
    return wav_bytes, 200, headers


if __name__ == "__main__":
    speaker_ids = hps.spk2id
    speakers = list(speaker_ids.keys())
    last_text = ""
    # The interactive debugger allows arbitrary code execution, so it must
    # not be on by default while listening on all interfaces. Opt in with
    # FLASK_DEBUG=1 for local development.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").lower() in ("1", "true")
    Flaskapp.run(host="0.0.0.0", port=5000, debug=debug_enabled)