Spaces:
Runtime error
Runtime error
File size: 5,064 Bytes
d59aeff |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 |
from web.api import api_blueprint
from pathlib import Path
from gevent import pywsgi as wsgi
from flask import Flask, Response, request, render_template
from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder.hifigan import inference as gan_vocoder
from vocoder.wavernn import inference as rnn_vocoder
import numpy as np
import re
from scipy.io.wavfile import write
import librosa
import io
import base64
from flask_cors import CORS
from flask_wtf import CSRFProtect
import webbrowser
def webApp():
# Init and load config
app = Flask(__name__, instance_relative_config=True)
app.config.from_object("web.config.default")
app.config['RESTPLUS_MASK_SWAGGER'] = False
app.register_blueprint(api_blueprint)
# CORS(app) #允许跨域,注释掉此行则禁止跨域请求
csrf = CSRFProtect(app)
csrf.init_app(app)
syn_models_dirt = "synthesizer/saved_models"
synthesizers = list(Path(syn_models_dirt).glob("**/*.pt"))
synthesizers_cache = {}
encoder.load_model(Path("encoder/saved_models/pretrained.pt"))
rnn_vocoder.load_model(Path("vocoder/saved_models/pretrained/pretrained.pt"))
gan_vocoder.load_model(Path("vocoder/saved_models/pretrained/g_hifigan.pt"))
def pcm2float(sig, dtype='float32'):
"""Convert PCM signal to floating point with a range from -1 to 1.
Use dtype='float32' for single precision.
Parameters
----------
sig : array_like
Input array, must have integral type.
dtype : data type, optional
Desired (floating point) data type.
Returns
-------
numpy.ndarray
Normalized floating point data.
See Also
--------
float2pcm, dtype
"""
sig = np.asarray(sig)
if sig.dtype.kind not in 'iu':
raise TypeError("'sig' must be an array of integers")
dtype = np.dtype(dtype)
if dtype.kind != 'f':
raise TypeError("'dtype' must be a floating point type")
i = np.iinfo(sig.dtype)
abs_max = 2 ** (i.bits - 1)
offset = i.min + abs_max
return (sig.astype(dtype) - offset) / abs_max
# Cache for synthesizer
@csrf.exempt
@app.route("/api/synthesize", methods=["POST"])
def synthesize():
# TODO Implementation with json to support more platform
# Load synthesizer
if "synt_path" in request.form:
synt_path = request.form["synt_path"]
else:
synt_path = synthesizers[0]
print("NO synthsizer is specified, try default first one.")
if synthesizers_cache.get(synt_path) is None:
current_synt = Synthesizer(Path(synt_path))
synthesizers_cache[synt_path] = current_synt
else:
current_synt = synthesizers_cache[synt_path]
print("using synthesizer model: " + str(synt_path))
# Load input wav
if "upfile_b64" in request.form:
wav_base64 = request.form["upfile_b64"]
wav = base64.b64decode(bytes(wav_base64, 'utf-8'))
wav = pcm2float(np.frombuffer(wav, dtype=np.int16), dtype=np.float32)
sample_rate = Synthesizer.sample_rate
else:
wav, sample_rate, = librosa.load(request.files['file'])
write("temp.wav", sample_rate, wav) #Make sure we get the correct wav
encoder_wav = encoder.preprocess_wav(wav, sample_rate)
embed, _, _ = encoder.embed_utterance(encoder_wav, return_partials=True)
# Load input text
texts = filter(None, request.form["text"].split("\n"))
punctuation = '!,。、,' # punctuate and split/clean text
processed_texts = []
for text in texts:
for processed_text in re.sub(r'[{}]+'.format(punctuation), '\n', text).split('\n'):
if processed_text:
processed_texts.append(processed_text.strip())
texts = processed_texts
# synthesize and vocode
embeds = [embed] * len(texts)
specs = current_synt.synthesize_spectrograms(texts, embeds)
spec = np.concatenate(specs, axis=1)
sample_rate = Synthesizer.sample_rate
if "vocoder" in request.form and request.form["vocoder"] == "WaveRNN":
wav, sample_rate = rnn_vocoder.infer_waveform(spec)
else:
wav, sample_rate = gan_vocoder.infer_waveform(spec)
# Return cooked wav
out = io.BytesIO()
write(out, sample_rate, wav.astype(np.float32))
return Response(out, mimetype="audio/wav")
@app.route('/', methods=['GET'])
def index():
return render_template("index.html")
host = app.config.get("HOST")
port = app.config.get("PORT")
web_address = 'http://{}:{}'.format(host, port)
print(f"Web server:" + web_address)
webbrowser.open(web_address)
server = wsgi.WSGIServer((host, port), app)
server.serve_forever()
return app
if __name__ == "__main__":
webApp()
|