File size: 5,064 Bytes
d59aeff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
from web.api import api_blueprint
from pathlib import Path
from gevent import pywsgi as wsgi
from flask import Flask, Response, request, render_template
from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder.hifigan import inference as gan_vocoder
from vocoder.wavernn import inference as rnn_vocoder
import numpy as np
import re
from scipy.io.wavfile import write
import librosa
import io
import base64
from flask_cors import CORS
from flask_wtf import CSRFProtect
import webbrowser

def _pcm2float(sig, dtype='float32'):
    """Convert an integer PCM signal to floating point in the range [-1, 1).

    Parameters
    ----------
    sig : array_like
        Input samples; must have a signed or unsigned integer dtype.
    dtype : data type, optional
        Desired floating point dtype of the result (default ``'float32'``).

    Returns
    -------
    numpy.ndarray
        The samples scaled by half the integer range, so that the most
        negative integer maps to -1.0. Unsigned input is re-centred first.

    Raises
    ------
    TypeError
        If ``sig`` is not an integer array or ``dtype`` is not a float type.
    """
    sig = np.asarray(sig)
    if sig.dtype.kind not in 'iu':
        raise TypeError("'sig' must be an array of integers")
    dtype = np.dtype(dtype)
    if dtype.kind != 'f':
        raise TypeError("'dtype' must be a floating point type")

    info = np.iinfo(sig.dtype)
    abs_max = 2 ** (info.bits - 1)
    # offset is 0 for signed types and the mid-point for unsigned types.
    offset = info.min + abs_max
    return (sig.astype(dtype) - offset) / abs_max


def _split_texts(raw_text):
    """Split raw input text into stripped, non-empty fragments.

    The text is cut at newlines and at every run of the punctuation marks
    below. Each fragment is stripped *before* the emptiness check so that
    whitespace-only fragments are dropped instead of being returned as "".
    """
    punctuation = '!,。、,'
    splitter = re.compile(r'[{}\n]+'.format(punctuation))
    fragments = (fragment.strip() for fragment in splitter.split(raw_text))
    return [fragment for fragment in fragments if fragment]


def webApp():
    """Create the Flask application, load the models and serve it.

    Loads configuration from ``web.config.default``, registers the API
    blueprint, loads the encoder and both vocoders, defines the
    ``/api/synthesize`` and ``/`` routes, opens the site in a web browser and
    then runs a gevent WSGI server on the configured ``HOST``/``PORT``.

    Returns
    -------
    flask.Flask
        The application object. It is returned only once
        ``server.serve_forever()`` itself returns.
    """
    # Init and load config
    app = Flask(__name__, instance_relative_config=True)
    app.config.from_object("web.config.default")
    app.config['RESTPLUS_MASK_SWAGGER'] = False
    app.register_blueprint(api_blueprint)

    # CORS(app)  # Enable to allow cross-origin requests; while this stays
    #            # commented out, cross-origin requests are not allowed.
    # Passing `app` to the constructor already initialises the extension, so
    # no additional csrf.init_app(app) call is made (it would register the
    # extension on the app a second time).
    csrf = CSRFProtect(app)

    syn_models_dirt = "synthesizer/saved_models"
    synthesizers = list(Path(syn_models_dirt).glob("**/*.pt"))
    # Cache of loaded synthesizers, keyed by the model path as a string.
    synthesizers_cache = {}
    encoder.load_model(Path("encoder/saved_models/pretrained.pt"))
    rnn_vocoder.load_model(Path("vocoder/saved_models/pretrained/pretrained.pt"))
    gan_vocoder.load_model(Path("vocoder/saved_models/pretrained/g_hifigan.pt"))

    @csrf.exempt
    @app.route("/api/synthesize", methods=["POST"])
    def synthesize():
        """Synthesize speech for the posted text in the posted voice.

        Form fields read: ``synt_path`` (optional), ``upfile_b64`` (optional
        base64 audio; otherwise the uploaded ``file`` is used), ``text`` and
        ``vocoder`` (optional, ``"WaveRNN"`` selects the RNN vocoder).
        Responds with the generated audio as ``audio/wav``.
        """
        # TODO Implementation with json to support more platform
        # Load synthesizer
        if "synt_path" in request.form:
            # NOTE(review): this path comes straight from the request and is
            # handed to Synthesizer unchecked -- consider restricting it to
            # the entries discovered in `synthesizers`.
            synt_path = request.form["synt_path"]
        else:
            if not synthesizers:
                return Response(
                    "No synthesizer model found in " + syn_models_dirt,
                    status=500,
                    mimetype="text/plain",
                )
            synt_path = synthesizers[0]
            print("NO synthsizer is specified, try default first one.")
        # Normalise the key so a str from the form and a Path from the glob
        # that name the same model share one cache entry.
        cache_key = str(synt_path)
        current_synt = synthesizers_cache.get(cache_key)
        if current_synt is None:
            current_synt = Synthesizer(Path(synt_path))
            synthesizers_cache[cache_key] = current_synt
        print("using synthesizer model: " + str(synt_path))

        # Load input wav
        if "upfile_b64" in request.form:
            wav_base64 = request.form["upfile_b64"]
            wav = base64.b64decode(bytes(wav_base64, 'utf-8'))
            # The decoded bytes are interpreted as 16-bit PCM samples.
            wav = _pcm2float(np.frombuffer(wav, dtype=np.int16), dtype=np.float32)
            sample_rate = Synthesizer.sample_rate
        else:
            wav, sample_rate = librosa.load(request.files['file'])
        write("temp.wav", sample_rate, wav)  # Make sure we get the correct wav

        encoder_wav = encoder.preprocess_wav(wav, sample_rate)
        embed, _, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

        # Load input text; punctuate and split/clean it
        texts = _split_texts(request.form["text"])
        if not texts:
            # Nothing to synthesize: answer with a client error instead of
            # failing later when concatenating an empty list of spectrograms.
            return Response(
                "No text to synthesize.", status=400, mimetype="text/plain"
            )

        # synthesize and vocode
        embeds = [embed] * len(texts)
        specs = current_synt.synthesize_spectrograms(texts, embeds)
        spec = np.concatenate(specs, axis=1)
        if request.form.get("vocoder") == "WaveRNN":
            wav, sample_rate = rnn_vocoder.infer_waveform(spec)
        else:
            wav, sample_rate = gan_vocoder.infer_waveform(spec)

        # Return cooked wav
        out = io.BytesIO()
        write(out, sample_rate, wav.astype(np.float32))
        # Send the finished buffer as a single bytes body rather than letting
        # the response iterate the BytesIO object line by line.
        return Response(out.getvalue(), mimetype="audio/wav")

    @app.route('/', methods=['GET'])
    def index():
        """Serve the front-end page."""
        return render_template("index.html")

    host = app.config.get("HOST")
    port = app.config.get("PORT")
    web_address = 'http://{}:{}'.format(host, port)
    print("Web server:" + web_address)
    webbrowser.open(web_address)
    server = wsgi.WSGIServer((host, port), app)
    server.serve_forever()
    return app

# Script entry point: build the app and start serving when this file is
# executed directly (not when it is imported as a module).
if __name__ == "__main__":
    webApp()