update app

- Dockerfile +3 -1
- app.py +21 -17
- engine.py +144 -0

Dockerfile
CHANGED
@@ -11,11 +11,13 @@ RUN cd espeak-ng && \
     make install

 COPY requirements.txt .
-COPY app.py .
 COPY models .

 RUN pip install -r requirements.txt

+COPY engine.py .
+COPY app.py .
+
 RUN mkdir -p cache && chmod 777 cache

 ENV NUMBA_CACHE_DIR=./cache
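Two details of this Dockerfile are worth noting: engine.py and app.py are copied after `pip install -r requirements.txt`, so code edits do not invalidate the cached dependency layer, and the world-writable `cache` directory exists because numba (a transitive dependency of the TTS stack) JIT-compiles at import time and must be able to write its cache when the container runs as a non-root user. A minimal sketch of how that environment variable is consumed (the fallback path below is an assumption, not part of the image):

import os

# numba reads NUMBA_CACHE_DIR at import time; the image sets it to ./cache.
# The fallback here is illustrative only.
cache_dir = os.environ.get("NUMBA_CACHE_DIR", "/tmp/numba_cache")
os.makedirs(cache_dir, exist_ok=True)  # mirrors `RUN mkdir -p cache` in the image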
app.py
CHANGED
@@ -7,42 +7,48 @@ import os
 import json
 from TTS.utils.manage import ModelManager
 from TTS.utils.synthesizer import Synthesizer
-
+from .engine import Piper

 MAX_TXT_LEN = 100

 SPEAKERS = ['f_cen_05', 'f_cen_81', 'f_occ_31', 'f_occ_de', 'f_sep_31', 'm_cen_08', 'm_occ_44', 'm_val_89']

+def carrega_bsc():
+    model_path = os.getcwd() + "/models/bsc/best_model.pth"
+    config_path = os.getcwd() + "/models/bsc/config.json"
+    speakers_file_path = os.getcwd() + "/models/bsc/speakers.pth"
+    vocoder_path = None
+    vocoder_config_path = None
+
+    synthesizer = Synthesizer(
+        model_path, config_path, speakers_file_path, None, vocoder_path, vocoder_config_path,
+    )
+
+    return synthesizer
+
+model_bsc = carrega_bsc()
+SPEAKERS = model_bsc.speakers
+
+
 def tts(text, speaker_idx):
     if len(text) > MAX_TXT_LEN:
         text = text[:MAX_TXT_LEN]
         print(f"Input text was cutoff since it went over the {MAX_TXT_LEN} character limit.")
     print(text)

-    model_path = os.getcwd() + "/best_model.pth"
-    config_path = os.getcwd() + "/config.json"
-    speakers_file_path = os.getcwd() + "/speakers.pth"
     speakers_maping_path = os.getcwd() + "/speaker_map.json"
-    vocoder_path = None
-    vocoder_config_path = None
-
-    synthesizer = Synthesizer(
-        model_path, config_path, speakers_file_path, None, vocoder_path, vocoder_config_path,
-    )

     # Map speaker aliases to speaker ids
     with open(speakers_maping_path, 'r') as fp:
         maping = json.load(fp)

-    speaker_idx = maping[speaker_idx]
+    #speaker_idx = maping[speaker_idx]

     # synthesize
-
-    raise NameError("model not found")
-    wavs = synthesizer.tts(text, speaker_idx)
+    wavs = model_bsc.tts(text, speaker_idx)
     # return output
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
-
+        model_bsc.save_wav(wavs, fp)
         return fp.name


@@ -66,11 +72,9 @@ iface = gr.Interface(
     ],
     outputs=gr.outputs.Audio(label="Output",type="filepath"),
     title="🗣️ TTS Català Multi Parlant - VITS 🗣️",
-    theme="grass",
     description=description,
     article=article,
     allow_flagging="never",
-    flagging_options=['error', 'bad-quality', 'wrong-pronounciation'],
     layout="vertical",
     live=False
 )
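Substantively, this change hoists model loading out of the request path: the old `tts()` constructed a `Synthesizer` on every call and then hit a leftover `raise NameError("model not found")` before ever synthesizing, while the new code builds the model once at import via `carrega_bsc()` and derives `SPEAKERS` from the loaded model (the alias-to-id mapping is now commented out, so speaker names pass straight through). A hypothetical smoke test of the refactored function, assuming the models/bsc files ship with the Space (the sample sentence is made up):

# Runs inside app.py's namespace after the module-level load.
wav_path = tts("Bona tarda a tothom.", SPEAKERS[0])
print(wav_path)  # path to a temporary .wav written via model_bsc.save_wav

One caveat: `from .engine import Piper` is a package-relative import, which raises ImportError when app.py runs as a top-level script; `from engine import Piper` was likely intended, and the class is not yet used in this file.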
engine.py
ADDED
@@ -0,0 +1,144 @@
+import io
+import json
+import os
+import wave
+from dataclasses import dataclass
+from pathlib import Path
+from typing import List, Mapping, Optional, Sequence, Union
+
+import numpy as np
+import onnxruntime
+from espeak_phonemizer import Phonemizer
+
+_BOS = "^"
+_EOS = "$"
+_PAD = "_"
+
+
+@dataclass
+class PiperConfig:
+    num_symbols: int
+    num_speakers: int
+    sample_rate: int
+    espeak_voice: str
+    length_scale: float
+    noise_scale: float
+    noise_w: float
+    phoneme_id_map: Mapping[str, Sequence[int]]
+
+
+class Piper:
+    def __init__(
+        self,
+        model_path: Union[str, Path],
+        config_path: Optional[Union[str, Path]] = None,
+        use_cuda: bool = False,
+    ):
+        if config_path is None:
+            config_path = f"{model_path}.json"
+
+        self.config = load_config(config_path)
+        self.phonemizer = Phonemizer(self.config.espeak_voice)
+        self.onnx_options = onnxruntime.SessionOptions()
+        self.onnx_options.intra_op_num_threads = os.cpu_count() - 1
+        self.model = onnxruntime.InferenceSession(
+            str(model_path),
+            sess_options=self.onnx_options,
+            providers=["CPUExecutionProvider"]
+            if not use_cuda
+            else ["CUDAExecutionProvider"],
+        )
+
+    def synthesize(
+        self,
+        text: str,
+        speaker_id: Optional[int] = None,
+        length_scale: Optional[float] = None,
+        noise_scale: Optional[float] = None,
+        noise_w: Optional[float] = None,
+    ) -> bytes:
+        """Synthesize WAV audio from text."""
+        if length_scale is None:
+            length_scale = self.config.length_scale
+
+        if noise_scale is None:
+            noise_scale = self.config.noise_scale
+
+        if noise_w is None:
+            noise_w = self.config.noise_w
+
+        phonemes_str = self.phonemizer.phonemize(text)
+        phonemes = [_BOS] + list(phonemes_str)
+        phoneme_ids: List[int] = []
+
+        for phoneme in phonemes:
+            phoneme_ids.extend(self.config.phoneme_id_map[phoneme])
+            phoneme_ids.extend(self.config.phoneme_id_map[_PAD])
+
+        phoneme_ids.extend(self.config.phoneme_id_map[_EOS])
+
+        phoneme_ids_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
+        phoneme_ids_lengths = np.array([phoneme_ids_array.shape[1]], dtype=np.int64)
+        scales = np.array(
+            [noise_scale, length_scale, noise_w],
+            dtype=np.float32,
+        )
+
+        if (self.config.num_speakers > 1) and (speaker_id is None):
+            # Default speaker
+            speaker_id = 0
+
+        sid = None
+
+        if speaker_id is not None:
+            sid = np.array([speaker_id], dtype=np.int64)
+
+        # Synthesize through Onnx
+        audio = self.model.run(
+            None,
+            {
+                "input": phoneme_ids_array,
+                "input_lengths": phoneme_ids_lengths,
+                "scales": scales,
+                "sid": sid,
+            },
+        )[0].squeeze((0, 1))
+        audio = audio_float_to_int16(audio.squeeze())
+
+        # Convert to WAV
+        with io.BytesIO() as wav_io:
+            wav_file: wave.Wave_write = wave.open(wav_io, "wb")
+            with wav_file:
+                wav_file.setframerate(self.config.sample_rate)
+                wav_file.setsampwidth(2)
+                wav_file.setnchannels(1)
+                wav_file.writeframes(audio.tobytes())
+
+            return wav_io.getvalue()
+
+
+def load_config(config_path: Union[str, Path]) -> PiperConfig:
+    with open(config_path, "r", encoding="utf-8") as config_file:
+        config_dict = json.load(config_file)
+        inference = config_dict.get("inference", {})
+
+        return PiperConfig(
+            num_symbols=config_dict["num_symbols"],
+            num_speakers=config_dict["num_speakers"],
+            sample_rate=config_dict["audio"]["sample_rate"],
+            espeak_voice=config_dict["espeak"]["voice"],
+            noise_scale=inference.get("noise_scale", 0.667),
+            length_scale=inference.get("length_scale", 1.0),
+            noise_w=inference.get("noise_w", 0.8),
+            phoneme_id_map=config_dict["phoneme_id_map"],
+        )
+
+
+def audio_float_to_int16(
+    audio: np.ndarray, max_wav_value: float = 32767.0
+) -> np.ndarray:
+    """Normalize audio and convert to int16 range"""
+    audio_norm = audio * (max_wav_value / max(0.01, np.max(np.abs(audio))))
+    audio_norm = np.clip(audio_norm, -max_wav_value, max_wav_value)
+    audio_norm = audio_norm.astype("int16")
+    return audio_norm
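engine.py is a self-contained Piper-style ONNX wrapper: it phonemizes text with espeak-ng, maps each phoneme to ids with the pad id interleaved after every phoneme (plus `^` BOS and `$` EOS sentinels), runs the VITS graph through onnxruntime, then rescales the float waveform to 16-bit mono PCM and packages it as WAV bytes. A hypothetical usage sketch, assuming an exported model at models/voice.onnx with its JSON config alongside (both paths are invented for illustration):

from engine import Piper  # top-level import; the diff's `from .engine` form needs a package

voice = Piper("models/voice.onnx")  # reads models/voice.onnx.json for its config
wav_bytes = voice.synthesize(
    "Bon dia!",
    speaker_id=None,   # multi-speaker models fall back to speaker 0
    length_scale=1.0,  # >1.0 slows the speech down, <1.0 speeds it up
)
with open("out.wav", "wb") as out:
    out.write(wav_bytes)

Nothing in this commit calls the class yet; app.py only imports it, so the sketch above shows how a follow-up change might wire it in.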