|
import io |
|
import json |
|
import os |
|
import wave |
|
from dataclasses import dataclass |
|
from pathlib import Path |
|
from typing import List, Mapping, Optional, Sequence, Union |
|
|
|
import numpy as np |
|
import onnxruntime |
|
from espeak_phonemizer import Phonemizer |
|
|
|
# Marker symbols looked up in the voice's phoneme_id_map when building the
# model input: beginning-of-sequence, end-of-sequence, and the padding symbol
# inserted after every phoneme.
_BOS = "^"
_EOS = "$"
_PAD = "_"
|
|
|
|
|
@dataclass
class PiperConfig:
    """Settings for a single voice, as loaded from its JSON config file.

    Field order is part of the interface (positional construction), so it is
    kept exactly as declared.
    """

    # Value of the "num_symbols" key in the config file.
    num_symbols: int

    # Number of speakers in the model; values above 1 mean a speaker id is
    # sent to the model during synthesis.
    num_speakers: int

    # Frame rate written into the WAV header of synthesized audio.
    sample_rate: int

    # Voice name handed to the espeak phonemizer.
    espeak_voice: str

    # Default synthesis scales, used when the caller does not override them.
    length_scale: float
    noise_scale: float
    noise_w: float

    # Maps each phoneme (and the BOS/EOS/PAD marker symbols) to the id(s)
    # fed to the model.
    phoneme_id_map: Mapping[str, Sequence[int]]
|
|
|
|
|
class Piper:
    """Text-to-speech voice: espeak phonemization followed by ONNX inference.

    Attributes:
        config: The voice's PiperConfig, loaded from its JSON config file.
        phonemizer: Phonemizer created for ``config.espeak_voice``.
        onnx_options: Session options used to create the inference session.
        model: The onnxruntime inference session for the voice model.
    """

    def __init__(
        self,
        model_path: Union[str, Path],
        config_path: Optional[Union[str, Path]] = None,
        use_cuda: bool = False,
    ):
        """Load the voice config and create the inference session.

        Args:
            model_path: Path to the ONNX model file.
            config_path: Path to the JSON config. Defaults to
                ``f"{model_path}.json"`` when omitted.
            use_cuda: Use the CUDA execution provider instead of the CPU one.
        """
        if config_path is None:
            config_path = f"{model_path}.json"

        self.config = load_config(config_path)
        self.phonemizer = Phonemizer(self.config.espeak_voice)
        self.onnx_options = onnxruntime.SessionOptions()

        # os.cpu_count() may return None; leave one core free when possible
        # but never request fewer than one thread.
        self.onnx_options.intra_op_num_threads = max(1, (os.cpu_count() or 1) - 1)

        self.model = onnxruntime.InferenceSession(
            str(model_path),
            sess_options=self.onnx_options,
            providers=["CUDAExecutionProvider"]
            if use_cuda
            else ["CPUExecutionProvider"],
        )

    def synthesize(
        self,
        text: str,
        speaker_id: Optional[int] = None,
        length_scale: Optional[float] = None,
        noise_scale: Optional[float] = None,
        noise_w: Optional[float] = None,
    ) -> bytes:
        """Synthesize WAV audio from text.

        Args:
            text: Text to speak.
            speaker_id: Speaker to use. When omitted and the model has more
                than one speaker, speaker 0 is used.
            length_scale: Overrides ``config.length_scale`` when given.
            noise_scale: Overrides ``config.noise_scale`` when given.
            noise_w: Overrides ``config.noise_w`` when given.

        Returns:
            The bytes of a complete mono, 16-bit WAV file at
            ``config.sample_rate``.

        Raises:
            KeyError: If a phoneme is missing from ``config.phoneme_id_map``.
        """
        if length_scale is None:
            length_scale = self.config.length_scale

        if noise_scale is None:
            noise_scale = self.config.noise_scale

        if noise_w is None:
            noise_w = self.config.noise_w

        phonemes_str = self.phonemizer.phonemize(text, keep_clause_breakers=True)
        phonemes = [_BOS] + list(phonemes_str)
        phoneme_ids: List[int] = []

        # Every symbol (including BOS) is followed by the padding id(s).
        for phoneme in phonemes:
            phoneme_ids.extend(self.config.phoneme_id_map[phoneme])
            phoneme_ids.extend(self.config.phoneme_id_map[_PAD])

        phoneme_ids.extend(self.config.phoneme_id_map[_EOS])

        # Add a leading batch axis of size 1.
        phoneme_ids_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
        phoneme_ids_lengths = np.array([phoneme_ids_array.shape[1]], dtype=np.int64)
        scales = np.array(
            [noise_scale, length_scale, noise_w],
            dtype=np.float32,
        )

        if (self.config.num_speakers > 1) and (speaker_id is None):
            # Default speaker: only fill in when the caller did not choose
            # one, so an explicit speaker_id is never overwritten.
            speaker_id = 0

        sid = None

        if speaker_id is not None:
            sid = np.array([speaker_id], dtype=np.int64)

        audio = self.model.run(
            None,
            {
                "input": phoneme_ids_array,
                "input_lengths": phoneme_ids_lengths,
                "scales": scales,
                "sid": sid,
            },
        )[0].squeeze((0, 1))
        audio = audio_float_to_int16(audio.squeeze())

        # Wrap the int16 samples in a WAV container held in memory.
        with io.BytesIO() as wav_io:
            wav_file: wave.Wave_write = wave.open(wav_io, "wb")
            with wav_file:
                wav_file.setframerate(self.config.sample_rate)
                wav_file.setsampwidth(2)
                wav_file.setnchannels(1)
                wav_file.writeframes(audio.tobytes())

            # Read the buffer after the writer is closed (header finalized)
            # but before the BytesIO itself is closed.
            return wav_io.getvalue()
|
|
|
|
|
def load_config(config_path: Union[str, Path]) -> PiperConfig:
    """Read a voice's JSON config file and build a PiperConfig from it.

    Required keys: ``num_symbols``, ``num_speakers``, ``audio.sample_rate``,
    ``espeak.voice`` and ``phoneme_id_map``. The optional ``inference``
    section may supply ``noise_scale`` (default 0.667), ``length_scale``
    (default 1.0) and ``noise_w`` (default 0.8).

    Args:
        config_path: Path to the UTF-8 encoded JSON config file.

    Returns:
        The populated PiperConfig.

    Raises:
        KeyError: If a required key is missing from the config.
    """
    with open(config_path, "r", encoding="utf-8") as config_file:
        config_dict = json.load(config_file)

    # The whole "inference" section is optional.
    inference = config_dict.get("inference", {})

    return PiperConfig(
        num_symbols=config_dict["num_symbols"],
        num_speakers=config_dict["num_speakers"],
        sample_rate=config_dict["audio"]["sample_rate"],
        espeak_voice=config_dict["espeak"]["voice"],
        noise_scale=inference.get("noise_scale", 0.667),
        length_scale=inference.get("length_scale", 1.0),
        noise_w=inference.get("noise_w", 0.8),
        phoneme_id_map=config_dict["phoneme_id_map"],
    )
|
|
|
|
|
def audio_float_to_int16(
    audio: np.ndarray, max_wav_value: float = 32767.0
) -> np.ndarray:
    """Normalize audio and convert to int16 range.

    The signal is scaled so that its peak absolute value maps to
    ``max_wav_value``, then clipped and cast to int16.

    Args:
        audio: Floating-point audio samples.
        max_wav_value: Target peak magnitude after scaling.

    Returns:
        An int16 array with the same shape as ``audio``.
    """
    # np.max raises on a zero-size array; empty audio stays empty.
    if audio.size == 0:
        return audio.astype("int16")

    # Flooring the peak at 0.01 keeps near-silent input from being amplified
    # without bound (and avoids dividing by zero on pure silence).
    peak = max(0.01, np.max(np.abs(audio)))
    audio_norm = audio * (max_wav_value / peak)
    audio_norm = np.clip(audio_norm, -max_wav_value, max_wav_value)
    return audio_norm.astype("int16")
|
|