import os from paddlespeech.cli.tts.infer import TTSExecutor """ PaddleSpeech 声码器说明:这里预制了三种声码器【PWGan】【WaveRnn】【HifiGan】, 三种声码器效果和生成时间有比较大的差距,请跟进自己的需要进行选择。不过只选择了前两种,因为WaveRNN太慢了 | 声码器 | 音频质量 | 生成速度 | | :----: | :----: | :----: | | PWGan | 中等 | 中等 | | WaveRnn | 高 | 非常慢(耐心等待) | | HifiGan | 低 | 快 | 这些PaddleSpeech中的样例主要按数据集分类,我们主要使用的TTS数据集有: CSMCS (普通话单发音人) AISHELL3 (普通话多发音人) LJSpeech (英文单发音人) VCTK (英文多发音人) PaddleSpeech 的 TTS 模型具有以下映射关系: tts0 - Tacotron2 tts1 - TransformerTTS tts2 - SpeedySpeech tts3 - FastSpeech2 voc0 - WaveFlow voc1 - Parallel WaveGAN voc2 - MelGAN voc3 - MultiBand MelGAN voc4 - Style MelGAN voc5 - HiFiGAN vc0 - Tacotron2 Voice Clone with GE2E vc1 - FastSpeech2 Voice Clone with GE2E 以下是 PaddleSpeech 提供的可以被命令行和 python API 使用的预训练模型列表: - 声学模型 | 模型 | 语言 | | :--- | :---: | | speedyspeech_csmsc | zh | | fastspeech2_csmsc | zh | | fastspeech2_ljspeech | en | | fastspeech2_aishell3 | zh | | fastspeech2_vctk | en | | fastspeech2_cnndecoder_csmsc | zh | | fastspeech2_mix | mix | | tacotron2_csmsc | zh | | tacotron2_ljspeech | en | | fastspeech2_male | zh | | fastspeech2_male | en | | fastspeech2_male | mix | | fastspeech2_canton | canton | - 声码器 | 模型 | 语言 | | :--- | :---: | | pwgan_csmsc | zh | | pwgan_ljspeech | en | | pwgan_aishell3 | zh | | pwgan_vctk | en | | mb_melgan_csmsc | zh | | style_melgan_csmsc | zh | | hifigan_csmsc | zh | | hifigan_ljspeech | en | | hifigan_aishell3 | zh | | hifigan_vctk | en | | wavernn_csmsc | zh | | pwgan_male | zh | | hifigan_male | zh | """ class PaddleTTS: def __init__(self) -> None: pass def predict(self, text, am, voc, spk_id = 174, lang = 'zh', male=False, save_path = 'output.wav'): self.tts = TTSExecutor() use_onnx = True voc = voc.lower() am = am.lower() if male: assert voc in ["pwgan", "hifigan"], "male voc must be 'pwgan' or 'hifigan'" wav_file = self.tts( text = text, output = save_path, am='fastspeech2_male', voc= voc + '_male', lang=lang, use_onnx=use_onnx ) return wav_file assert am in ['tacotron2', 'fastspeech2'], "am must be 'tacotron2' or 'fastspeech2'" # 混合中文英文语音合成 if lang == 'mix': # mix只有fastspeech2 am = 'fastspeech2_mix' voc += '_csmsc' # 英文语音合成 elif lang == 'en': am += '_ljspeech' voc += '_ljspeech' # 中文语音合成 elif lang == 'zh': assert voc in ['wavernn', 'pwgan', 'hifigan', 'style_melgan', 'mb_melgan'], "voc must be 'wavernn' or 'pwgan' or 'hifigan' or 'style_melgan' or 'mb_melgan'" am += '_csmsc' voc += '_csmsc' elif lang == 'canton': am = 'fastspeech2_canton' voc = 'pwgan_aishell3' spk_id = 10 print("am:", am, "voc:", voc, "lang:", lang, "male:", male, "spk_id:", spk_id) try: cmd = f'paddlespeech tts --am {am} --voc {voc} --input "{text}" --output {save_path} --lang {lang} --spk_id {spk_id} --use_onnx {use_onnx}' os.system(cmd) wav_file = save_path except: # 语音合成 wav_file = self.tts( text = text, output = save_path, am = am, voc = voc, lang = lang, spk_id = spk_id, use_onnx=use_onnx ) return wav_file if __name__ == "__main__": tts = PaddleTTS() tts.predict("Hello world", 'FastSpeech2', 'PWGan', spk_id=174, lang='en', male=False, save_path='output.wav')