"""Gradio demo that synthesizes the same Catalan text with three TTS engines.

The script loads a multi-speaker BSC model and a Collectivat FastSpeech model
(both through the Coqui ``Synthesizer``) plus a Piper ONNX voice, then serves a
Gradio interface that plays the three results side by side.
"""
import json
import os
import tempfile
from typing import Optional

import gradio as gr
import numpy as np
from TTS.config import load_config
from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer

from engine import Piper

# Longer inputs are truncated to this many characters before synthesis.
MAX_TXT_LEN = 100

# Sample rate reported to Gradio for the Piper waveform.
# NOTE(review): value carried over unchanged; confirm it matches the
# "x-low" Piper voice loaded in carrega_piper().
PIPER_SAMPLE_RATE = 16000


def _model_file(*parts: str) -> str:
    """Return the path of a file below ``<current working dir>/models``."""
    return os.path.join(os.getcwd(), "models", *parts)


def carrega_bsc() -> Synthesizer:
    """Build the synthesizer for the BSC model stored under ``models/bsc``.

    The checkpoint, its config and the speakers file are passed to
    ``Synthesizer``; no external vocoder is supplied.
    """
    return Synthesizer(
        _model_file("bsc", "best_model.pth"),
        _model_file("bsc", "config.json"),
        _model_file("bsc", "speakers.pth"),
        None,
        None,  # vocoder checkpoint: not used for this model
        None,  # vocoder config: not used for this model
    )


def carrega_collectivat() -> Synthesizer:
    """Build the synthesizer for the Collectivat FastSpeech model.

    The acoustic model is paired with the HiFi-GAN vocoder files found in
    ``models/collectivat``; no speakers file is supplied.
    """
    return Synthesizer(
        _model_file("collectivat", "fast-speech_best_model.pth"),
        _model_file("collectivat", "fast-speech_config.json"),
        None,
        None,
        _model_file("collectivat", "ljspeech--hifigan_v2_model_file.pth"),
        _model_file("collectivat", "ljspeech--hifigan_v2_config.json"),
    )


def carrega_piper() -> Piper:
    """Load the Piper voice from ``models/piper/ca-upc_ona-x-low.onnx``."""
    return Piper(_model_file("piper", "ca-upc_ona-x-low.onnx"))


# All three engines are loaded once, at import time, and shared by every request.
model_bsc = carrega_bsc()
# The dropdown choices come straight from the loaded BSC model.
SPEAKERS = model_bsc.tts_model.speaker_manager.speaker_names
model_collectivat = carrega_collectivat()
model_piper = carrega_piper()


def tts(text: str, speaker_idx: Optional[str]):
    """Synthesize ``text`` with the three loaded engines.

    Args:
        text: Text to synthesize; anything beyond ``MAX_TXT_LEN`` characters
            is dropped.
        speaker_idx: Speaker selected in the dropdown. It is forwarded to the
            BSC model only; the other two engines receive just the text.

    Returns:
        Three ``(sample_rate, waveform)`` pairs, in the order BSC,
        Collectivat, Piper, matching the three ``gr.Audio`` outputs.
    """
    if len(text) > MAX_TXT_LEN:
        text = text[:MAX_TXT_LEN]
        print(f"Input text was cutoff since it went over the {MAX_TXT_LEN} character limit.")
    print(text)

    wav_bsc = model_bsc.tts(text, speaker_idx)
    wav_coll = model_collectivat.tts(text)
    wav_piper = model_piper.synthesize(text)

    # Coqui's Synthesizer.tts returns a plain list of samples, while
    # gr.Audio(type="numpy") works on arrays, hence the np.asarray wrappers.
    # The Collectivat rate is read from the synthesizer (which accounts for
    # the vocoder) instead of being hard-coded.
    # NOTE(review): the Piper output type is not visible here, so it is
    # passed through untouched.
    return (
        (model_bsc.tts_config.audio["sample_rate"], np.asarray(wav_bsc)),
        (model_collectivat.output_sample_rate, np.asarray(wav_coll)),
        (PIPER_SAMPLE_RATE, wav_piper),
    )


description = """ 1️⃣ Introdueix el text a sintetitzar. 2️⃣ Selecciona una veu en el desplegable. 3️⃣ Gaudeix! """
article = ""

iface = gr.Interface(
    fn=tts,
    inputs=[
        gr.Textbox(
            label="Text",
            default="L'Èlia i l'Alí a l'aula. L'oli i l'ou. Lulú olorava la lila.",
        ),
        gr.Dropdown(label="Selecciona un parlant", choices=SPEAKERS, default=None),
    ],
    outputs=[
        gr.Audio(label="BSC VITS", type="numpy"),
        gr.Audio(label="Collectivat Fastspeech", type="numpy"),
        gr.Audio(label="Piper VITS", type="numpy"),
    ],
    title="🗣️ TTS Català Multi Parlant - VITS 🗣️",
    description=description,
    article=article,
    allow_flagging="never",
    layout="vertical",
    live=False,
)
iface.launch(server_name="0.0.0.0", server_port=7860)