# Module setup for the Catalan multispeaker TTS tester: imports, RNG seeding,
# CleanUNet denoiser configuration, environment settings, model discovery and
# the list of available speakers.
import json
import os
import random
import subprocess
import tempfile
import time
from copy import deepcopy
from typing import Optional

import gradio as gr
import numpy as np
import torch
# CleanUnet Dependencies (together with json, deepcopy and random above)
# from util import print_size, sampling
import torchaudio
import torchaudio.transforms as T
from dotenv import load_dotenv
from TTS.utils.synthesizer import Synthesizer

from AinaTheme import AinaGradioTheme

# Seed the Python, NumPy and PyTorch RNGs once with a fixed value.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)

SAMPLE_RATE = 8000
CONFIG = "configs/DNS-large-full.json"
# CHECKPOINT = "./exp/DNS-large-full/checkpoint/pretrained.pkl"

# Parse configs. Module-level names are already global, so no `global`
# statements are needed here.
with open(CONFIG, encoding="utf-8") as f:
    data = f.read()
config = json.loads(data)
gen_config = config["gen_config"]
network_config = config["network_config"]  # to define wavenet
train_config = config["train_config"]  # train config
trainset_config = config["trainset_config"]  # to read trainset configurations

# global use_denoise
# use_denoise = False

# setup local experiment path
exp_path = train_config["exp_path"]
print('exp_path:', exp_path)

# load data
# Work on a deep copy so that trainset_config itself is left untouched.
loader_config = deepcopy(trainset_config)
loader_config["crop_length_sec"] = 0

#############################################################################################################

load_dotenv()

MAX_INPUT_TEXT_LEN = int(os.environ.get("MAX_INPUT_TEXT_LEN", default=500))

# Dynamically read model files, exclude 'speakers.pth'
model_files = [f for f in os.listdir(os.getcwd()) if f.endswith('.pth') and f != 'speakers.pth']
# Most recently modified checkpoint first.
model_files.sort(key=lambda x: os.path.getmtime(os.path.join(os.getcwd(), x)), reverse=True)

speakers_path = "speakers.pth"
# NOTE(review): torch.load unpickles the file, so speakers_path must only ever
# point at a trusted file.
speakers_list = list(torch.load(speakers_path).keys())

default_speaker_list = speakers_list

# # Filtered lists based on dataset
festcat_speakers = [s for s in speakers_list if len(s) == 3] # google_speakers = [s for s in speakers_list if 3 < len(s) < 20] # commonvoice_speakers = [s for s in speakers_list if len(s) > 20] # DEFAULT_SPEAKER_ID = os.environ.get("DEFAULT_SPEAKER_ID", default="pau") model_file = model_files[0] # change this!! model_path = os.path.join(os.getcwd(), model_file) config_path = "config.json" vocoder_path = None vocoder_config_path = None synthesizer = Synthesizer( model_path, config_path, speakers_path, None, vocoder_path, vocoder_config_path, ) def get_phonetic_transcription(text: str): try: result = subprocess.run( ['espeak-ng', '--ipa', '-v', 'ca', text], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=True ) return result.stdout.strip() except subprocess.CalledProcessError as e: print(f"An error occurred: {e}") return None def tts_inference(text: str, speaker_idx: str = None, use_denoise: int = 0): # synthesize if synthesizer is None: raise NameError("model not found") t1 = time.time() wavs = synthesizer.tts(text, speaker_idx) print(type(wavs)) if use_denoise == 0: wavs_den = torch.Tensor(wavs).unsqueeze(0) # one sample # wavs_den = denoise(wavs_den).tolist() else: wavs_den = wavs # return output with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp: # wavs must be a list of integers synthesizer.save_wav(wavs, fp) t2 = time.time() - t1 print(round(t2, 2)) output_audio = fp.name with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp: # wavs must be a list of integers synthesizer.save_wav(wavs_den, fp) output_audio_den = fp.name return output_audio, output_audio_den title = "🗣️ Catalan Multispeaker TTS Tester 🗣️" description = """ 1️⃣ Enter the text to synthesize. 2️⃣ Select a voice from the dropdown menu. 3️⃣ Enjoy! 
""" def submit_input(input_, speaker_id, use_dn): output_audio = None output_phonetic = None if input_ is not None and len(input_) < MAX_INPUT_TEXT_LEN: output_audio, output_audio_den = tts_inference(input_, speaker_id, use_dn) output_phonetic = get_phonetic_transcription(input_) else: gr.Warning(f"Your text exceeds the {MAX_INPUT_TEXT_LEN}-character limit.") return output_audio, output_audio_den, output_phonetic def change_interactive(text): input_state = text if input_state.strip() != "": return gr.update(interactive=True) else: return gr.update(interactive=False) def clean(): return ( None, None, ) with gr.Blocks(**AinaGradioTheme().get_kwargs()) as app: gr.Markdown(f"