import os
import random
import subprocess
import tempfile
import time

import gradio as gr
import numpy as np
import torch
from AinaTheme import AinaGradioTheme
from dotenv import load_dotenv
from TTS.utils.synthesizer import Synthesizer

# Seed all RNGs once so synthesis runs are reproducible.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)

SAMPLE_RATE = 8000

#############################################################################################################

load_dotenv()

MAX_INPUT_TEXT_LEN = int(os.environ.get("MAX_INPUT_TEXT_LEN", default=500))

# Dynamically read model checkpoints from the 'checkpoints' directory.
model_files = [f for f in os.listdir(os.path.join(os.getcwd(), 'checkpoints')) if f.endswith('.pth')]

speakers_path = "speakers.pth"
speakers_list = list(torch.load(speakers_path).keys())

# Filtered speaker lists per source dataset; speaker IDs differ in length by dataset.
festcat_speakers = [s for s in speakers_list if len(s) == 3]
google_speakers = [s for s in speakers_list if 3 < len(s) < 20]
commonvoice_speakers = [s for s in speakers_list if len(s) > 20]

DEFAULT_SPEAKER_ID = os.environ.get("DEFAULT_SPEAKER_ID", default="pau")
DEFAULT_CHECKPOINT = os.environ.get("DEFAULT_CHECKPOINT", default=model_files[0])

# TODO: the checkpoint dropdown below is not yet wired to reload the model;
# the first checkpoint found is always the one loaded.
model_file = model_files[0]
model_path = os.path.join(os.getcwd(), 'checkpoints', model_file)
config_path = "config.json"
vocoder_path = None
vocoder_config_path = None

synthesizer = Synthesizer(
    model_path,
    config_path,
    speakers_path,
    None,
    vocoder_path,
    vocoder_config_path,
)


def get_phonetic_transcription(text: str):
    """Return the IPA transcription of `text` using the espeak-ng Catalan voice."""
    try:
        result = subprocess.run(
            ['espeak-ng', '--ipa', '-v', 'ca', text],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=True
        )
        return result.stdout.strip()
    except subprocess.CalledProcessError as e:
        print(f"An error occurred: {e}")
        return None


def tts_inference(text: str, speaker_idx: str = None):
    # synthesize
    if synthesizer is None:
        raise NameError("model not found")
    t1 = time.time()
    wavs = synthesizer.tts(text, speaker_idx)
    wavs_den = wavs  # denoising is currently disabled; the raw waveform is passed through

    # Write the synthesized audio to a temporary WAV file and return its path.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        # wavs must be a list of integers
        synthesizer.save_wav(wavs_den, fp)
    t2 = time.time() - t1
    print(round(t2, 2))
    return fp.name


title = "🗣️ Catalan Multispeaker TTS Tester 🗣️"
description = """
1️⃣ Enter the text to synthesize.
2️⃣ Select a voice from the dropdown menu.
3️⃣ Enjoy!
"""


def submit_input(input_, speaker_id):
    output_audio = None
    output_phonetic = None
    if input_ is not None and len(input_) < MAX_INPUT_TEXT_LEN:
        output_audio = tts_inference(input_, speaker_id)
        output_phonetic = get_phonetic_transcription(input_)
    else:
        gr.Warning(f"Your text exceeds the {MAX_INPUT_TEXT_LEN}-character limit.")
    return output_audio, output_phonetic


def change_interactive(text):
    # Enable the submit button only when the textbox contains non-whitespace text.
    if text.strip() != "":
        return gr.update(interactive=True)
    return gr.update(interactive=False)


def clean():
    return (
        None,
        None,
    )


with gr.Blocks(**AinaGradioTheme().get_kwargs()) as app:
    gr.Markdown(f"""

# {title}

") gr.Markdown(description) with gr.Row(equal_height=False): with gr.Column(variant='panel'): input_ = gr.Textbox( label="Text", value="Introdueix el text a sintetitzar.", lines=4 ) dataset = gr.Radio(["All", "Festcat", "Google TTS", "CommonVoice"], label="Speakers Dataset", value="All") def update_speaker_list(dataset): print("Updating speaker list based on dataset:", dataset) if dataset == "Festcat": current_speakers = festcat_speakers elif dataset == "Google TTS": current_speakers = google_speakers elif dataset == "CommonVoice": current_speakers = commonvoice_speakers else: current_speakers = speakers_list return gr.update(choices=current_speakers, value=current_speakers[0]) speaker_id = gr.Dropdown(label="Select a voice", choices=speakers_list, value=DEFAULT_SPEAKER_ID, interactive=True) model_chkpt = gr.Dropdown(label="Select a checkpoint", choices=model_files, value=DEFAULT_CHECKPOINT, interactive=True) dataset.change(fn=update_speaker_list, inputs=dataset, outputs=speaker_id) # model = gr.Dropdown(label="Select a model", choices=model_files, value=DEFAULT_MODEL_FILE_NAME) with gr.Row(): clear_btn = gr.ClearButton(value='Clean', components=[input_]) # clear_btn = gr.Button( # "Clean", # ) submit_btn = gr.Button( "Submit", variant="primary", ) # use_denoise = gr.Radio(choices=[("Yes", 0), ("No", 1)], value=0) with gr.Column(variant='panel'): output_audio = gr.Audio(label="Output", type="filepath", autoplay=True, show_share_button=False) # output_audio_den = gr.Audio(label="Output denoised", type="filepath", autoplay=False, show_share_button=False) output_phonetic = gr.Textbox(label="Phonetic Transcription", readonly=True) for button in [submit_btn]: # clear_btn input_.change(fn=change_interactive, inputs=[input_], outputs=button) # clear_btn.click(fn=clean, inputs=[], outputs=[input_, output_audio, output_phonetic], queue=False) submit_btn.click(fn=submit_input, inputs=[input_, speaker_id], outputs=[output_audio, output_phonetic]) app.queue(concurrency_count=1, api_open=False) app.launch(show_api=False, server_name="0.0.0.0", server_port=7860)