Spaces:

AlexK-PL
/

vits-v2-8khz-inference

Runtime error

App Files Files Community

vits-v2-8khz-inference / app.py

AlexK-PL

Create app.py

9a5d905 verified 10 months ago

raw

history blame

7.83 kB

	import tempfile
	import subprocess
	import time

	from typing import Optional
	from AinaTheme import AinaGradioTheme
	import gradio as gr
	import numpy as np
	import torch
	import os
	from TTS.utils.synthesizer import Synthesizer

	from dotenv import load_dotenv

	# Seed the PyTorch and NumPy random number generators with a fixed value (0).
	torch.manual_seed(0)
	np.random.seed(0)

	# CleanUnet Dependencies

	import json
	from copy import deepcopy

	import numpy as np
	import torch

	# from util import print_size, sampling

	import torchaudio
	import torchaudio.transforms as T

	import random

	# Seed Python's, PyTorch's and NumPy's random number generators with 0.
	random.seed(0)
	torch.manual_seed(0)
	np.random.seed(0)

	SAMPLE_RATE = 8000

	# JSON file holding the CleanUNet generation/network/training settings.
	CONFIG = "configs/DNS-large-full.json"

	# Parse the configuration once at import time; the sections are kept as
	# plain module-level names so the rest of the file can read them directly.
	with open(CONFIG) as f:
	    config = json.load(f)

	gen_config = config["gen_config"]
	network_config = config["network_config"]    # network definition
	train_config = config["train_config"]        # training settings
	trainset_config = config["trainset_config"]  # training-set settings

	# Local experiment path, echoed to stdout at startup.
	exp_path = train_config["exp_path"]
	print('exp_path:', exp_path)

	# Loader settings: an independent copy of the training-set settings with
	# "crop_length_sec" forced to 0.
	loader_config = deepcopy(trainset_config)
	loader_config["crop_length_sec"] = 0

	#############################################################################################################

	# Pull settings from a local .env file (if any) into the process environment.
	load_dotenv()

	# Upper bound on the length of the text accepted for synthesis.
	MAX_INPUT_TEXT_LEN = int(os.environ.get("MAX_INPUT_TEXT_LEN", default=500))

	# Discover model checkpoints in the working directory, newest first.
	# 'speakers.pth' holds the speaker table, not a model, so it is excluded.
	_cwd = os.getcwd()
	model_files = [f for f in os.listdir(_cwd) if f.endswith('.pth') and f != 'speakers.pth']
	model_files.sort(key=lambda x: os.path.getmtime(os.path.join(_cwd, x)), reverse=True)
	if not model_files:
	    # Fail with an actionable message instead of a bare IndexError below.
	    raise FileNotFoundError(f"No model checkpoint (*.pth) found in {_cwd}")

	speakers_path = "speakers.pth"
	# NOTE(review): torch.load unpickles the file; only load a trusted speakers.pth.
	speakers_list = list(torch.load(speakers_path).keys())

	default_speaker_list = speakers_list

	# Speaker ids are grouped by dataset using the length of the id string.
	# NOTE(review): ids of exactly 20 characters (and ids shorter than 3) fall
	# into none of the three groups below - confirm this is intended.
	festcat_speakers = [s for s in speakers_list if len(s) == 3]
	google_speakers = [s for s in speakers_list if 3 < len(s) < 20]
	commonvoice_speakers = [s for s in speakers_list if len(s) > 20]

	DEFAULT_SPEAKER_ID = os.environ.get("DEFAULT_SPEAKER_ID", default="pau")

	# The most recently modified checkpoint is the one that gets served.
	model_file = model_files[0]

	model_path = os.path.join(_cwd, model_file)
	config_path = "config.json"

	# No separate vocoder is configured.
	vocoder_path = None
	vocoder_config_path = None

	synthesizer = Synthesizer(
	    model_path, config_path, speakers_path, None, vocoder_path, vocoder_config_path,
	)


	def get_phonetic_transcription(text: str) -> Optional[str]:
	    """Return the IPA transcription of *text* produced by ``espeak-ng``.

	    The Catalan voice (``-v ca``) is used.

	    Args:
	        text: Text to transcribe.

	    Returns:
	        The stripped standard output of ``espeak-ng``, or ``None`` when the
	        program is missing, cannot be started, or exits with an error.
	    """
	    command = [
	        'espeak-ng',
	        '-q',      # transcription only: do not try to play audio on the server
	        '--ipa',
	        '-v', 'ca',
	        '--',      # end of options: text starting with '-' must stay plain text
	        text,
	    ]
	    try:
	        result = subprocess.run(
	            command,
	            stdout=subprocess.PIPE,
	            stderr=subprocess.PIPE,
	            text=True,
	            check=True,
	        )
	    except (subprocess.CalledProcessError, OSError) as e:
	        # OSError covers a missing or non-executable espeak-ng binary, which
	        # would otherwise escape as an unhandled FileNotFoundError.
	        print(f"An error occurred: {e}")
	        return None
	    return result.stdout.strip()


	def tts_inference(text: str, speaker_idx: Optional[str] = None, use_denoise: int = 0):
	    """Synthesize *text* and write the audio to two temporary WAV files.

	    Args:
	        text: Text to synthesize.
	        speaker_idx: Speaker identifier forwarded to ``synthesizer.tts``.
	        use_denoise: 0 requests denoising, any other value disables it.
	            Currently it has no effect because the denoiser call is disabled.

	    Returns:
	        A ``(output_audio, output_audio_den)`` tuple of WAV file paths. The
	        files are created with ``delete=False`` so they outlive this call.

	    Raises:
	        NameError: If the module-level ``synthesizer`` is ``None``.
	    """
	    if synthesizer is None:
	        raise NameError("model not found")

	    started = time.time()
	    wavs = synthesizer.tts(text, speaker_idx)

	    # The denoiser is not wired in, so the "denoised" output is the raw
	    # waveform. The previous code wrapped the samples in a tensor with an
	    # extra leading batch dimension and saved that directly, which does not
	    # have the sample layout of `wavs` (presumably a flat sample sequence).
	    # TODO: run the denoiser here when use_denoise == 0 once it is available.
	    wavs_den = wavs

	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
	        synthesizer.save_wav(wavs, fp)
	        # Log the elapsed synthesis + save time in seconds.
	        print(round(time.time() - started, 2))
	        output_audio = fp.name

	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
	        synthesizer.save_wav(wavs_den, fp)
	        output_audio_den = fp.name

	    return output_audio, output_audio_den


	# Page heading and the short usage instructions rendered at the top of the UI.
	title = "🗣️ Catalan Multispeaker TTS Tester 🗣️"
	description = """
	1️⃣ Enter the text to synthesize.
	2️⃣ Select a voice from the dropdown menu.
	3️⃣ Enjoy!
	"""


	def submit_input(input_, speaker_id, use_dn):
	    """Handle a click on the submit button.

	    Args:
	        input_: Text to synthesize.
	        speaker_id: Selected speaker identifier.
	        use_dn: Denoise selector forwarded to ``tts_inference``.

	    Returns:
	        ``(output_audio, output_audio_den, output_phonetic)``. All three are
	        ``None`` when the input is rejected, in which case a warning is shown.
	    """
	    output_audio = None
	    # Must be initialised too: the rejection path below returns it, and it
	    # was previously unbound there (UnboundLocalError).
	    output_audio_den = None
	    output_phonetic = None
	    if input_ is not None and len(input_) < MAX_INPUT_TEXT_LEN:
	        output_audio, output_audio_den = tts_inference(input_, speaker_id, use_dn)
	        output_phonetic = get_phonetic_transcription(input_)
	    else:
	        gr.Warning(f"Your text exceeds the {MAX_INPUT_TEXT_LEN}-character limit.")
	    return output_audio, output_audio_den, output_phonetic


	def change_interactive(text):
	    """Build a component update that is interactive only for non-blank *text*.

	    Args:
	        text: Current content of the text box.

	    Returns:
	        ``gr.update(interactive=True)`` when *text* contains anything besides
	        whitespace, otherwise ``gr.update(interactive=False)``.
	    """
	    has_content = text.strip() != ""
	    return gr.update(interactive=has_content)


	def clean():
	    """Return a two-element tuple of ``None`` values."""
	    return None, None


	# Build the Gradio interface: inputs on the left panel, outputs on the right.
	with gr.Blocks(**AinaGradioTheme().get_kwargs()) as app:
	    gr.Markdown(f"<h1 style='text-align: center; margin-bottom: 1rem'>{title}</h1>")
	    gr.Markdown(description)

	    with gr.Row(equal_height=False):

	        # Input panel: text, dataset filter, voice picker and action buttons.
	        with gr.Column(variant='panel'):
	            input_ = gr.Textbox(
	                label="Text",
	                value="Introdueix el text a sintetitzar.",
	                lines=4,
	            )

	            dataset = gr.Radio(
	                ["All", "Festcat", "Google TTS", "CommonVoice"],
	                label="Speakers Dataset",
	                value="All",
	            )

	            def update_speaker_list(dataset):
	                """Return a dropdown update listing the speakers of *dataset*.

	                Unknown dataset names (including "All") select every speaker.
	                """
	                print("Updating speaker list based on dataset:", dataset)
	                if dataset == "Festcat":
	                    current_speakers = festcat_speakers
	                elif dataset == "Google TTS":
	                    current_speakers = google_speakers
	                elif dataset == "CommonVoice":
	                    current_speakers = commonvoice_speakers
	                else:
	                    current_speakers = speakers_list

	                # A dataset filter may match no speakers at all; fall back to
	                # an empty selection instead of raising IndexError.
	                first_speaker = current_speakers[0] if current_speakers else None
	                return gr.update(choices=current_speakers, value=first_speaker)

	            speaker_id = gr.Dropdown(
	                label="Select a voice",
	                choices=speakers_list,
	                value=DEFAULT_SPEAKER_ID,
	                interactive=True,
	            )
	            dataset.change(fn=update_speaker_list, inputs=dataset, outputs=speaker_id)

	            with gr.Row():
	                clear_btn = gr.ClearButton(value='Clean', components=[input_])
	                submit_btn = gr.Button(
	                    "Submit",
	                    variant="primary",
	                )
	                # Choice values are the integers handed to the submit handler:
	                # 0 for "Yes", 1 for "No".
	                use_denoise = gr.Radio(choices=[("Yes", 0), ("No", 1)], value=0)

	        # Output panel: both audio files and the phonetic transcription.
	        with gr.Column(variant='panel'):
	            output_audio = gr.Audio(
	                label="Output",
	                type="filepath",
	                autoplay=True,
	                show_share_button=False,
	            )
	            output_audio_den = gr.Audio(
	                label="Output denoised",
	                type="filepath",
	                autoplay=False,
	                show_share_button=False,
	            )

	            # `readonly` is not a Textbox parameter; `interactive=False` is the
	            # supported way to make the box display-only.
	            output_phonetic = gr.Textbox(label="Phonetic Transcription", interactive=False)

	    # The submit button follows the text box: disabled while it is blank.
	    input_.change(fn=change_interactive, inputs=[input_], outputs=submit_btn)

	    submit_btn.click(
	        fn=submit_input,
	        inputs=[input_, speaker_id, use_denoise],
	        outputs=[output_audio, output_audio_den, output_phonetic],
	    )

	# NOTE(review): `concurrency_count` is a Gradio 3.x argument and is rejected
	# by Gradio 4+; confirm the pinned Gradio version before upgrading.
	app.queue(concurrency_count=1, api_open=False)
	app.launch(show_api=False, server_name="0.0.0.0", server_port=7860)