Spaces:

VoiceCloning-be
/

Applio-Full-ZeroGPU

Runtime error

App Files Files Community

Applio-Full-ZeroGPU / core.py

VoiceCloning-be

Update core.py

67db312 verified 5 months ago

raw

history blame

54 kB

	import os
	import sys
	import json
	import argparse
	import subprocess
	from functools import lru_cache
	from distutils.util import strtobool

	# Hugging Face ZeroGPU helper: provides the ``@spaces.GPU`` decorator applied
	# to ``run_train_script``. Without this import the module fails at load time
	# with ``NameError: name 'spaces' is not defined``.
	import spaces

	# Working directory at import time. It is appended to ``sys.path``,
	# presumably so that the project-local ``rvc`` imports below resolve when the
	# program is launched from the repository root.
	now_dir = os.getcwd()
	sys.path.append(now_dir)

	# ``logs`` folder next to this script; the per-model folders used by the
	# preprocess/extract/index steps are created under it.
	current_script_directory = os.path.dirname(os.path.realpath(__file__))
	logs_path = os.path.join(current_script_directory, "logs")

	from rvc.lib.tools.prerequisites_download import prequisites_download_pipeline
	from rvc.train.process.model_blender import model_blender
	from rvc.train.process.model_information import model_information
	from rvc.train.process.extract_small_model import extract_small_model
	from rvc.lib.tools.analyzer import analyze_audio
	from rvc.lib.tools.launch_tensorboard import launch_tensorboard_pipeline
	from rvc.lib.tools.model_download import model_download_pipeline

	# Interpreter running this module; reused as the executable for the helper
	# scripts launched through ``subprocess``.
	python = sys.executable


	# Get TTS Voices -> https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=6A5AA1D4EAFF4E9FB37E23D68491D6F4
	@lru_cache(maxsize=1)  # Cache only one result since the file is static
	def load_voices_data():
	    """Load the TTS voice catalogue shipped with the project.

	    Reads ``rvc/lib/tools/tts_voices.json`` relative to the current working
	    directory and returns the decoded JSON. The result is memoised, so the
	    file is read at most once per process.

	    Returns:
	        The parsed JSON content of ``tts_voices.json``.

	    Raises:
	        FileNotFoundError: If the file cannot be found relative to the
	            current working directory.
	        json.JSONDecodeError: If the file does not contain valid JSON.
	    """
	    voices_file = os.path.join("rvc", "lib", "tools", "tts_voices.json")
	    # JSON is UTF-8 by specification. Do not rely on the platform default
	    # encoding (e.g. cp1252 on Windows), which can fail on non-ASCII names.
	    with open(voices_file, encoding="utf-8") as f:
	        return json.load(f)


	# Voice catalogue, loaded once at import time.
	voices_data = load_voices_data()
	# Unique locales in a stable, sorted order. Converting the set straight to a
	# list gave a different order on every run (string hash randomisation), which
	# made the CLI ``choices`` built from this list non-deterministic.
	locales = sorted({voice["Locale"] for voice in voices_data})


	@lru_cache(maxsize=None)
	def import_voice_converter():
	    """Create the ``VoiceConverter`` on first use and reuse it afterwards.

	    The import is done inside the function, so ``rvc.infer.infer`` is only
	    loaded when a conversion is requested. Because the function takes no
	    arguments, ``lru_cache`` returns the same instance on every later call.
	    """
	    from rvc.infer.infer import VoiceConverter

	    converter = VoiceConverter()
	    return converter


	@lru_cache(maxsize=1)
	def get_config():
	    """Build the project ``Config`` once and return the cached instance.

	    ``rvc.configs.config`` is imported lazily, on the first call only.
	    """
	    from rvc.configs.config import Config

	    configuration = Config()
	    return configuration


	# Names of the post-processing parameters, in the order their values arrive
	# through the positional ``*sliders`` arguments of the inference functions.
	_EFFECT_SLIDER_KEYS = (
	    "reverb_room_size",
	    "reverb_damping",
	    "reverb_wet_level",
	    "reverb_dry_level",
	    "reverb_width",
	    "reverb_freeze_mode",
	    "pitch_shift_semitones",
	    "limiter_threshold",
	    "limiter_release",
	    "gain_db",
	    "distortion_gain",
	    "chorus_rate",
	    "chorus_depth",
	    "chorus_delay",
	    "chorus_feedback",
	    "chorus_mix",
	    "bitcrush_bit_depth",
	    "clipping_threshold",
	    "compressor_threshold",
	    "compressor_ratio",
	    "compressor_attack",
	    "compressor_release",
	    "delay_seconds",
	    "delay_feedback",
	    "delay_mix",
	)


	def _build_effect_params(sliders):
	    """Map positional slider values onto their post-processing parameter names.

	    Missing trailing values default to 0, so an empty ``sliders`` yields all
	    zeros and a short one no longer raises ``IndexError``. Values beyond the
	    known parameters are ignored.
	    """
	    values = list(sliders)[: len(_EFFECT_SLIDER_KEYS)]
	    values += [0] * (len(_EFFECT_SLIDER_KEYS) - len(values))
	    return dict(zip(_EFFECT_SLIDER_KEYS, values))


	# Infer
	def run_infer_script(
	    pitch: int,
	    filter_radius: int,
	    index_rate: float,
	    volume_envelope: int,
	    protect: float,
	    hop_length: int,
	    f0_method: str,
	    input_path: str,
	    output_path: str,
	    pth_path: str,
	    index_path: str,
	    split_audio: bool,
	    f0_autotune: bool,
	    clean_audio: bool,
	    clean_strength: float,
	    export_format: str,
	    upscale_audio: bool,
	    f0_file: str,
	    embedder_model: str,
	    embedder_model_custom: str = None,
	    formant_shifting: bool = False,
	    formant_qfrency: float = 1.0,
	    formant_timbre: float = 1.0,
	    post_process: bool = False,
	    reverb: bool = False,
	    pitch_shift: bool = False,
	    limiter: bool = False,
	    gain: bool = False,
	    distortion: bool = False,
	    chorus: bool = False,
	    bitcrush: bool = False,
	    clipping: bool = False,
	    compressor: bool = False,
	    delay: bool = False,
	    *sliders: list,
	):
	    """Convert a single audio file with the cached voice-conversion pipeline.

	    Every option is forwarded unchanged to ``VoiceConverter.convert_audio``.

	    Args:
	        pitch: Pitch of the audio; higher values result in a higher pitch.
	        filter_radius: Median-filter setting for the extracted pitch values.
	        index_rate: Influence of the index file on the output.
	        volume_envelope: Blending of the output's volume envelope.
	        protect: Protection of consonants and breathing sounds.
	        hop_length: Hop length; only applicable to Crepe pitch extraction.
	        f0_method: Pitch extraction algorithm.
	        input_path: Path to the input audio file.
	        output_path: Path to the output audio file.
	        pth_path: Path to the RVC model file (.pth).
	        index_path: Path to the index file (.index).
	        split_audio: Split the audio into segments before inference.
	        f0_autotune: Apply a light autotune to the inferred audio.
	        clean_audio: Clean the output audio with noise reduction.
	        clean_strength: Intensity of the audio cleaning process.
	        export_format: Output audio format (e.g. ``"WAV"``, ``"MP3"``).
	        upscale_audio: Upscale the input audio before processing.
	        f0_file: Path to an external F0 file, if any.
	        embedder_model: Model used for generating speaker embeddings.
	        embedder_model_custom: Path to a custom embedder model.
	        formant_shifting: Enable formant shifting.
	        formant_qfrency: Frequency control of the formant shifting effect.
	        formant_timbre: Timbre control of the formant shifting effect.
	        post_process: Forwarded unchanged; presumably the master switch for
	            the effects below (confirm against ``VoiceConverter``).
	        reverb, pitch_shift, limiter, gain, distortion, chorus, bitcrush,
	            clipping, compressor, delay: Per-effect switches, forwarded
	            unchanged.
	        *sliders: Up to 25 effect settings, positionally in the order of
	            ``_EFFECT_SLIDER_KEYS``. Missing values default to 0.

	    Returns:
	        A ``(message, path)`` tuple, where ``path`` is ``output_path`` with
	        ``".wav"`` replaced by the lower-cased export format.
	    """
	    infer_pipeline = import_voice_converter()
	    infer_pipeline.convert_audio(
	        pitch=pitch,
	        filter_radius=filter_radius,
	        index_rate=index_rate,
	        volume_envelope=volume_envelope,
	        protect=protect,
	        hop_length=hop_length,
	        f0_method=f0_method,
	        audio_input_path=input_path,
	        audio_output_path=output_path,
	        model_path=pth_path,
	        index_path=index_path,
	        split_audio=split_audio,
	        f0_autotune=f0_autotune,
	        clean_audio=clean_audio,
	        clean_strength=clean_strength,
	        export_format=export_format,
	        upscale_audio=upscale_audio,
	        f0_file=f0_file,
	        embedder_model=embedder_model,
	        embedder_model_custom=embedder_model_custom,
	        formant_shifting=formant_shifting,
	        formant_qfrency=formant_qfrency,
	        formant_timbre=formant_timbre,
	        post_process=post_process,
	        reverb=reverb,
	        pitch_shift=pitch_shift,
	        limiter=limiter,
	        gain=gain,
	        distortion=distortion,
	        chorus=chorus,
	        bitcrush=bitcrush,
	        clipping=clipping,
	        compressor=compressor,
	        delay=delay,
	        sliders=_build_effect_params(sliders),
	    )
	    return f"File {input_path} inferred successfully.", output_path.replace(
	        ".wav", f".{export_format.lower()}"
	    )


	# Batch infer
	def run_batch_infer_script(
	    pitch: int,
	    filter_radius: int,
	    index_rate: float,
	    volume_envelope: int,
	    protect: float,
	    hop_length: int,
	    f0_method: str,
	    input_folder: str,
	    output_folder: str,
	    pth_path: str,
	    index_path: str,
	    split_audio: bool,
	    f0_autotune: bool,
	    clean_audio: bool,
	    clean_strength: float,
	    export_format: str,
	    upscale_audio: bool,
	    f0_file: str,
	    embedder_model: str,
	    embedder_model_custom: str = None,
	    formant_shifting: bool = False,
	    formant_qfrency: float = 1.0,
	    formant_timbre: float = 1.0,
	    post_process: bool = False,
	    reverb: bool = False,
	    pitch_shift: bool = False,
	    limiter: bool = False,
	    gain: bool = False,
	    distortion: bool = False,
	    chorus: bool = False,
	    bitcrush: bool = False,
	    clipping: bool = False,
	    compressor: bool = False,
	    delay: bool = False,
	    *sliders: list,
	):
	    """Convert every audio file in a folder with the cached pipeline.

	    Takes the same options as ``run_infer_script``, except that
	    ``input_folder`` and ``output_folder`` replace the single input and
	    output paths. Everything is forwarded to
	    ``VoiceConverter.convert_audio_batch``, together with the location of
	    the PID file ``assets/infer_pid.txt`` under the working directory.

	    Args:
	        input_folder: Path to the folder containing input audio files.
	        output_folder: Path to the folder for saving output audio files.
	        *sliders: Up to 25 effect settings, positionally in the order of
	            ``_EFFECT_SLIDER_KEYS``. Missing values default to 0.

	    Returns:
	        A success message naming ``input_folder``.

	    Raises:
	        OSError: If ``input_folder`` cannot be listed.
	    """
	    # Only used for the informational message below; the pipeline receives the
	    # folder itself, not this list.
	    audio_files = [
	        f for f in os.listdir(input_folder) if f.endswith((".mp3", ".wav", ".flac"))
	    ]
	    print(f"Detected {len(audio_files)} audio files for inference.")
	    infer_pipeline = import_voice_converter()
	    infer_pipeline.convert_audio_batch(
	        pitch=pitch,
	        filter_radius=filter_radius,
	        index_rate=index_rate,
	        volume_envelope=volume_envelope,
	        protect=protect,
	        hop_length=hop_length,
	        f0_method=f0_method,
	        audio_input_paths=input_folder,
	        audio_output_path=output_folder,
	        model_path=pth_path,
	        index_path=index_path,
	        split_audio=split_audio,
	        f0_autotune=f0_autotune,
	        clean_audio=clean_audio,
	        clean_strength=clean_strength,
	        export_format=export_format,
	        upscale_audio=upscale_audio,
	        f0_file=f0_file,
	        embedder_model=embedder_model,
	        embedder_model_custom=embedder_model_custom,
	        formant_shifting=formant_shifting,
	        formant_qfrency=formant_qfrency,
	        formant_timbre=formant_timbre,
	        pid_file_path=os.path.join(now_dir, "assets", "infer_pid.txt"),
	        post_process=post_process,
	        reverb=reverb,
	        pitch_shift=pitch_shift,
	        limiter=limiter,
	        gain=gain,
	        distortion=distortion,
	        chorus=chorus,
	        bitcrush=bitcrush,
	        clipping=clipping,
	        compressor=compressor,
	        delay=delay,
	        sliders=_build_effect_params(sliders),
	    )

	    return f"Files from {input_folder} inferred successfully."


	# TTS
	def run_tts_script(
	    tts_text: str,
	    tts_voice: str,
	    tts_rate: int,
	    pitch: int,
	    filter_radius: int,
	    index_rate: float,
	    volume_envelope: int,
	    protect: float,
	    hop_length: int,
	    f0_method: str,
	    output_tts_path: str,
	    output_rvc_path: str,
	    pth_path: str,
	    index_path: str,
	    split_audio: bool,
	    f0_autotune: bool,
	    clean_audio: bool,
	    clean_strength: float,
	    export_format: str,
	    upscale_audio: bool,
	    f0_file: str,
	    embedder_model: str,
	    embedder_model_custom: str = None,
	):
	    """Synthesise text with the TTS helper script, then voice-convert the result.

	    Runs ``rvc/lib/tools/tts.py`` in a subprocess to write ``output_tts_path``,
	    then passes that file through ``VoiceConverter.convert_audio`` to produce
	    ``output_rvc_path``. Formant shifting and all post-processing options are
	    passed as ``None``.

	    Returns:
	        A ``(message, path)`` tuple, where ``path`` is ``output_rvc_path`` with
	        ``".wav"`` replaced by the lower-cased export format.
	    """
	    tts_script_path = os.path.join("rvc", "lib", "tools", "tts.py")

	    # Drop the output of a previous synthesis before starting a new one.
	    if os.path.exists(output_tts_path):
	        os.remove(output_tts_path)

	    tts_arguments = (
	        python,
	        tts_script_path,
	        tts_text,
	        tts_voice,
	        tts_rate,
	        output_tts_path,
	    )
	    command_tts = [str(argument) for argument in tts_arguments]
	    subprocess.run(command_tts)

	    converter = import_voice_converter()
	    # Options that the TTS flow leaves unset.
	    unset_options = dict.fromkeys(
	        (
	            "formant_shifting",
	            "formant_qfrency",
	            "formant_timbre",
	            "post_process",
	            "reverb",
	            "pitch_shift",
	            "limiter",
	            "gain",
	            "distortion",
	            "chorus",
	            "bitcrush",
	            "clipping",
	            "compressor",
	            "delay",
	            "sliders",
	        ),
	        None,
	    )
	    converter.convert_audio(
	        pitch=pitch,
	        filter_radius=filter_radius,
	        index_rate=index_rate,
	        volume_envelope=volume_envelope,
	        protect=protect,
	        hop_length=hop_length,
	        f0_method=f0_method,
	        audio_input_path=output_tts_path,
	        audio_output_path=output_rvc_path,
	        model_path=pth_path,
	        index_path=index_path,
	        split_audio=split_audio,
	        f0_autotune=f0_autotune,
	        clean_audio=clean_audio,
	        clean_strength=clean_strength,
	        export_format=export_format,
	        upscale_audio=upscale_audio,
	        f0_file=f0_file,
	        embedder_model=embedder_model,
	        embedder_model_custom=embedder_model_custom,
	        **unset_options,
	    )

	    message = f"Text {tts_text} synthesized successfully."
	    converted_path = output_rvc_path.replace(".wav", f".{export_format.lower()}")
	    return message, converted_path


	# Preprocess
	def run_preprocess_script(
	    model_name: str,
	    dataset_path: str,
	    sample_rate: int,
	    cpu_cores: int,
	    cut_preprocess: bool,
	    process_effects: bool,
	):
	    """Run the dataset preprocessing script for ``model_name`` in a subprocess.

	    Launches ``rvc/train/preprocess/preprocess.py`` with the model's folder
	    under ``logs_path``, followed by the dataset path, sample rate, the ``per``
	    value, CPU core count and the two boolean switches, all converted to
	    strings.

	    Returns:
	        A success message naming the model.
	    """
	    # ``per`` depends on whether the configuration reports half precision.
	    if get_config().is_half:
	        per = 3.0
	    else:
	        per = 3.7

	    preprocess_script_path = os.path.join("rvc", "train", "preprocess", "preprocess.py")
	    script_arguments = (
	        os.path.join(logs_path, model_name),
	        dataset_path,
	        sample_rate,
	        per,
	        cpu_cores,
	        cut_preprocess,
	        process_effects,
	    )
	    command = [python, preprocess_script_path]
	    command.extend(str(argument) for argument in script_arguments)
	    subprocess.run(command)
	    return f"Model {model_name} preprocessed successfully."


	# Extract
	def run_extract_script(
	    model_name: str,
	    rvc_version: str,
	    f0_method: str,
	    pitch_guidance: bool,
	    hop_length: int,
	    cpu_cores: int,
	    gpu: int,
	    sample_rate: int,
	    embedder_model: str,
	    embedder_model_custom: str = None,
	):
	    """Run the feature-extraction script for ``model_name`` in a subprocess.

	    Launches ``rvc/train/extract/extract.py`` with the model's folder under
	    ``logs_path`` followed by the remaining options, each converted to a
	    string.

	    Returns:
	        A success message naming the model.
	    """
	    model_path = os.path.join(logs_path, model_name)
	    extract_script_path = os.path.join("rvc", "train", "extract", "extract.py")

	    # Order matters: the script receives these as positional arguments.
	    script_arguments = (
	        model_path,
	        f0_method,
	        hop_length,
	        cpu_cores,
	        gpu,
	        rvc_version,
	        pitch_guidance,
	        sample_rate,
	        embedder_model,
	        embedder_model_custom,
	    )
	    extract_command = [python, extract_script_path]
	    extract_command.extend(str(argument) for argument in script_arguments)

	    subprocess.run(extract_command)

	    return f"Model {model_name} extracted successfully."


	# Train
	@spaces.GPU(duration=450)
	def run_train_script(
	    model_name: str,
	    rvc_version: str,
	    save_every_epoch: int,
	    save_only_latest: bool,
	    save_every_weights: bool,
	    total_epoch: int,
	    sample_rate: int,
	    batch_size: int,
	    gpu: int,
	    pitch_guidance: bool,
	    overtraining_detector: bool,
	    overtraining_threshold: int,
	    pretrained: bool,
	    sync_graph: bool,
	    index_algorithm: str = "Auto",
	    cache_data_in_gpu: bool = False,
	    custom_pretrained: bool = False,
	    g_pretrained_path: str = None,
	    d_pretrained_path: str = None,
	):
	    """Train a model in a subprocess, then build its index file.

	    Resolves the pretrained generator/discriminator paths, launches
	    ``rvc/train/train.py`` with all options as string arguments, and finally
	    calls ``run_index_script`` for the same model.

	    Args:
	        model_name: Name of the model to be trained.
	        rvc_version: RVC model version (``"v1"`` or ``"v2"``).
	        save_every_epoch: Save the model every this many epochs.
	        save_only_latest: Save only the latest model checkpoint.
	        save_every_weights: Save model weights every epoch.
	        total_epoch: Total number of epochs to train for.
	        sample_rate: Sampling rate of the training data.
	        batch_size: Batch size for training.
	        gpu: GPU device to use for training.
	        pitch_guidance: Enable or disable pitch guidance.
	        overtraining_detector: Enable overtraining detection.
	        overtraining_threshold: Threshold for overtraining detection.
	        pretrained: Use a pretrained model for initialization.
	        sync_graph: Forwarded unchanged to the training script.
	        index_algorithm: Algorithm passed to ``run_index_script``.
	        cache_data_in_gpu: Cache training data in GPU memory.
	        custom_pretrained: Use the explicitly given pretrained paths
	            instead of the ones chosen by ``pretrained_selector``.
	        g_pretrained_path: Path to the pretrained generator model file.
	        d_pretrained_path: Path to the pretrained discriminator model file.

	    Returns:
	        A success message naming the model.

	    Raises:
	        ValueError: If ``custom_pretrained`` is set but either pretrained
	            path is missing.
	    """
	    if not pretrained:
	        # No initialisation weights: the script receives empty paths.
	        g_path, d_path = "", ""
	    elif custom_pretrained:
	        if g_pretrained_path is None or d_pretrained_path is None:
	            raise ValueError(
	                "Please provide the path to the pretrained G and D models."
	            )
	        g_path, d_path = g_pretrained_path, d_pretrained_path
	    else:
	        # Imported lazily: only this branch needs the selector.
	        from rvc.lib.tools.pretrained_selector import pretrained_selector

	        g_path, d_path = pretrained_selector(bool(pitch_guidance))[
	            str(rvc_version)
	        ][int(sample_rate)]

	    train_script_path = os.path.join("rvc", "train", "train.py")
	    # Order matters: the script receives these as positional arguments.
	    script_arguments = (
	        model_name,
	        save_every_epoch,
	        total_epoch,
	        g_path,
	        d_path,
	        rvc_version,
	        gpu,
	        batch_size,
	        sample_rate,
	        pitch_guidance,
	        save_only_latest,
	        save_every_weights,
	        cache_data_in_gpu,
	        overtraining_detector,
	        overtraining_threshold,
	        sync_graph,
	    )
	    command = [python, train_script_path, *map(str, script_arguments)]
	    subprocess.run(command)
	    run_index_script(model_name, rvc_version, index_algorithm)
	    return f"Model {model_name} trained successfully."


	# Index
	def run_index_script(model_name: str, rvc_version: str, index_algorithm: str):
	    """Run the index-extraction script for ``model_name`` in a subprocess.

	    Launches ``rvc/train/process/extract_index.py`` with the model's folder
	    under ``logs_path``, the RVC version and the index algorithm.

	    Returns:
	        A success message naming the model.
	    """
	    model_logs_dir = os.path.join(logs_path, model_name)
	    index_script_path = os.path.join("rvc", "train", "process", "extract_index.py")

	    index_command = [python, index_script_path]
	    index_command += [model_logs_dir, rvc_version, index_algorithm]
	    subprocess.run(index_command)
	    return f"Index file for {model_name} generated successfully."


	# Model extract
	def run_model_extract_script(
	    pth_path: str,
	    model_name: str,
	    sample_rate: int,
	    pitch_guidance: bool,
	    rvc_version: str,
	    epoch: int,
	    step: int,
	):
	    """Call ``extract_small_model`` with the given arguments, in order.

	    Returns:
	        A success message naming the model.
	    """
	    extraction_arguments = (
	        pth_path,
	        model_name,
	        sample_rate,
	        pitch_guidance,
	        rvc_version,
	        epoch,
	        step,
	    )
	    extract_small_model(*extraction_arguments)
	    return f"Model {model_name} extracted successfully."


	# Model information
	def run_model_information_script(pth_path: str):
	    """Print and return the information reported for the model at ``pth_path``.

	    ``model_information`` is called once and its result reused. The previous
	    version called it twice, once for printing and once for the return value.

	    Returns:
	        Whatever ``model_information(pth_path)`` returns.
	    """
	    information = model_information(pth_path)
	    print(information)
	    return information


	# Model blender
	def run_model_blender_script(
	    model_name: str, pth_path_1: str, pth_path_2: str, ratio: float
	):
	    """Blend two models via ``model_blender`` and return its two results.

	    Returns:
	        The ``(message, model_blended)`` pair produced by ``model_blender``.
	    """
	    blend_result = model_blender(model_name, pth_path_1, pth_path_2, ratio)
	    # Unpacking keeps the two-element contract explicit.
	    status_message, blended_model = blend_result
	    return status_message, blended_model


	# Tensorboard
	def run_tensorboard_script():
	    """Start TensorBoard by delegating to ``launch_tensorboard_pipeline``.

	    Returns nothing; the pipeline's own return value is discarded.
	    """
	    launch_tensorboard_pipeline()
	    return None


	# Download
	def run_download_script(model_link: str):
	    """Download a model from ``model_link`` via ``model_download_pipeline``.

	    Returns:
	        A fixed success message.
	    """
	    model_download_pipeline(model_link)
	    # Plain literal: the original f-string had no placeholders.
	    return "Model downloaded successfully."


	# Prerequisites
	def run_prerequisites_script(
	    pretraineds_v1: bool, pretraineds_v2: bool, models: bool, exe: bool
	):
	    """Forward the four download switches to ``prequisites_download_pipeline``.

	    Returns:
	        A fixed success message.
	    """
	    download_switches = (pretraineds_v1, pretraineds_v2, models, exe)
	    prequisites_download_pipeline(*download_switches)
	    return "Prerequisites installed successfully."


	# Audio analyzer
	def run_audio_analyzer_script(
	    input_path: str, save_plot_path: str = "logs/audio_analysis.png"
	):
	    """Analyse an audio file with ``analyze_audio`` and report the outcome.

	    Prints the audio info and the plot location on one line, then returns
	    both values.

	    Returns:
	        The ``(audio_info, plot_path)`` pair produced by ``analyze_audio``.
	    """
	    audio_info, plot_path = analyze_audio(input_path, save_plot_path)
	    info_text = f"Audio info of {input_path}: {audio_info}"
	    saved_text = (
	        f"Audio file {input_path} analyzed successfully. Plot saved at: {plot_path}"
	    )
	    print(info_text, saved_text)
	    return audio_info, plot_path


	def run_model_author_script(model_author: str):
	    """Store ``model_author`` in ``assets/config.json`` under the working directory.

	    The file is read, its ``"model_author"`` key is set, and the whole
	    document is written back with an indent of 4.

	    Returns:
	        A confirmation message, which is also printed.

	    Raises:
	        FileNotFoundError: If the config file does not exist.
	        json.JSONDecodeError: If the config file is not valid JSON.
	    """
	    config_path = os.path.join(now_dir, "assets", "config.json")

	    # Explicit UTF-8 so reading and writing do not depend on the platform
	    # default encoding.
	    with open(config_path, "r", encoding="utf-8") as f:
	        config = json.load(f)

	    config["model_author"] = model_author

	    with open(config_path, "w", encoding="utf-8") as f:
	        json.dump(config, f, indent=4)

	    message = f"Model author set to {model_author}."
	    print(message)
	    return message


	# API
	def run_api_script(ip: str, port: int):
	    """Serve ``api:app`` with uvicorn on the given host and port.

	    On Windows the executable ``env/Scripts/uvicorn.exe`` is used; elsewhere
	    ``uvicorn`` is resolved from ``PATH``. The call blocks until the server
	    process exits.

	    Args:
	        ip: Host address passed to ``--host``.
	        port: Port passed to ``--port``.
	    """
	    command = [
	        "env/Scripts/uvicorn.exe" if os.name == "nt" else "uvicorn",
	        "api:app",
	        "--host",
	        str(ip),
	        "--port",
	        # Arguments must be strings: an int port made ``subprocess.run`` raise
	        # ``TypeError`` before the server was ever started.
	        str(port),
	    ]
	    subprocess.run(command)


	# Parse arguments
	def parse_arguments():
	parser = argparse.ArgumentParser(
	description="Run the main.py script with specific parameters."
	)
	subparsers = parser.add_subparsers(
	title="subcommands", dest="mode", help="Choose a mode"
	)

	# Parser for 'infer' mode
	infer_parser = subparsers.add_parser("infer", help="Run inference")
	pitch_description = (
	"Set the pitch of the audio. Higher values result in a higher pitch."
	)
	infer_parser.add_argument(
	"--pitch",
	type=int,
	help=pitch_description,
	choices=range(-24, 25),
	default=0,
	)
	filter_radius_description = "Apply median filtering to the extracted pitch values if this value is greater than or equal to three. This can help reduce breathiness in the output audio."
	infer_parser.add_argument(
	"--filter_radius",
	type=int,
	help=filter_radius_description,
	choices=range(11),
	default=3,
	)
	index_rate_description = "Control the influence of the index file on the output. Higher values mean stronger influence. Lower values can help reduce artifacts but may result in less accurate voice cloning."
	infer_parser.add_argument(
	"--index_rate",
	type=float,
	help=index_rate_description,
	choices=[(i / 10) for i in range(11)],
	default=0.3,
	)
	volume_envelope_description = "Control the blending of the output's volume envelope. A value of 1 means the output envelope is fully used."
	infer_parser.add_argument(
	"--volume_envelope",
	type=float,
	help=volume_envelope_description,
	choices=[(i / 10) for i in range(11)],
	default=1,
	)
	protect_description = "Protect consonants and breathing sounds from artifacts. A value of 0.5 offers the strongest protection, while lower values may reduce the protection level but potentially mitigate the indexing effect."
	infer_parser.add_argument(
	"--protect",
	type=float,
	help=protect_description,
	choices=[(i / 10) for i in range(6)],
	default=0.33,
	)
	hop_length_description = "Only applicable for the Crepe pitch extraction method. Determines the time it takes for the system to react to a significant pitch change. Smaller values require more processing time but can lead to better pitch accuracy."
	infer_parser.add_argument(
	"--hop_length",
	type=int,
	help=hop_length_description,
	choices=range(1, 513),
	default=128,
	)
	f0_method_description = "Choose the pitch extraction algorithm for the conversion. 'rmvpe' is the default and generally recommended."
	infer_parser.add_argument(
	"--f0_method",
	type=str,
	help=f0_method_description,
	choices=[
	"crepe",
	"crepe-tiny",
	"rmvpe",
	"fcpe",
	"hybrid[crepe+rmvpe]",
	"hybrid[crepe+fcpe]",
	"hybrid[rmvpe+fcpe]",
	"hybrid[crepe+rmvpe+fcpe]",
	],
	default="rmvpe",
	)
	infer_parser.add_argument(
	"--input_path",
	type=str,
	help="Full path to the input audio file.",
	required=True,
	)
	infer_parser.add_argument(
	"--output_path",
	type=str,
	help="Full path to the output audio file.",
	required=True,
	)
	pth_path_description = "Full path to the RVC model file (.pth)."
	infer_parser.add_argument(
	"--pth_path", type=str, help=pth_path_description, required=True
	)
	index_path_description = "Full path to the index file (.index)."
	infer_parser.add_argument(
	"--index_path", type=str, help=index_path_description, required=True
	)
	split_audio_description = "Split the audio into smaller segments before inference. This can improve the quality of the output for longer audio files."
	infer_parser.add_argument(
	"--split_audio",
	type=lambda x: bool(strtobool(x)),
	choices=[True, False],
	help=split_audio_description,
	default=False,
	)
	f0_autotune_description = "Apply a light autotune to the inferred audio. Particularly useful for singing voice conversions."
	infer_parser.add_argument(
	"--f0_autotune",
	type=lambda x: bool(strtobool(x)),
	choices=[True, False],
	help=f0_autotune_description,
	default=False,
	)
	clean_audio_description = "Clean the output audio using noise reduction algorithms. Recommended for speech conversions."
	infer_parser.add_argument(
	"--clean_audio",
	type=lambda x: bool(strtobool(x)),
	choices=[True, False],
	help=clean_audio_description,
	default=False,
	)
	clean_strength_description = "Adjust the intensity of the audio cleaning process. Higher values result in stronger cleaning, but may lead to a more compressed sound."
	infer_parser.add_argument(
	"--clean_strength",
	type=float,
	help=clean_strength_description,
	choices=[(i / 10) for i in range(11)],
	default=0.7,
	)
	export_format_description = "Select the desired output audio format."
	infer_parser.add_argument(
	"--export_format",
	type=str,
	help=export_format_description,
	choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
	default="WAV",
	)
	embedder_model_description = (
	"Choose the model used for generating speaker embeddings."
	)
	infer_parser.add_argument(
	"--embedder_model",
	type=str,
	help=embedder_model_description,
	choices=[
	"contentvec",
	"chinese-hubert-base",
	"japanese-hubert-base",
	"korean-hubert-base",
	"custom",
	],
	default="contentvec",
	)
	embedder_model_custom_description = "Specify the path to a custom model for speaker embedding. Only applicable if 'embedder_model' is set to 'custom'."
	infer_parser.add_argument(
	"--embedder_model_custom",
	type=str,
	help=embedder_model_custom_description,
	default=None,
	)
	upscale_audio_description = "Upscale the input audio to a higher quality before processing. This can improve the overall quality of the output, especially for low-quality input audio."
	infer_parser.add_argument(
	"--upscale_audio",
	type=lambda x: bool(strtobool(x)),
	choices=[True, False],
	help=upscale_audio_description,
	default=False,
	)
	f0_file_description = "Full path to an external F0 file (.f0). This allows you to use pre-computed pitch values for the input audio."
	infer_parser.add_argument(
	"--f0_file",
	type=str,
	help=f0_file_description,
	default=None,
	)
	formant_shifting_description = "Apply formant shifting to the input audio. This can help adjust the timbre of the voice."
	infer_parser.add_argument(
	"--formant_shifting",
	type=lambda x: bool(strtobool(x)),
	choices=[True, False],
	help=formant_shifting_description,
	default=False,
	required=False,
	)
	formant_qfrency_description = "Control the frequency of the formant shifting effect. Higher values result in a more pronounced effect."
	infer_parser.add_argument(
	"--formant_qfrency",
	type=float,
	help=formant_qfrency_description,
	default=1.0,
	required=False,
	)
	formant_timbre_description = "Control the timbre of the formant shifting effect. Higher values result in a more pronounced effect."
	infer_parser.add_argument(
	"--formant_timbre",
	type=float,
	help=formant_timbre_description,
	default=1.0,
	required=False,
	)

	# Parser for 'batch_infer' mode
	batch_infer_parser = subparsers.add_parser(
	"batch_infer",
	help="Run batch inference",
	)
	batch_infer_parser.add_argument(
	"--pitch",
	type=int,
	help=pitch_description,
	choices=range(-24, 25),
	default=0,
	)
	batch_infer_parser.add_argument(
	"--filter_radius",
	type=int,
	help=filter_radius_description,
	choices=range(11),
	default=3,
	)
	batch_infer_parser.add_argument(
	"--index_rate",
	type=float,
	help=index_rate_description,
	choices=[(i / 10) for i in range(11)],
	default=0.3,
	)
	batch_infer_parser.add_argument(
	"--volume_envelope",
	type=float,
	help=volume_envelope_description,
	choices=[(i / 10) for i in range(11)],
	default=1,
	)
	batch_infer_parser.add_argument(
	"--protect",
	type=float,
	help=protect_description,
	choices=[(i / 10) for i in range(6)],
	default=0.33,
	)
	batch_infer_parser.add_argument(
	"--hop_length",
	type=int,
	help=hop_length_description,
	choices=range(1, 513),
	default=128,
	)
	batch_infer_parser.add_argument(
	"--f0_method",
	type=str,
	help=f0_method_description,
	choices=[
	"crepe",
	"crepe-tiny",
	"rmvpe",
	"fcpe",
	"hybrid[crepe+rmvpe]",
	"hybrid[crepe+fcpe]",
	"hybrid[rmvpe+fcpe]",
	"hybrid[crepe+rmvpe+fcpe]",
	],
	default="rmvpe",
	)
	batch_infer_parser.add_argument(
	"--input_folder",
	type=str,
	help="Path to the folder containing input audio files.",
	required=True,
	)
	batch_infer_parser.add_argument(
	"--output_folder",
	type=str,
	help="Path to the folder for saving output audio files.",
	required=True,
	)
	batch_infer_parser.add_argument(
	"--pth_path", type=str, help=pth_path_description, required=True
	)
	batch_infer_parser.add_argument(
	"--index_path", type=str, help=index_path_description, required=True
	)
	batch_infer_parser.add_argument(
	"--split_audio",
	type=lambda x: bool(strtobool(x)),
	choices=[True, False],
	help=split_audio_description,
	default=False,
	)
	batch_infer_parser.add_argument(
	"--f0_autotune",
	type=lambda x: bool(strtobool(x)),
	choices=[True, False],
	help=f0_autotune_description,
	default=False,
	)
	batch_infer_parser.add_argument(
	"--clean_audio",
	type=lambda x: bool(strtobool(x)),
	choices=[True, False],
	help=clean_audio_description,
	default=False,
	)
	batch_infer_parser.add_argument(
	"--clean_strength",
	type=float,
	help=clean_strength_description,
	choices=[(i / 10) for i in range(11)],
	default=0.7,
	)
	batch_infer_parser.add_argument(
	"--export_format",
	type=str,
	help=export_format_description,
	choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
	default="WAV",
	)
	batch_infer_parser.add_argument(
	"--embedder_model",
	type=str,
	help=embedder_model_description,
	choices=[
	"contentvec",
	"chinese-hubert-base",
	"japanese-hubert-base",
	"korean-hubert-base",
	"custom",
	],
	default="contentvec",
	)
	batch_infer_parser.add_argument(
	"--embedder_model_custom",
	type=str,
	help=embedder_model_custom_description,
	default=None,
	)
	batch_infer_parser.add_argument(
	"--upscale_audio",
	type=lambda x: bool(strtobool(x)),
	choices=[True, False],
	help=upscale_audio_description,
	default=False,
	)
	batch_infer_parser.add_argument(
	"--f0_file",
	type=str,
	help=f0_file_description,
	default=None,
	)
	batch_infer_parser.add_argument(
	"--formant_shifting",
	type=lambda x: bool(strtobool(x)),
	choices=[True, False],
	help=formant_shifting_description,
	default=False,
	required=False,
	)
	batch_infer_parser.add_argument(
	"--formant_qfrency",
	type=float,
	help=formant_qfrency_description,
	default=1.0,
	required=False,
	)
	batch_infer_parser.add_argument(
	"--formant_timbre",
	type=float,
	help=formant_timbre_description,
	default=1.0,
	required=False,
	)

	# Parser for 'tts' mode
	tts_parser = subparsers.add_parser("tts", help="Run TTS inference")
	tts_parser.add_argument(
	"--tts_text", type=str, help="Text to be synthesized", required=True
	)
	tts_parser.add_argument(
	"--tts_voice",
	type=str,
	help="Voice to be used for TTS synthesis.",
	choices=locales,
	required=True,
	)
	tts_parser.add_argument(
	"--tts_rate",
	type=int,
	help="Control the speaking rate of the TTS. Values range from -100 (slower) to 100 (faster).",
	choices=range(-100, 101),
	default=0,
	)
	tts_parser.add_argument(
	"--pitch",
	type=int,
	help=pitch_description,
	choices=range(-24, 25),
	default=0,
	)
	tts_parser.add_argument(
	"--filter_radius",
	type=int,
	help=filter_radius_description,
	choices=range(11),
	default=3,
	)
	tts_parser.add_argument(
	"--index_rate",
	type=float,
	help=index_rate_description,
	choices=[(i / 10) for i in range(11)],
	default=0.3,
	)
	tts_parser.add_argument(
	"--volume_envelope",
	type=float,
	help=volume_envelope_description,
	choices=[(i / 10) for i in range(11)],
	default=1,
	)
	tts_parser.add_argument(
	"--protect",
	type=float,
	help=protect_description,
	choices=[(i / 10) for i in range(6)],
	default=0.33,
	)
	tts_parser.add_argument(
	"--hop_length",
	type=int,
	help=hop_length_description,
	choices=range(1, 513),
	default=128,
	)
	tts_parser.add_argument(
	"--f0_method",
	type=str,
	help=f0_method_description,
	choices=[
	"crepe",
	"crepe-tiny",
	"rmvpe",
	"fcpe",
	"hybrid[crepe+rmvpe]",
	"hybrid[crepe+fcpe]",
	"hybrid[rmvpe+fcpe]",
	"hybrid[crepe+rmvpe+fcpe]",
	],
	default="rmvpe",
	)
	tts_parser.add_argument(
	"--output_tts_path",
	type=str,
	help="Full path to save the synthesized TTS audio.",
	required=True,
	)
	tts_parser.add_argument(
	"--output_rvc_path",
	type=str,
	help="Full path to save the voice-converted audio using the synthesized TTS.",
	required=True,
	)
	tts_parser.add_argument(
	"--pth_path", type=str, help=pth_path_description, required=True
	)
	tts_parser.add_argument(
	"--index_path", type=str, help=index_path_description, required=True
	)
	tts_parser.add_argument(
	"--split_audio",
	type=lambda x: bool(strtobool(x)),
	choices=[True, False],
	help=split_audio_description,
	default=False,
	)
	tts_parser.add_argument(
	"--f0_autotune",
	type=lambda x: bool(strtobool(x)),
	choices=[True, False],
	help=f0_autotune_description,
	default=False,
	)
	tts_parser.add_argument(
	"--clean_audio",
	type=lambda x: bool(strtobool(x)),
	choices=[True, False],
	help=clean_audio_description,
	default=False,
	)
	tts_parser.add_argument(
	"--clean_strength",
	type=float,
	help=clean_strength_description,
	choices=[(i / 10) for i in range(11)],
	default=0.7,
	)
	tts_parser.add_argument(
	"--export_format",
	type=str,
	help=export_format_description,
	choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
	default="WAV",
	)
	tts_parser.add_argument(
	"--embedder_model",
	type=str,
	help=embedder_model_description,
	choices=[
	"contentvec",
	"chinese-hubert-base",
	"japanese-hubert-base",
	"korean-hubert-base",
	"custom",
	],
	default="contentvec",
	)
	tts_parser.add_argument(
	"--embedder_model_custom",
	type=str,
	help=embedder_model_custom_description,
	default=None,
	)
	tts_parser.add_argument(
	"--upscale_audio",
	type=lambda x: bool(strtobool(x)),
	choices=[True, False],
	help=upscale_audio_description,
	default=False,
	)
	tts_parser.add_argument(
	"--f0_file",
	type=str,
	help=f0_file_description,
	default=None,
	)

	# Parser for 'preprocess' mode
	preprocess_parser = subparsers.add_parser(
	"preprocess", help="Preprocess a dataset for training."
	)
	preprocess_parser.add_argument(
	"--model_name", type=str, help="Name of the model to be trained.", required=True
	)
	preprocess_parser.add_argument(
	"--dataset_path", type=str, help="Path to the dataset directory.", required=True
	)
	preprocess_parser.add_argument(
	"--sample_rate",
	type=int,
	help="Target sampling rate for the audio data.",
	choices=[32000, 40000, 48000],
	required=True,
	)
	preprocess_parser.add_argument(
	"--cpu_cores",
	type=int,
	help="Number of CPU cores to use for preprocessing.",
	choices=range(1, 65),
	)
	preprocess_parser.add_argument(
	"--cut_preprocess",
	type=lambda x: bool(strtobool(x)),
	choices=[True, False],
	help="Cut the dataset into smaller segments for faster preprocessing.",
	default=True,
	required=False,
	)
	preprocess_parser.add_argument(
	"--process_effects",
	type=lambda x: bool(strtobool(x)),
	choices=[True, False],
	help="Disable all filters during preprocessing.",
	default=False,
	required=False,
	)

	# Parser for 'extract' mode
	extract_parser = subparsers.add_parser(
	"extract", help="Extract features from a dataset."
	)
	extract_parser.add_argument(
	"--model_name", type=str, help="Name of the model.", required=True
	)
	extract_parser.add_argument(
	"--rvc_version",
	type=str,
	help="Version of the RVC model ('v1' or 'v2').",
	choices=["v1", "v2"],
	default="v2",
	)
	extract_parser.add_argument(
	"--f0_method",
	type=str,
	help="Pitch extraction method to use.",
	choices=[
	"crepe",
	"crepe-tiny",
	"rmvpe",
	],
	default="rmvpe",
	)
	extract_parser.add_argument(
	"--pitch_guidance",
	type=lambda x: bool(strtobool(x)),
	choices=[True, False],
	help="Enable or disable pitch guidance during feature extraction.",
	default=True,
	)
	extract_parser.add_argument(
	"--hop_length",
	type=int,
	help="Hop length for feature extraction. Only applicable for Crepe pitch extraction.",
	choices=range(1, 513),
	default=128,
	)
	extract_parser.add_argument(
	"--cpu_cores",
	type=int,
	help="Number of CPU cores to use for feature extraction (optional).",
	choices=range(1, 65),
	default=None,
	)
	extract_parser.add_argument(
	"--gpu",
	type=int,
	help="GPU device to use for feature extraction (optional).",
	default="-",
	)
	extract_parser.add_argument(
	"--sample_rate",
	type=int,
	help="Target sampling rate for the audio data.",
	choices=[32000, 40000, 48000],
	required=True,
	)
	extract_parser.add_argument(
	"--embedder_model",
	type=str,
	help=embedder_model_description,
	choices=[
	"contentvec",
	"chinese-hubert-base",
	"japanese-hubert-base",
	"korean-hubert-base",
	"custom",
	],
	default="contentvec",
	)
	extract_parser.add_argument(
	"--embedder_model_custom",
	type=str,
	help=embedder_model_custom_description,
	default=None,
	)

	# Parser for 'train' mode
	train_parser = subparsers.add_parser("train", help="Train an RVC model.")
	train_parser.add_argument(
	"--model_name", type=str, help="Name of the model to be trained.", required=True
	)
	train_parser.add_argument(
	"--rvc_version",
	type=str,
	help="Version of the RVC model to train ('v1' or 'v2').",
	choices=["v1", "v2"],
	default="v2",
	)
	train_parser.add_argument(
	"--save_every_epoch",
	type=int,
	help="Save the model every specified number of epochs.",
	choices=range(1, 101),
	required=True,
	)
	train_parser.add_argument(
	"--save_only_latest",
	type=lambda x: bool(strtobool(x)),
	choices=[True, False],
	help="Save only the latest model checkpoint.",
	default=False,
	)
	train_parser.add_argument(
	"--save_every_weights",
	type=lambda x: bool(strtobool(x)),
	choices=[True, False],
	help="Save model weights every epoch.",
	default=True,
	)
	train_parser.add_argument(
	"--total_epoch",
	type=int,
	help="Total number of epochs to train for.",
	choices=range(1, 10001),
	default=1000,
	)
	train_parser.add_argument(
	"--sample_rate",
	type=int,
	help="Sampling rate of the training data.",
	choices=[32000, 40000, 48000],
	required=True,
	)
	train_parser.add_argument(
	"--batch_size",
	type=int,
	help="Batch size for training.",
	choices=range(1, 51),
	default=8,
	)
	train_parser.add_argument(
	"--gpu",
	type=str,
	help="GPU device to use for training (e.g., '0').",
	default="0",
	)
	train_parser.add_argument(
	"--pitch_guidance",
	type=lambda x: bool(strtobool(x)),
	choices=[True, False],
	help="Enable or disable pitch guidance during training.",
	default=True,
	)
	train_parser.add_argument(
	"--pretrained",
	type=lambda x: bool(strtobool(x)),
	choices=[True, False],
	help="Use a pretrained model for initialization.",
	default=True,
	)
	train_parser.add_argument(
	"--custom_pretrained",
	type=lambda x: bool(strtobool(x)),
	choices=[True, False],
	help="Use a custom pretrained model.",
	default=False,
	)
	train_parser.add_argument(
	"--g_pretrained_path",
	type=str,
	nargs="?",
	default=None,
	help="Path to the pretrained generator model file.",
	)
	train_parser.add_argument(
	"--d_pretrained_path",
	type=str,
	nargs="?",
	default=None,
	help="Path to the pretrained discriminator model file.",
	)
	train_parser.add_argument(
	"--overtraining_detector",
	type=lambda x: bool(strtobool(x)),
	choices=[True, False],
	help="Enable overtraining detection.",
	default=False,
	)
	train_parser.add_argument(
	"--overtraining_threshold",
	type=int,
	help="Threshold for overtraining detection.",
	choices=range(1, 101),
	default=50,
	)
	train_parser.add_argument(
	"--sync_graph",
	type=lambda x: bool(strtobool(x)),
	choices=[True, False],
	help="Enable graph synchronization for distributed training.",
	default=False,
	)
	train_parser.add_argument(
	"--cache_data_in_gpu",
	type=lambda x: bool(strtobool(x)),
	choices=[True, False],
	help="Cache training data in GPU memory.",
	default=False,
	)
	train_parser.add_argument(
	"--index_algorithm",
	type=str,
	choices=["Auto", "Faiss", "KMeans"],
	help="Choose the method for generating the index file.",
	default="Auto",
	required=False,
	)

	# Parser for 'index' mode
	index_parser = subparsers.add_parser(
	"index", help="Generate an index file for an RVC model."
	)
	index_parser.add_argument(
	"--model_name", type=str, help="Name of the model.", required=True
	)
	index_parser.add_argument(
	"--rvc_version",
	type=str,
	help="Version of the RVC model ('v1' or 'v2').",
	choices=["v1", "v2"],
	default="v2",
	)
	index_parser.add_argument(
	"--index_algorithm",
	type=str,
	choices=["Auto", "Faiss", "KMeans"],
	help="Choose the method for generating the index file.",
	default="Auto",
	required=False,
	)

	# Parser for 'model_extract' mode
	model_extract_parser = subparsers.add_parser(
	"model_extract", help="Extract a specific epoch from a trained model."
	)
	model_extract_parser.add_argument(
	"--pth_path", type=str, help="Path to the main .pth model file.", required=True
	)
	model_extract_parser.add_argument(
	"--model_name", type=str, help="Name of the model.", required=True
	)
	model_extract_parser.add_argument(
	"--sample_rate",
	type=int,
	help="Sampling rate of the extracted model.",
	choices=[32000, 40000, 48000],
	required=True,
	)
	model_extract_parser.add_argument(
	"--pitch_guidance",
	type=lambda x: bool(strtobool(x)),
	choices=[True, False],
	help="Enable or disable pitch guidance for the extracted model.",
	required=True,
	)
	model_extract_parser.add_argument(
	"--rvc_version",
	type=str,
	help="Version of the extracted RVC model ('v1' or 'v2').",
	choices=["v1", "v2"],
	default="v2",
	)
	model_extract_parser.add_argument(
	"--epoch",
	type=int,
	help="Epoch number to extract from the model.",
	choices=range(1, 10001),
	required=True,
	)
	model_extract_parser.add_argument(
	"--step",
	type=str,
	help="Step number to extract from the model (optional).",
	required=False,
	)

	# Parser for 'model_information' mode
	model_information_parser = subparsers.add_parser(
	"model_information", help="Display information about a trained model."
	)
	model_information_parser.add_argument(
	"--pth_path", type=str, help="Path to the .pth model file.", required=True
	)

	# Parser for 'model_blender' mode
	model_blender_parser = subparsers.add_parser(
	"model_blender", help="Fuse two RVC models together."
	)
	model_blender_parser.add_argument(
	"--model_name", type=str, help="Name of the new fused model.", required=True
	)
	model_blender_parser.add_argument(
	"--pth_path_1",
	type=str,
	help="Path to the first .pth model file.",
	required=True,
	)
	model_blender_parser.add_argument(
	"--pth_path_2",
	type=str,
	help="Path to the second .pth model file.",
	required=True,
	)
	model_blender_parser.add_argument(
	"--ratio",
	type=float,
	help="Ratio for blending the two models (0.0 to 1.0).",
	choices=[(i / 10) for i in range(11)],
	default=0.5,
	)

	# Parser for 'tensorboard' mode
	subparsers.add_parser(
	"tensorboard", help="Launch TensorBoard for monitoring training progress."
	)

	# Parser for 'download' mode
	download_parser = subparsers.add_parser(
	"download", help="Download a model from a provided link."
	)
	download_parser.add_argument(
	"--model_link", type=str, help="Direct link to the model file.", required=True
	)

	# Parser for 'prerequisites' mode
	prerequisites_parser = subparsers.add_parser(
	"prerequisites", help="Install prerequisites for RVC."
	)
	prerequisites_parser.add_argument(
	"--pretraineds_v1",
	type=lambda x: bool(strtobool(x)),
	choices=[True, False],
	default=True,
	help="Download pretrained models for RVC v1.",
	)
	prerequisites_parser.add_argument(
	"--pretraineds_v2",
	type=lambda x: bool(strtobool(x)),
	choices=[True, False],
	default=True,
	help="Download pretrained models for RVC v2.",
	)
	prerequisites_parser.add_argument(
	"--models",
	type=lambda x: bool(strtobool(x)),
	choices=[True, False],
	default=True,
	help="Download additional models.",
	)
	prerequisites_parser.add_argument(
	"--exe",
	type=lambda x: bool(strtobool(x)),
	choices=[True, False],
	default=True,
	help="Download required executables.",
	)

	# Parser for 'audio_analyzer' mode
	audio_analyzer = subparsers.add_parser(
	"audio_analyzer", help="Analyze an audio file."
	)
	audio_analyzer.add_argument(
	"--input_path", type=str, help="Path to the input audio file.", required=True
	)

	# Parser for 'api' mode
	api_parser = subparsers.add_parser("api", help="Start the RVC API server.")
	api_parser.add_argument(
	"--host", type=str, help="Host address for the API server.", default="127.0.0.1"
	)
	api_parser.add_argument(
	"--port", type=int, help="Port for the API server.", default=8000
	)

	return parser.parse_args()


	def main():
	    """Command-line entry point: parse the arguments and run the chosen mode.

	    When the script is started without any arguments, a usage hint is
	    printed and the process exits with status 1. Otherwise the parsed
	    ``mode`` selects which ``run_*_script`` function is called, and the
	    parsed options are forwarded to it as keyword arguments. A mode that
	    matches none of the branches below does nothing.

	    Raises:
	        SystemExit: With status 1 when no arguments were supplied, or when
	            the selected function raised an exception (the error message
	            and traceback are printed before exiting).
	    """
	    if len(sys.argv) == 1:
	        print("Please run the script with '-h' for more information.")
	        sys.exit(1)

	    args = parse_arguments()

	    try:
	        if args.mode == "infer":
	            run_infer_script(
	                pitch=args.pitch,
	                filter_radius=args.filter_radius,
	                index_rate=args.index_rate,
	                volume_envelope=args.volume_envelope,
	                protect=args.protect,
	                hop_length=args.hop_length,
	                f0_method=args.f0_method,
	                input_path=args.input_path,
	                output_path=args.output_path,
	                pth_path=args.pth_path,
	                index_path=args.index_path,
	                split_audio=args.split_audio,
	                f0_autotune=args.f0_autotune,
	                clean_audio=args.clean_audio,
	                clean_strength=args.clean_strength,
	                export_format=args.export_format,
	                embedder_model=args.embedder_model,
	                embedder_model_custom=args.embedder_model_custom,
	                upscale_audio=args.upscale_audio,
	                f0_file=args.f0_file,
	            )
	        elif args.mode == "batch_infer":
	            run_batch_infer_script(
	                pitch=args.pitch,
	                filter_radius=args.filter_radius,
	                index_rate=args.index_rate,
	                volume_envelope=args.volume_envelope,
	                protect=args.protect,
	                hop_length=args.hop_length,
	                f0_method=args.f0_method,
	                input_folder=args.input_folder,
	                output_folder=args.output_folder,
	                pth_path=args.pth_path,
	                index_path=args.index_path,
	                split_audio=args.split_audio,
	                f0_autotune=args.f0_autotune,
	                clean_audio=args.clean_audio,
	                clean_strength=args.clean_strength,
	                export_format=args.export_format,
	                embedder_model=args.embedder_model,
	                embedder_model_custom=args.embedder_model_custom,
	                upscale_audio=args.upscale_audio,
	                f0_file=args.f0_file,
	            )
	        elif args.mode == "tts":
	            run_tts_script(
	                tts_text=args.tts_text,
	                tts_voice=args.tts_voice,
	                tts_rate=args.tts_rate,
	                pitch=args.pitch,
	                filter_radius=args.filter_radius,
	                index_rate=args.index_rate,
	                volume_envelope=args.volume_envelope,
	                protect=args.protect,
	                hop_length=args.hop_length,
	                f0_method=args.f0_method,
	                input_path=args.input_path,
	                output_path=args.output_path,
	                pth_path=args.pth_path,
	                index_path=args.index_path,
	                split_audio=args.split_audio,
	                f0_autotune=args.f0_autotune,
	                clean_audio=args.clean_audio,
	                clean_strength=args.clean_strength,
	                export_format=args.export_format,
	                embedder_model=args.embedder_model,
	                embedder_model_custom=args.embedder_model_custom,
	                upscale_audio=args.upscale_audio,
	                f0_file=args.f0_file,
	            )
	        elif args.mode == "preprocess":
	            run_preprocess_script(
	                model_name=args.model_name,
	                dataset_path=args.dataset_path,
	                sample_rate=args.sample_rate,
	                cpu_cores=args.cpu_cores,
	                cut_preprocess=args.cut_preprocess,
	                process_effects=args.process_effects,
	            )
	        elif args.mode == "extract":
	            run_extract_script(
	                model_name=args.model_name,
	                rvc_version=args.rvc_version,
	                f0_method=args.f0_method,
	                pitch_guidance=args.pitch_guidance,
	                hop_length=args.hop_length,
	                cpu_cores=args.cpu_cores,
	                gpu=args.gpu,
	                sample_rate=args.sample_rate,
	                embedder_model=args.embedder_model,
	                embedder_model_custom=args.embedder_model_custom,
	            )
	        elif args.mode == "train":
	            run_train_script(
	                model_name=args.model_name,
	                rvc_version=args.rvc_version,
	                save_every_epoch=args.save_every_epoch,
	                save_only_latest=args.save_only_latest,
	                save_every_weights=args.save_every_weights,
	                total_epoch=args.total_epoch,
	                sample_rate=args.sample_rate,
	                batch_size=args.batch_size,
	                gpu=args.gpu,
	                pitch_guidance=args.pitch_guidance,
	                overtraining_detector=args.overtraining_detector,
	                overtraining_threshold=args.overtraining_threshold,
	                pretrained=args.pretrained,
	                custom_pretrained=args.custom_pretrained,
	                sync_graph=args.sync_graph,
	                index_algorithm=args.index_algorithm,
	                cache_data_in_gpu=args.cache_data_in_gpu,
	                g_pretrained_path=args.g_pretrained_path,
	                d_pretrained_path=args.d_pretrained_path,
	            )
	        elif args.mode == "index":
	            run_index_script(
	                model_name=args.model_name,
	                rvc_version=args.rvc_version,
	                index_algorithm=args.index_algorithm,
	            )
	        elif args.mode == "model_extract":
	            run_model_extract_script(
	                pth_path=args.pth_path,
	                model_name=args.model_name,
	                sample_rate=args.sample_rate,
	                pitch_guidance=args.pitch_guidance,
	                rvc_version=args.rvc_version,
	                epoch=args.epoch,
	                step=args.step,
	            )
	        elif args.mode == "model_information":
	            run_model_information_script(
	                pth_path=args.pth_path,
	            )
	        elif args.mode == "model_blender":
	            run_model_blender_script(
	                model_name=args.model_name,
	                pth_path_1=args.pth_path_1,
	                pth_path_2=args.pth_path_2,
	                ratio=args.ratio,
	            )
	        elif args.mode == "tensorboard":
	            run_tensorboard_script()
	        elif args.mode == "download":
	            run_download_script(
	                model_link=args.model_link,
	            )
	        elif args.mode == "prerequisites":
	            run_prerequisites_script(
	                pretraineds_v1=args.pretraineds_v1,
	                pretraineds_v2=args.pretraineds_v2,
	                models=args.models,
	                exe=args.exe,
	            )
	        elif args.mode == "audio_analyzer":
	            run_audio_analyzer_script(
	                input_path=args.input_path,
	            )
	        elif args.mode == "api":
	            # The CLI option is called --host, but the function takes ``ip``.
	            run_api_script(
	                ip=args.host,
	                port=args.port,
	            )
	    except Exception as error:
	        print(f"An error occurred during execution: {error}")

	        import traceback

	        traceback.print_exc()
	        # Signal the failure to the caller: without a non-zero status,
	        # shells and CI pipelines would treat a failed run as a success.
	        sys.exit(1)


	# Run the command-line interface only when this file is executed as a
	# script; importing the module does not call main().
	if __name__ == "__main__":
	main()