ecker
/

tortoise-tts-models

Model card Files Files and versions Community

tortoise-tts-models / finetunes /james-sunderland /dataset /train.yaml

ecker

added: SH2 James dataset

a669b9a over 1 year ago

raw

history blame

4.38 kB

	# Run identity and trainer selection.
	name: james-finetune
	model: extensibletrainer
	scale: 1
	start_step: 0

	# Runtime toggles.
	# gpu_ids is redundant here: the way the training script is launched sets it.
	gpu_ids: [0]
	checkpointing_enabled: true
	fp16: false

	# Logging toggles (presumably Weights & Biases and TensorBoard; confirm
	# against the trainer).
	use_tb_logger: true
	wandb: false

	# Dataset definitions for the two phases, `train` and `val`.
	# Both splits use identical loader settings apart from name, worker count,
	# batch size and phase.
	datasets:
	  train:
	    name: james-train
	    n_workers: 2
	    batch_size: 32
	    mode: paired_voice_audio
	    path: ./training/james/train.txt
	    fetcher_mode: ['lj']
	    phase: train
	    # If measured in samples at sample_rate 22050, this is ~11.6 seconds
	    # (TODO confirm the unit).
	    max_wav_length: 255995
	    max_text_length: 200
	    sample_rate: 22050
	    load_conditioning: true
	    num_conditioning_candidates: 2
	    conditioning_length: 44000
	    use_bpe_tokenizer: true
	    tokenizer_vocab: ./models/tortoise/bpe_lowercase_asr_256.json
	    load_aligned_codes: false
	  # I really do not care about validation right now.
	  # NOTE: `path` below is the same file list as the train split, so this is
	  # not held-out data.
	  val:
	    name: james-val
	    n_workers: 1
	    batch_size: 1
	    mode: paired_voice_audio
	    path: ./training/james/train.txt
	    fetcher_mode: ['lj']
	    phase: val
	    max_wav_length: 255995
	    max_text_length: 200
	    sample_rate: 22050
	    load_conditioning: true
	    num_conditioning_candidates: 2
	    conditioning_length: 44000
	    use_bpe_tokenizer: true
	    tokenizer_vocab: ./models/tortoise/bpe_lowercase_asr_256.json
	    load_aligned_codes: false

	# Training steps. There is a single step, `gpt_train`, which trains the
	# network named `gpt`.
	steps:
	  gpt_train:
	    training: gpt
	    loss_log_buffer: 500

	    # Generally follows the recipe from the DALLE paper.
	    optimizer: adamw  # this should be adamw_zero if you're using distributed training
	    optimizer_params:
	      lr: !!float 0.0001  # originally: 1e-4
	      weight_decay: !!float 1e-2
	      beta1: 0.9
	      beta2: 0.96
	    # NOTE(review): placed at step level (sibling of optimizer_params), as in
	    # the upstream trainer's example config; confirm against the trainer.
	    clip_grad_eps: 4

	    # Data flow, following the in/out names below:
	    #   wav          -> paired_mel -> paired_mel_codes
	    #   conditioning -> paired_conditioning_mel
	    # paired_fwd_text then feeds these (plus text and lengths) to `gpt` and
	    # emits the two loss tensors consumed under `losses`.
	    injectors:
	      paired_to_mel:
	        type: torch_mel_spectrogram
	        mel_norm_file: ./models/tortoise/clips_mel_norms.pth
	        in: wav
	        out: paired_mel
	      paired_cond_to_mel:
	        type: for_each
	        subtype: torch_mel_spectrogram
	        mel_norm_file: ./models/tortoise/clips_mel_norms.pth
	        in: conditioning
	        out: paired_conditioning_mel
	      to_codes:
	        type: discrete_token
	        in: paired_mel
	        out: paired_mel_codes
	        dvae_config: "./models/tortoise/train_diffusion_vocoder_22k_level.yml"
	      paired_fwd_text:
	        type: generator
	        generator: gpt
	        in: [paired_conditioning_mel, padded_text, text_lengths, paired_mel_codes, wav_lengths]
	        out: [loss_text_ce, loss_mel_ce, logits]
	    # Each loss reads the key produced by paired_fwd_text; the text loss is
	    # weighted at 0.01 against 1 for the mel loss.
	    losses:
	      text_ce:
	        type: direct
	        weight: 0.01
	        key: loss_text_ce
	      mel_ce:
	        type: direct
	        weight: 1
	        key: loss_mel_ce

	# Network definitions. `gpt` is the network referenced by steps.gpt_train
	# and by path.pretrain_model_gpt.
	networks:
	  gpt:
	    type: generator
	    # None of the unified_voice*.py files actually match the tortoise
	    # inference code: versions 4 and 3 have an "alignment_head" (purpose
	    # unclear), and version 2 lacks the types=1 parameter.
	    which_model_G: unified_voice2
	    kwargs:
	      layers: 30  # originally: 8
	      model_dim: 1024  # originally: 512
	      heads: 16  # originally: 8
	      max_text_tokens: 402  # originally: 120
	      max_mel_tokens: 604  # originally: 250
	      max_conditioning_inputs: 2  # originally: 1
	      mel_length_compression: 1024
	      number_text_tokens: 256  # supposed to be 255 for newer unified_voice files
	      number_mel_codes: 8194
	      start_mel_token: 8192
	      stop_mel_token: 8193
	      start_text_token: 255
	      train_solo_embeddings: false  # missing in uv3/4
	      use_mel_codes_as_input: true  # ditto
	      checkpointing: true
	      # types: 1  # this is MISSING, but in my analysis 1 is equivalent to not having it.
	      # only_alignment_head: false  # uv3/4

	# Checkpoint locations: the pretrained weights for the `gpt` network, and a
	# commented-out training state to resume from.
	path:
	  pretrain_model_gpt: './models/tortoise/autoregressive.pth'
	  strict_load: true
	  # resume_state: './training/james-finetune//training_state//50.state'

	# Training schedule.
	train:
	  niter: 500
	  warmup_iter: -1
	  mega_batch_factor: 16
	  val_freq: 500

	  ema_enabled: false  # I really don't think EMA matters

	  # MultiStepLR with four milestones. If gen_lr_steps uses the same unit as
	  # niter, all four fall within the first 33 of the 500 iterations (TODO
	  # confirm). The trailing list is the original milestone schedule.
	  default_lr_scheme: MultiStepLR
	  gen_lr_steps: [9, 18, 25, 33]  # [50000, 100000, 140000, 180000]
	  lr_gamma: 0.5

	# Evaluation settings.
	eval:
	  output_state: gen
	  injectors:
	    # NOTE(review): `generator: generator` names a network that this file's
	    # `networks` section does not define (only `gpt` is defined); confirm
	    # whether this injector is ever exercised.
	    gen_inj_eval:
	      type: generator
	      generator: generator
	      in: hq
	      out: [gen, codebook_commitment_loss]

	# Logging and checkpoint cadence.
	logger:
	  print_freq: 5
	  save_checkpoint_freq: 25
	  visuals: [gen, mel]
	  visual_debug_rate: 5
	  is_mel_spectrogram: true