# audio-flamingo-2-0.5B

# Sleeping

# audio-flamingo-2-0.5B / configs /inference.yaml

# root

# initial commit

# 5f171b8 24 days ago

# 2.39 kB

	train_config:
	expdir: /dummy/
	run_name: /dummy/
	delete_previous_checkpoint: true
	batch_size: 8
	gradient_accumulation_steps: 2
	seed: 42
	learning_rate: 0.00002
	lr_scheduler: constant
	loss_multiplier: 1.0
	warmup_steps: 1875
	weight_decay: 0.1
	precision: amp_bf16 # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
	gradient_checkpointing: False
	num_epochs: 200 # num_epochs * dataset_blending_global_weight = 1
	offline: false
	freeze_lm_embeddings: false
	logging_steps: 10
	dist_backend: nccl
	dist_url: env:// # tcp://localhost:7000
	no_set_device_rank: false
	fsdp: true
	fsdp_use_orig_params: false # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
	fsdp_sharding_strategy: full # full, hybrid
	horovod: false

	data_config:
	dataset_blending_global_weight: 0.005

	dataset_blending_config:

	dummy/dummy:
	weight: 1.5

	dataset_file_root: dummy
	data_root: dummy
	dataset_blending_output: dummy
	max_tokens: 512
	num_workers: 4

	valid_dataset_config:

	dummy/test: true

	clap_config:
	method: nvclap-large
	audio_embed_dim: 2048
	checkpoint: clap_ckpt/epoch_15.pt

	window_length: 10.0 # seconds
	window_overlap: 0.0 # seconds
	max_num_window: 9 # 1.5 minutes
	max_num_fewshot: 1 # number of fewshot samples (including the final one)
	finetune: true

	whisper_config:
	method: whisper-large-v3
	path: openai/whisper-large-v3
	audio_embed_dim: 1280
	sampling_rate: 16000

	window_length: 30.0 # seconds
	window_overlap: 0.0 # seconds
	max_num_window: 1 # 5 minutes
	max_num_fewshot: 1 # number of fewshot samples (including the final one)

	mert_config:
	method: mert-v1
	path: m-a-p/MERT-v1-330M
	audio_embed_dim: 1024
	sampling_rate: 24000

	window_length: 10.0 # seconds
	window_overlap: 0.0 # seconds
	max_num_window: 1 # 5 minutes
	max_num_fewshot: 1 # number of fewshot samples (including the final one)

	model_config:
	cache_dir: .cache

	lang_encoder_path: Qwen/Qwen2.5-0.5B
	tokenizer_path: Qwen/Qwen2.5-0.5B
	cross_attn_every_n_layers: 1
	audio_transformer_kwargs: {
	n_head: 8,
	n_layers: 3,
	d_inner: 2048,
	max_num_media: 128, # must be >= max_num_window * num_fewshot_samples (4)
	max_window_per_audio: 1, # must = max_num_window
	common_encoder_embed_dim: 1024
	}