# Training run configuration: run/optimizer settings (train_config), dataset
# blending (data_config), audio encoder settings (clap_config, whisper_config,
# mert_config) and language-model settings (model_config).
#
# NOTE(review): the nesting below was rebuilt from a copy of this file whose
# line breaks had been lost; confirm key placement against the schema the
# loader expects.

train_config:
  expdir: /dummy/
  run_name: /dummy/
  delete_previous_checkpoint: true
  batch_size: 8
  gradient_accumulation_steps: 2
  seed: 42
  # Keep the plain decimal form: exponent notation without a dot (e.g. 2e-5)
  # is read as a string by some YAML 1.1 loaders.
  learning_rate: 0.00002
  lr_scheduler: constant
  loss_multiplier: 1.0
  warmup_steps: 1875
  weight_decay: 0.1
  # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
  precision: amp_bf16
  gradient_checkpointing: false
  # num_epochs * dataset_blending_global_weight = 1 (200 * 0.005 here)
  num_epochs: 200
  offline: false
  freeze_lm_embeddings: false
  logging_steps: 10
  dist_backend: nccl
  dist_url: env://  # tcp://localhost:7000
  no_set_device_rank: false
  fsdp: true
  # Passed into the FSDP constructor. Enables param_groups and gradient
  # masking for weight_decay. Does not work with OPT.
  fsdp_use_orig_params: false
  fsdp_sharding_strategy: full  # full, hybrid
  horovod: false

data_config:
  dataset_blending_global_weight: 0.005
  # One entry per dataset split, keyed as "<dataset>/<split>".
  dataset_blending_config:
    dummy/dummy:
      weight: 1.5
  dataset_file_root: dummy
  data_root: dummy
  dataset_blending_output: dummy
  max_tokens: 512
  num_workers: 4
  valid_dataset_config:
    dummy/test: true

clap_config:
  method: nvclap-large
  audio_embed_dim: 2048
  checkpoint: clap_ckpt/epoch_15.pt
  window_length: 10.0  # seconds
  window_overlap: 0.0  # seconds
  max_num_window: 9  # 1.5 minutes
  max_num_fewshot: 1  # number of fewshot samples (including the final one)
  finetune: true

whisper_config:
  method: whisper-large-v3
  path: openai/whisper-large-v3
  audio_embed_dim: 1280
  sampling_rate: 16000
  window_length: 30.0  # seconds
  window_overlap: 0.0  # seconds
  max_num_window: 1  # 30 seconds (1 window x 30.0 s, no overlap)
  max_num_fewshot: 1  # number of fewshot samples (including the final one)

mert_config:
  method: mert-v1
  path: m-a-p/MERT-v1-330M
  audio_embed_dim: 1024
  sampling_rate: 24000
  window_length: 10.0  # seconds
  window_overlap: 0.0  # seconds
  max_num_window: 1  # 10 seconds (1 window x 10.0 s, no overlap)
  max_num_fewshot: 1  # number of fewshot samples (including the final one)

model_config:
  cache_dir: .cache
  lang_encoder_path: Qwen/Qwen2.5-0.5B
  tokenizer_path: Qwen/Qwen2.5-0.5B
  cross_attn_every_n_layers: 1
  audio_transformer_kwargs:
    n_head: 8
    n_layers: 3
    d_inner: 2048
    # must be >= max_num_window * num_fewshot_samples (4)
    max_num_media: 128
    # must = max_num_window
    # NOTE(review): clap_config sets max_num_window to 9 while whisper_config
    # and mert_config set it to 1 - confirm which encoder this must match.
    max_window_per_audio: 1
    common_encoder_embed_dim: 1024