train_config:
  expdir: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-1.5b
  run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-1.5B-sft
  delete_previous_checkpoint: true
  batch_size: 32
  gradient_accumulation_steps: 2
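  # note: each optimizer step consumes batch_size * gradient_accumulation_steps
  # = 32 * 2 = 64 sequences per GPU (times the world size for the global batch,
  # assuming standard data-parallel gradient accumulation)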
  seed: 42
  learning_rate: 0.00002
  lr_scheduler: constant
  loss_multiplier: 1.0
  warmup_steps: 1875
  weight_decay: 0.1
  precision: amp_bf16  # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
  gradient_checkpointing: false
  num_epochs: 200  # num_epochs * dataset_blending_global_weight = 1
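  # i.e. 200 * 0.005 = 1, so the 200 short "epochs" presumably add up to
  # roughly one weighted pass over the blended training data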
  offline: false
  freeze_lm_embeddings: false
  logging_steps: 10
  dist_backend: nccl
  dist_url: env://  # tcp://localhost:7000
  no_set_device_rank: false
  fsdp: true
  fsdp_use_orig_params: false  # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
  fsdp_sharding_strategy: full  # full, hybrid
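  # "full" / "hybrid" presumably map to PyTorch FSDP's FULL_SHARD (params,
  # grads, and optimizer state sharded across all ranks) and HYBRID_SHARD
  # (shard within a node, replicate across nodes)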
  horovod: false

  # instruction tuning hparams
  # sft_config:
  #   pretrained_path: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-7b-fixed/
  #   pretrained_ckpt: checkpoint_199.pt
  #   unfreeze_full_lm: false
data_config:
  dataset_blending_global_weight: 0.005

  dataset_blending_config:
    MMAUQA/train:
      weight: 1.5
    AudioSet-Temporal-Speech-Audio-QA/train:
      weight: 1.0
    CompA-R-AQA/train:
      weight: 1.0

    # Audio QA
    Clotho-AQA-AQA/train:
      weight: 1.0
    OpenAQA-AQA/train:
      weight: 1.0
    SalmonnQA/train:
      weight: 0.8
    AudioEntailmentQA/train:
      weight: 1.0

    # Audio Captioning
    Clotho-v2-AudioCaptioning/train:
      weight: 1.0
    audiocaps-AudioCaptioning/train:
      weight: 1.0
    Epidemic_sound-AudioCaptioning/train:
      weight: 1.0
    MACS-AudioCaptioning/train:
      weight: 1.0

    # Audio Classification
    UrbanSound8K-EventClassification/train:
      weight: 0.5
    TUT-EventClassification/train:
      weight: 2.0
    FSD50k-EventClassification/train:
      weight: 1.0
    CochlScene-SceneClassification/train:
      weight: 1.0
    NonSpeech7k-EventClassification/train:
      weight: 1.0
    chime-home-EventClassification/train:
      weight: 1.0
    SONYC-UST-EventClassification/train:
      weight: 1.0

    # Speech Emotion Classification
    MELD-EmotionClassification/train:
      weight: 0.5
    MELD-SentimentClassification/train:
      weight: 0.5
    emov-db-EmotionClassification/train:
      weight: 1.0
    jl-corpus-EmotionClassification/train:
      weight: 6.0
    tess-EmotionClassification/train:
      weight: 2.5
    IEMOCAP-EmotionClassification/train:
      weight: 3.0
    OMGEmotion-EmotionClassification/train:
      weight: 3.0
    VocalSound-VocalClassification/train:
      weight: 1.5

    # Music QA
    Music-AVQA-AQA_All/train:
      weight: 3.0
    MU-LLAMA-AQA/train:
      weight: 1.0

    # Music Captioning
    LP-MusicCaps-MSD-AudioCaptioning/train:
      weight: 0.06
    LP-MusicCaps-MC-AudioCaptioning/train:
      weight: 2.0
    LP-MusicCaps-MTT-AudioCaptioning/train:
      weight: 1.0
    MusicCaps-AudioCaptioning/train:
      weight: 6.0
    musdbhq-captioning/train:
      weight: 2.0

    # Music Understanding
    Medley-solos-DB-InstrClassification/train:
      weight: 1.5
    GTZAN-GenreClassification/train:
      weight: 2.0
    NSynth-MIR/train:
      weight: 0.4
    NSynth-Instrument/train:
      weight: 1.5
    NSynth-Source/train:
      weight: 1.5
    mtg-jamendo-MusicTagging/train:
      weight: 1.0
    FMA-GenreClassification/train:
      weight: 1.0
    musdbhq-InstrClassification/train:
      weight: 1.0
    LLARK_FMA-mir/train:
      weight: 1.0
    LLARK_FMA-reasoning/train:
      weight: 1.0
    LLARK_MagnaTagATune-mir/train:
      weight: 1.0
    LLARK_MTG-Jamendo-reasoning/train:
      weight: 1.0
    LLARK_MagnaTagATune-reasoning/train:
      weight: 1.0
    LLARK_MTG-Jamendo-mir/train:
      weight: 1.0
    MusicBenchQA/train:
      weight: 1.0
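  # the weights above appear to act as relative sampling multipliers: e.g.
  # jl-corpus (6.0) and MusicCaps (6.0) oversample small datasets, while
  # LP-MusicCaps-MSD (0.06) heavily downsamples a very large one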
  dataset_file_root: /lustre/fsw/portfolios/adlr/users/sreyang/final_qa/foundational_data
  data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
  dataset_blending_output: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers/dataset_blending.json
  max_tokens: 512
  num_workers: 4
  valid_dataset_config:
    Clotho-AQA-AQA/test: true
    Clotho-v2-AudioCaptioning/test: true
    audiocaps-AudioCaptioning/test: true
    FSD50k-EventClassification/test: true
    CochlScene-SceneClassification/test: true
    NonSpeech7k-EventClassification/test: true
    SONYC-UST-EventClassification/test: true
    MELD-EmotionClassification/test: true
    MELD-SentimentClassification/test: true
    emov-db-EmotionClassification/val: true
    jl-corpus-EmotionClassification/val: true
    tess-EmotionClassification/val: true
    IEMOCAP-EmotionClassification/val: true
    OMGEmotion-EmotionClassification/val: true
    VocalSound-VocalClassification/test: true
    Music-AVQA-AQA_All/test: true
    MU-LLAMA-AQA/test: true
    LP-MusicCaps-MSD-AudioCaptioning/test: true
    LP-MusicCaps-MC-AudioCaptioning/test: true
    LP-MusicCaps-MTT-AudioCaptioning/test: true
    MusicCaps-AudioCaptioning/test: true
    NSynth-MIR/test: true
    mtg-jamendo-MusicTagging/val: true
    musdbhq-InstrClassification/test: true

    # zero shot
    # CREMA-D-EmotionClassification/train:
    #   prefix_prob: 1.0
    # ravdess-EmotionClassification/train:
    #   prefix_prob: 1.0
    # UrbanSound8K-EventClassification/train:
    #   prefix_prob: 1.0
    # ESC50-EventClassification/train:
    #   prefix_prob: 1.0
    # DCASE17Task4-SceneClassification/test:
    #   prefix_prob: 1.0
    # GTZAN-GenreClassification/train:
    #   prefix_prob: 1.0
    # Medley-solos-DB-InstrClassification/test:
    #   prefix_prob: 1.0
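    # the commented entries above are held out for zero-shot evaluation;
    # prefix_prob presumably controls how often a task prefix is prepended
    # to the prompt when they are enabled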
clap_config:
  method: nvclap-large
  audio_embed_dim: 2048
  checkpoint: /lustre/fsw/portfolios/adlr/users/sreyang/datasets/clap_datasets/clap_ckpts_5/15/ck_sim/checkpoints/epoch_15.pt
  window_length: 10.0  # seconds
  window_overlap: 0.0  # seconds
  max_num_window: 9  # 1.5 minutes
  max_num_fewshot: 1  # number of fewshot samples (including the final one)
  finetune: true
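  # with window_length 10.0 s, window_overlap 0.0 s, and max_num_window 9,
  # CLAP sees at most 9 * 10 = 90 s (1.5 min) of audio per clip; assuming
  # standard sliding-window segmentation, a clip of duration T > L yields
  # n = 1 + ceil((T - L) / (L - O)) windows, capped at max_num_window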
whisper_config:
  method: whisper-large-v3
  path: openai/whisper-large-v3
  audio_embed_dim: 1280
  sampling_rate: 16000
  window_length: 30.0  # seconds
  window_overlap: 0.0  # seconds
  max_num_window: 1  # 30 seconds
  max_num_fewshot: 1  # number of fewshot samples (including the final one)
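  # whisper-large-v3 natively consumes 30 s of 16 kHz audio (480,000 samples),
  # so a single window matches the encoder's fixed input size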
mert_config:
  method: mert-v1
  path: m-a-p/MERT-v1-330M
  audio_embed_dim: 1024
  sampling_rate: 24000
  window_length: 10.0  # seconds
  window_overlap: 0.0  # seconds
  max_num_window: 1  # 10 seconds
  max_num_fewshot: 1  # number of fewshot samples (including the final one)
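  # MERT-v1-330M expects 24 kHz input, so one 10 s window is 240,000 samples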
model_config:
  cache_dir: /lustre/fsw/portfolios/adlr/users/sreyang/.cache
  lang_encoder_path: Qwen/Qwen2.5-1.5B
  tokenizer_path: Qwen/Qwen2.5-1.5B
  cross_attn_every_n_layers: 1
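  # 1 = insert a gated cross-attention block after every LM layer
  # (cf. "xattnevery1" in run_name above)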
  audio_transformer_kwargs: {
    n_head: 8,
    n_layers: 3,
    d_inner: 2048,
    max_num_media: 128,  # must be >= max_num_window * max_num_fewshot (9 * 1 = 9 with the CLAP settings above)
    max_window_per_audio: 1,  # must = max_num_window
    common_encoder_embed_dim: 1024
  }
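
A minimal sketch of how a config like this might be loaded and sanity-checked,
assuming it is saved as config.yaml; PyYAML is the only dependency, and the
file name and variable names here are illustrative rather than taken from the
training code:

import math

import yaml

with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

train = cfg["train_config"]
data = cfg["data_config"]
clap = cfg["clap_config"]
atf = cfg["model_config"]["audio_transformer_kwargs"]

# num_epochs * dataset_blending_global_weight should equal 1 (one weighted pass).
assert math.isclose(train["num_epochs"] * data["dataset_blending_global_weight"], 1.0)

# max_num_media must cover every audio window across all few-shot samples.
assert atf["max_num_media"] >= clap["max_num_window"] * clap["max_num_fewshot"]

# Sequences consumed per optimizer step on each GPU.
per_gpu = train["batch_size"] * train["gradient_accumulation_steps"]
print(f"{per_gpu} sequences per optimizer step per GPU")  # 32 * 2 = 64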