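# NOTE(review): the lines below appear to be file-viewer page residue (status
# banner, file size, commit hashes, gutter line numbers), not configuration.
# They are kept as comments so they no longer break YAML parsing.
# Spaces: Running on T4 / Running on T4
# File size: 2,386 Bytes
# a344f64 f11ac57 a344f64 f11ac57 a344f64 f11ac57 a344f64 f11ac57 a344f64 bca6b51 a344f64 5f171b8 a344f64
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91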
# Training-run settings.
# NOTE(review): nesting reconstructed -- every line had lost its indentation,
# which turned these keys into top-level siblings of train_config. Confirm
# against the loader's expected schema.
train_config:
  expdir: /dummy/
  run_name: /dummy/
  delete_previous_checkpoint: true
  batch_size: 8
  gradient_accumulation_steps: 2
  seed: 42
  learning_rate: 0.00002
  lr_scheduler: constant
  loss_multiplier: 1.0
  warmup_steps: 1875
  weight_decay: 0.1
  # One of: "amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32".
  precision: amp_bf16
  gradient_checkpointing: false
  # num_epochs * dataset_blending_global_weight = 1
  num_epochs: 200
  offline: false
  freeze_lm_embeddings: false
  logging_steps: 10
  dist_backend: nccl
  # Alternative: tcp://localhost:7000
  dist_url: env://
  no_set_device_rank: false
  fsdp: true
  # Passed into the FSDP constructor. Enables param_groups and gradient
  # masking for weight_decay. Does not work with OPT.
  fsdp_use_orig_params: false
  # One of: full, hybrid.
  fsdp_sharding_strategy: full
  horovod: false
# Dataset blending and loading settings.
# NOTE(review): nesting reconstructed from stripped indentation. `weight` is
# placed under the `dummy/dummy` dataset entry and the remaining keys directly
# under data_config -- confirm against the loader's expected schema.
data_config:
  dataset_blending_global_weight: 0.005
  # Mapping of dataset name -> per-dataset options.
  dataset_blending_config:
    dummy/dummy:
      weight: 1.5
  dataset_file_root: dummy
  data_root: dummy
  dataset_blending_output: dummy
  max_tokens: 512
  num_workers: 4
  # Mapping of validation split name -> flag.
  valid_dataset_config:
    dummy/test: true
# CLAP audio-encoder settings.
# NOTE(review): nesting reconstructed from stripped indentation; left flat,
# keys such as `method` and `window_length` collide with the same keys in the
# whisper and mert sections. Confirm against the loader's expected schema.
clap_config:
  method: nvclap-large
  audio_embed_dim: 2048
  checkpoint: clap_ckpt/epoch_15.pt
  window_length: 10.0  # seconds
  window_overlap: 0.0  # seconds
  max_num_window: 9  # 1.5 minutes (9 windows x 10.0 s)
  max_num_fewshot: 1  # number of fewshot samples (including the final one)
  finetune: true
# Whisper audio-encoder settings.
# NOTE(review): nesting reconstructed from stripped indentation; left flat,
# keys such as `method` and `window_length` collide with the same keys in the
# clap and mert sections. Confirm against the loader's expected schema.
whisper_config:
  method: whisper-large-v3
  path: openai/whisper-large-v3
  audio_embed_dim: 1280
  sampling_rate: 16000
  window_length: 30.0  # seconds
  window_overlap: 0.0  # seconds
  # The earlier comment said "5 minutes", but 1 window x 30.0 s is 30 seconds.
  max_num_window: 1
  max_num_fewshot: 1  # number of fewshot samples (including the final one)
# MERT audio-encoder settings.
# NOTE(review): nesting reconstructed from stripped indentation; left flat,
# keys such as `method` and `window_length` collide with the same keys in the
# clap and whisper sections. Confirm against the loader's expected schema.
mert_config:
  method: mert-v1
  path: m-a-p/MERT-v1-330M
  audio_embed_dim: 1024
  sampling_rate: 24000
  window_length: 10.0  # seconds
  window_overlap: 0.0  # seconds
  # The earlier comment said "5 minutes", but 1 window x 10.0 s is 10 seconds.
  max_num_window: 1
  max_num_fewshot: 1  # number of fewshot samples (including the final one)
# Language-model and audio-transformer settings.
# NOTE(review): nesting reconstructed from stripped indentation; the flow
# mapping is written in block style (same data) and a stray trailing `|`
# after the closing brace was dropped. Confirm against the loader's schema.
model_config:
  cache_dir: .cache
  lang_encoder_path: Qwen/Qwen2.5-0.5B
  tokenizer_path: Qwen/Qwen2.5-0.5B
  cross_attn_every_n_layers: 1
  audio_transformer_kwargs:
    n_head: 8
    n_layers: 3
    d_inner: 2048
    # must be >= max_num_window * num_fewshot_samples (4)
    max_num_media: 128
    # must = max_num_window
    # NOTE(review): clap_config sets max_num_window: 9 while whisper_config
    # and mert_config set 1 -- confirm which encoder this value must match.
    max_window_per_audio: 1
    common_encoder_embed_dim: 1024