train_config:
  expdir: /dummy/
  run_name: /dummy/
  delete_previous_checkpoint: true 
  batch_size: 8
  gradient_accumulation_steps: 2
  seed: 42
  learning_rate: 0.00002
  lr_scheduler: constant
  loss_multiplier: 1.0
  warmup_steps: 1875
  weight_decay: 0.1
  precision: amp_bf16  # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
  gradient_checkpointing: false
  num_epochs: 200  # num_epochs * dataset_blending_global_weight = 1
  offline: false
  freeze_lm_embeddings: false
  logging_steps: 10
  dist_backend: nccl
  dist_url: env://  # e.g. tcp://localhost:7000 for manual TCP initialization
  no_set_device_rank: false 
  fsdp: true 
  fsdp_use_orig_params: false  # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
  fsdp_sharding_strategy: full  # full, hybrid
  horovod: false
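  # Effective per-GPU batch: batch_size * gradient_accumulation_steps = 8 * 2 = 16
  # samples per optimizer step; the global batch additionally scales with the
  # number of FSDP ranks set by the launcher (not configured in this file).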

data_config:
  dataset_blending_global_weight: 0.005
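  # Assuming the global weight scales the fraction of the blend sampled per
  # epoch, 200 epochs * 0.005 = 1, i.e. roughly one full pass over the blended
  # data, consistent with the num_epochs comment in train_config.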

  dataset_blending_config:

    dummy/dummy:
      weight: 1.5

  dataset_file_root: dummy
  data_root: dummy
  dataset_blending_output: dummy
  max_tokens: 512
  num_workers: 4

  valid_dataset_config: 
  
    dummy/test: true

clap_config:  
  method: nvclap-large
  audio_embed_dim: 2048
  checkpoint: clap_ckpt/epoch_15.pt

  window_length: 10.0  # seconds
  window_overlap: 0.0  # seconds
  max_num_window: 9  # 1.5 minutes
  max_num_fewshot: 1  # number of fewshot samples (including the final one)
  finetune: true
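  # Coverage: 9 windows * 10.0 s with 0.0 s overlap = 90 s = 1.5 minutes,
  # matching the max_num_window comment; longer audio is presumably truncated.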

whisper_config:  
  method: whisper-large-v3
  path: openai/whisper-large-v3
  audio_embed_dim: 1280
  sampling_rate: 16000

  window_length: 30.0  # seconds
  window_overlap: 0.0  # seconds
  max_num_window: 1  # 30 seconds
  max_num_fewshot: 1  # number of fewshot samples (including the final one)
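  # Coverage: 1 window * 30.0 s = 30 s, which matches Whisper's fixed
  # 30-second input length at a 16 kHz sampling rate.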

mert_config:  
  method: mert-v1
  path: m-a-p/MERT-v1-330M
  audio_embed_dim: 1024
  sampling_rate: 24000

  window_length: 10.0  # seconds
  window_overlap: 0.0  # seconds
  max_num_window: 1  # 10 seconds
  max_num_fewshot: 1  # number of fewshot samples (including the final one)
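  # Coverage: 1 window * 10.0 s = 10 s at MERT's 24 kHz sampling rate.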

model_config:
  cache_dir: .cache

  lang_encoder_path: Qwen/Qwen2.5-0.5B
  tokenizer_path: Qwen/Qwen2.5-0.5B
  cross_attn_every_n_layers: 1
  audio_transformer_kwargs: {
    n_head: 8,
    n_layers: 3,
    d_inner: 2048,
    max_num_media: 128,  # must be >= max_num_window * num_fewshot_samples (4)
    max_window_per_audio: 1,  # must = max_num_window
    common_encoder_embed_dim: 1024
  }
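  # Sanity check from the comments above: max_num_media (128) >=
  # max_num_window (9) * max_num_fewshot (1) = 9 holds. common_encoder_embed_dim
  # (1024) is presumably the shared width to which each encoder's features are
  # projected before the 3-layer, 8-head audio transformer.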