root committed on
Commit
f11ac57
·
1 Parent(s): ca2a2a9

initial commit

Browse files
Files changed (2) hide show
  1. app.py +1 -1
  2. configs/inference.yaml +9 -202
app.py CHANGED
@@ -28,7 +28,7 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false"
28
 
29
  api_key = os.getenv("my_secret")
30
 
31
- snapshot_download(repo_id="SreyanG-NVIDIA/audio-flamingo-2", local_dir="./", token=api_key)
32
 
33
  config = yaml.load(open("configs/inference.yaml"), Loader=yaml.FullLoader)
34
 
 
28
 
29
  api_key = os.getenv("my_secret")
30
 
31
+ snapshot_download(repo_id="nvidia/audio-flamingo-2-1.5B", local_dir="./", token=api_key)
32
 
33
  config = yaml.load(open("configs/inference.yaml"), Loader=yaml.FullLoader)
34
 
configs/inference.yaml CHANGED
@@ -1,6 +1,6 @@
1
  train_config:
2
- expdir: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed-sft
3
- run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed-sft-3
4
  delete_previous_checkpoint: true
5
  batch_size: 8
6
  gradient_accumulation_steps: 2
@@ -24,216 +24,23 @@ train_config:
24
  fsdp_sharding_strategy: full # full, hybrid
25
  horovod: false
26
 
27
- # instruction tuning hparams
28
- # sft_config:
29
- # pretrained_path: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed_ckpt_stage1/
30
- # pretrained_ckpt: checkpoint_199.pt
31
- # unfreeze_full_lm: false
32
-
33
  data_config:
34
  dataset_blending_global_weight: 0.005
35
 
36
  dataset_blending_config:
37
 
38
- MMAUQA/train:
39
  weight: 1.5
40
 
41
- AudioSet-Temporal-Speech-Audio-QA/train:
42
- weight: 1.0
43
-
44
- CompA-R-AQA/train:
45
- weight: 1.0
46
-
47
- # Audio QA
48
- Clotho-AQA-AQA/train:
49
- weight: 1.0
50
-
51
- OpenAQA-AQA/train:
52
- weight: 1.0
53
-
54
- SalmonnQA/train:
55
- weight: 1.0
56
-
57
- AudioEntailmentQA/train:
58
- weight: 1.0
59
-
60
- # Audio Captioning
61
-
62
- Clotho-v2-AudioCaptioning/train:
63
- weight: 1.0
64
-
65
- audiocaps-AudioCaptioning/train:
66
- weight: 1.0
67
-
68
- Epidemic_sound-AudioCaptioning/train:
69
- weight: 1.0
70
-
71
- MACS-AudioCaptioning/train:
72
- weight: 1.0
73
-
74
- # Audio Classification
75
-
76
- FSD50k-EventClassification/train:
77
- weight: 1.0
78
-
79
- CochlScene-SceneClassification/train:
80
- weight: 1.0
81
-
82
- NonSpeech7k-EventClassification/train:
83
- weight: 1.0
84
-
85
- chime-home-EventClassification/train:
86
- weight: 1.0
87
-
88
- SONYC-UST-EventClassification/train:
89
- weight: 1.0
90
-
91
- # Speech Emotion Classification
92
-
93
- MELD-EmotionClassification/train:
94
- weight: 0.5
95
-
96
- MELD-SentimentClassification/train:
97
- weight: 0.5
98
-
99
- emov-db-EmotionClassification/train:
100
- weight: 1.0
101
-
102
- jl-corpus-EmotionClassification/train:
103
- weight: 6.0
104
-
105
- tess-EmotionClassification/train:
106
- weight: 2.5
107
-
108
- IEMOCAP-EmotionClassification/train:
109
- weight: 3.0
110
-
111
- OMGEmotion-EmotionClassification/train:
112
- weight: 3.0
113
-
114
- VocalSound-VocalClassification/train:
115
- weight: 1.5
116
-
117
- # Music QA
118
-
119
- Music-AVQA-AQA_All/train:
120
- weight: 3.0
121
-
122
- MU-LLAMA-AQA/train:
123
- weight: 1.0
124
-
125
- # Music Captioning
126
-
127
- LP-MusicCaps-MSD-AudioCaptioning/train:
128
- weight: 0.06
129
-
130
- LP-MusicCaps-MC-AudioCaptioning/train:
131
- weight: 2.0
132
-
133
- LP-MusicCaps-MTT-AudioCaptioning/train:
134
- weight: 1.0
135
-
136
- MusicCaps-AudioCaptioning/train:
137
- weight: 6.0
138
-
139
- musdbhq-captioning/train:
140
- weight: 2.0
141
-
142
- # Music Understanding
143
-
144
- NSynth-MIR/train:
145
- weight: 0.2
146
-
147
- mtg-jamendo-MusicTagging/train:
148
- weight: 0.1
149
-
150
- FMA-GenreClassification/train:
151
- weight: 0.5
152
-
153
- musdbhq-InstrClassification/train:
154
- weight: 0.8
155
-
156
- LLARK_FMA-mir/train:
157
- weight: 1.0
158
-
159
- LLARK_FMA-reasoning/train:
160
- weight: 1.0
161
-
162
- LLARK_MagnaTagATune-mir/train:
163
- weight: 1.0
164
-
165
- LLARK_MTG-Jamendo-reasoning/train:
166
- weight: 1.0
167
-
168
- LLARK_MagnaTagATune-reasoning/train:
169
- weight: 1.0
170
-
171
- LLARK_MTG-Jamendo-mir/train:
172
- weight: 1.0
173
-
174
- MusicBenchQA/train:
175
- weight: 1.0
176
-
177
- dataset_file_root: /lustre/fsw/portfolios/adlr/users/sreyang/final_qa/foundational_data
178
- data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
179
- dataset_blending_output: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/dataset_blending.json
180
  max_tokens: 512
181
  num_workers: 4
182
 
183
  valid_dataset_config:
184
 
185
- Clotho-AQA-AQA/test: true
186
-
187
- Clotho-v2-AudioCaptioning/test: true
188
- audiocaps-AudioCaptioning/test: true
189
-
190
- FSD50k-EventClassification/test: true
191
- CochlScene-SceneClassification/test: true
192
- NonSpeech7k-EventClassification/test: true
193
- SONYC-UST-EventClassification/test: true
194
-
195
- MELD-EmotionClassification/test: true
196
- MELD-SentimentClassification/test: true
197
- emov-db-EmotionClassification/val: true
198
- jl-corpus-EmotionClassification/val: true
199
- tess-EmotionClassification/val: true
200
- IEMOCAP-EmotionClassification/val: true
201
- OMGEmotion-EmotionClassification/val: true
202
- VocalSound-VocalClassification/test: true
203
-
204
- Music-AVQA-AQA_All/test: true
205
- MU-LLAMA-AQA/test: true
206
-
207
- LP-MusicCaps-MSD-AudioCaptioning/test: true
208
- LP-MusicCaps-MC-AudioCaptioning/test: true
209
- LP-MusicCaps-MTT-AudioCaptioning/test: true
210
- MusicCaps-AudioCaptioning/test: true
211
-
212
- NSynth-MIR/test: true
213
- mtg-jamendo-MusicTagging/val: true
214
- musdbhq-InstrClassification/test: true
215
-
216
- # # zero shot
217
- # CREMA-D-EmotionClassification/train:
218
- # prefix_prob: 1.0
219
-
220
- # ravdess-EmotionClassification/train:
221
- # prefix_prob: 1.0
222
-
223
- # UrbanSound8K-EventClassification/train:
224
- # prefix_prob: 1.0
225
-
226
- # ESC50-EventClassification/train:
227
- # prefix_prob: 1.0
228
-
229
- # DCASE17Task4-SceneClassification/test:
230
- # prefix_prob: 1.0
231
-
232
- # GTZAN-GenreClassification/train:
233
- # prefix_prob: 1.0
234
-
235
- # Medley-solos-DB-InstrClassification/test:
236
- # prefix_prob: 1.0
237
 
238
  clap_config:
239
  method: nvclap-large
@@ -271,8 +78,8 @@ mert_config:
271
  model_config:
272
  cache_dir: .cache
273
 
274
- lang_encoder_path: Qwen/Qwen2.5-3B
275
- tokenizer_path: Qwen/Qwen2.5-3B
276
  cross_attn_every_n_layers: 1
277
  audio_transformer_kwargs: {
278
  n_head: 8,
 
1
  train_config:
2
+ expdir: /dummy/
3
+ run_name: /dummy/
4
  delete_previous_checkpoint: true
5
  batch_size: 8
6
  gradient_accumulation_steps: 2
 
24
  fsdp_sharding_strategy: full # full, hybrid
25
  horovod: false
26
 
 
 
 
 
 
 
27
  data_config:
28
  dataset_blending_global_weight: 0.005
29
 
30
  dataset_blending_config:
31
 
32
+ dummy/dummy:
33
  weight: 1.5
34
 
35
+ dataset_file_root: dummy
36
+ data_root: dummy
37
+ dataset_blending_output: dummy
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  max_tokens: 512
39
  num_workers: 4
40
 
41
  valid_dataset_config:
42
 
43
+ dummy/test: true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
  clap_config:
46
  method: nvclap-large
 
78
  model_config:
79
  cache_dir: .cache
80
 
81
+ lang_encoder_path: Qwen/Qwen2.5-1.5B
82
+ tokenizer_path: Qwen/Qwen2.5-1.5B
83
  cross_attn_every_n_layers: 1
84
  audio_transformer_kwargs: {
85
  n_head: 8,