Commit f11ac57 · initial commit
root committed · 1 parent: ca2a2a9

Files changed:
- app.py +1 -1
- configs/inference.yaml +9 -202
app.py CHANGED

@@ -28,7 +28,7 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 api_key = os.getenv("my_secret")
 
-snapshot_download(repo_id="
+snapshot_download(repo_id="nvidia/audio-flamingo-2-1.5B", local_dir="./", token=api_key)
 
 config = yaml.load(open("configs/inference.yaml"), Loader=yaml.FullLoader)
 
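(The removed line is truncated to `snapshot_download(repo_id="` in this rendering, so the previous repo id is not visible.) The one-line change completes a common Space-startup pattern: a Hugging Face access token is stored as a Space secret (named `my_secret` here), read from the environment, and passed to `huggingface_hub.snapshot_download` so the checkpoint lands in the working directory before the config is loaded. A minimal self-contained sketch of that sequence, assuming only what this diff shows:

```python
import os

import yaml
from huggingface_hub import snapshot_download

# Silence fork-related tokenizer warnings (matches the hunk header above).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# The Space secret "my_secret" holds a read token for the model repo;
# os.getenv returns None if the secret is missing, and the download then
# proceeds unauthenticated (which fails for gated/private repos).
api_key = os.getenv("my_secret")

# Fetch the full model snapshot into the working directory once at startup,
# so later code can load weights and configs from local paths.
snapshot_download(repo_id="nvidia/audio-flamingo-2-1.5B", local_dir="./", token=api_key)

# Inference settings are read from the YAML file shipped with the app.
with open("configs/inference.yaml") as f:
    config = yaml.load(f, Loader=yaml.FullLoader)
```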
configs/inference.yaml CHANGED

@@ -1,6 +1,6 @@
 train_config:
-  expdir: /
-  run_name:
+  expdir: /dummy/
+  run_name: /dummy/
   delete_previous_checkpoint: true
   batch_size: 8
   gradient_accumulation_steps: 2
@@ -24,216 +24,23 @@ train_config:
   fsdp_sharding_strategy: full # full, hybrid
   horovod: false
 
-  # instruction tuning hparams
-  # sft_config:
-  #   pretrained_path: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed_ckpt_stage1/
-  #   pretrained_ckpt: checkpoint_199.pt
-  #   unfreeze_full_lm: false
-
 data_config:
   dataset_blending_global_weight: 0.005
 
   dataset_blending_config:
 
-
+    dummy/dummy:
       weight: 1.5
 
-
-
-
-    CompA-R-AQA/train:
-      weight: 1.0
-
-    # Audio QA
-    Clotho-AQA-AQA/train:
-      weight: 1.0
-
-    OpenAQA-AQA/train:
-      weight: 1.0
-
-    SalmonnQA/train:
-      weight: 1.0
-
-    AudioEntailmentQA/train:
-      weight: 1.0
-
-    # Audio Captioning
-
-    Clotho-v2-AudioCaptioning/train:
-      weight: 1.0
-
-    audiocaps-AudioCaptioning/train:
-      weight: 1.0
-
-    Epidemic_sound-AudioCaptioning/train:
-      weight: 1.0
-
-    MACS-AudioCaptioning/train:
-      weight: 1.0
-
-    # Audio Classification
-
-    FSD50k-EventClassification/train:
-      weight: 1.0
-
-    CochlScene-SceneClassification/train:
-      weight: 1.0
-
-    NonSpeech7k-EventClassification/train:
-      weight: 1.0
-
-    chime-home-EventClassification/train:
-      weight: 1.0
-
-    SONYC-UST-EventClassification/train:
-      weight: 1.0
-
-    # Speech Emotion Classification
-
-    MELD-EmotionClassification/train:
-      weight: 0.5
-
-    MELD-SentimentClassification/train:
-      weight: 0.5
-
-    emov-db-EmotionClassification/train:
-      weight: 1.0
-
-    jl-corpus-EmotionClassification/train:
-      weight: 6.0
-
-    tess-EmotionClassification/train:
-      weight: 2.5
-
-    IEMOCAP-EmotionClassification/train:
-      weight: 3.0
-
-    OMGEmotion-EmotionClassification/train:
-      weight: 3.0
-
-    VocalSound-VocalClassification/train:
-      weight: 1.5
-
-    # Music QA
-
-    Music-AVQA-AQA_All/train:
-      weight: 3.0
-
-    MU-LLAMA-AQA/train:
-      weight: 1.0
-
-    # Music Captioning
-
-    LP-MusicCaps-MSD-AudioCaptioning/train:
-      weight: 0.06
-
-    LP-MusicCaps-MC-AudioCaptioning/train:
-      weight: 2.0
-
-    LP-MusicCaps-MTT-AudioCaptioning/train:
-      weight: 1.0
-
-    MusicCaps-AudioCaptioning/train:
-      weight: 6.0
-
-    musdbhq-captioning/train:
-      weight: 2.0
-
-    # Music Understanding
-
-    NSynth-MIR/train:
-      weight: 0.2
-
-    mtg-jamendo-MusicTagging/train:
-      weight: 0.1
-
-    FMA-GenreClassification/train:
-      weight: 0.5
-
-    musdbhq-InstrClassification/train:
-      weight: 0.8
-
-    LLARK_FMA-mir/train:
-      weight: 1.0
-
-    LLARK_FMA-reasoning/train:
-      weight: 1.0
-
-    LLARK_MagnaTagATune-mir/train:
-      weight: 1.0
-
-    LLARK_MTG-Jamendo-reasoning/train:
-      weight: 1.0
-
-    LLARK_MagnaTagATune-reasoning/train:
-      weight: 1.0
-
-    LLARK_MTG-Jamendo-mir/train:
-      weight: 1.0
-
-    MusicBenchQA/train:
-      weight: 1.0
-
-  dataset_file_root: /lustre/fsw/portfolios/adlr/users/sreyang/final_qa/foundational_data
-  data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
-  dataset_blending_output: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/dataset_blending.json
+  dataset_file_root: dummy
+  data_root: dummy
+  dataset_blending_output: dummy
   max_tokens: 512
   num_workers: 4
 
 valid_dataset_config:
 
-
-
-  Clotho-v2-AudioCaptioning/test: true
-  audiocaps-AudioCaptioning/test: true
-
-  FSD50k-EventClassification/test: true
-  CochlScene-SceneClassification/test: true
-  NonSpeech7k-EventClassification/test: true
-  SONYC-UST-EventClassification/test: true
-
-  MELD-EmotionClassification/test: true
-  MELD-SentimentClassification/test: true
-  emov-db-EmotionClassification/val: true
-  jl-corpus-EmotionClassification/val: true
-  tess-EmotionClassification/val: true
-  IEMOCAP-EmotionClassification/val: true
-  OMGEmotion-EmotionClassification/val: true
-  VocalSound-VocalClassification/test: true
-
-  Music-AVQA-AQA_All/test: true
-  MU-LLAMA-AQA/test: true
-
-  LP-MusicCaps-MSD-AudioCaptioning/test: true
-  LP-MusicCaps-MC-AudioCaptioning/test: true
-  LP-MusicCaps-MTT-AudioCaptioning/test: true
-  MusicCaps-AudioCaptioning/test: true
-
-  NSynth-MIR/test: true
-  mtg-jamendo-MusicTagging/val: true
-  musdbhq-InstrClassification/test: true
-
-  # # zero shot
-  # CREMA-D-EmotionClassification/train:
-  #   prefix_prob: 1.0
-
-  # ravdess-EmotionClassification/train:
-  #   prefix_prob: 1.0
-
-  # UrbanSound8K-EventClassification/train:
-  #   prefix_prob: 1.0
-
-  # ESC50-EventClassification/train:
-  #   prefix_prob: 1.0
-
-  # DCASE17Task4-SceneClassification/test:
-  #   prefix_prob: 1.0
-
-  # GTZAN-GenreClassification/train:
-  #   prefix_prob: 1.0
-
-  # Medley-solos-DB-InstrClassification/test:
-  #   prefix_prob: 1.0
+  dummy/test: true
 
 clap_config:
   method: nvclap-large
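For orientation, the block deleted above was the training-time dataset mixture: each `<dataset>/<split>` entry carries a sampling `weight`, and `dataset_blending_global_weight` uniformly scales the whole blend; the commit stubs all of it out with `dummy` values since the Space only runs inference. A minimal sketch of how such a weighted blend can be realized (the config keys are real; the sampling helper is an illustrative assumption, not this repo's actual data loader):

```python
import random

import yaml


def load_blend(config_path: str) -> list[tuple[str, float]]:
    """Read (dataset_name, effective_weight) pairs from data_config.

    Per-dataset `weight` sets the relative sampling ratio between datasets;
    `dataset_blending_global_weight` (0.005 in the original file) scales the
    whole mixture, e.g. to control how much data one pass draws overall.
    """
    data_cfg = yaml.safe_load(open(config_path))["data_config"]
    global_w = data_cfg["dataset_blending_global_weight"]
    blend_cfg = data_cfg["dataset_blending_config"]
    return [(name, spec["weight"] * global_w) for name, spec in blend_cfg.items()]


def pick_source(blend: list[tuple[str, float]], rng: random.Random) -> str:
    """Choose which dataset supplies the next example, proportional to weight."""
    names, weights = zip(*blend)
    return rng.choices(names, weights=weights, k=1)[0]


if __name__ == "__main__":
    blend = load_blend("configs/inference.yaml")
    rng = random.Random(0)
    print([pick_source(blend, rng) for _ in range(5)])
```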
@@ -271,8 +78,8 @@ mert_config:
 model_config:
   cache_dir: .cache
 
-  lang_encoder_path: Qwen/Qwen2.5-
-  tokenizer_path: Qwen/Qwen2.5-
+  lang_encoder_path: Qwen/Qwen2.5-1.5B
+  tokenizer_path: Qwen/Qwen2.5-1.5B
   cross_attn_every_n_layers: 1
   audio_transformer_kwargs: {
     n_head: 8,
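This final hunk points the language-model backbone and tokenizer at the 1.5B Qwen checkpoint that matches the downloaded `audio-flamingo-2-1.5B` weights (the removed lines are truncated to `Qwen/Qwen2.5-` in this rendering, so the previous size tag is not visible). `cross_attn_every_n_layers: 1` is the Flamingo-style conditioning knob: a cross-attention block is interleaved after every decoder layer. A hedged sketch of how the text side of such a config is typically instantiated with the standard transformers API (not this Space's actual model-builder code):

```python
import yaml
from transformers import AutoModelForCausalLM, AutoTokenizer

config = yaml.load(open("configs/inference.yaml"), Loader=yaml.FullLoader)
mc = config["model_config"]

# Both paths resolve to the hub id "Qwen/Qwen2.5-1.5B"; cache_dir (".cache")
# keeps downloaded files inside the Space's working directory.
tokenizer = AutoTokenizer.from_pretrained(mc["tokenizer_path"], cache_dir=mc["cache_dir"])
lm = AutoModelForCausalLM.from_pretrained(mc["lang_encoder_path"], cache_dir=mc["cache_dir"])

# The audio branch would then insert cross-attention after every
# `cross_attn_every_n_layers`-th decoder layer (1 here); that wiring lives
# in the model's own code, not in transformers itself.
print(mc["cross_attn_every_n_layers"], lm.config.num_hidden_layers)
```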