FunAudioLLM
/

InspireMusic-Base

Text-to-Audio

Safetensors

English

qwen2

music_generation

Model card Files Files and versions Community

Chong Zhang committed on Dec 12, 2024

Commit

3240095

verified ·

1 Parent(s): f8621c8

Upload inspiremusic.yaml

Browse files

Files changed (1) hide show

inspiremusic.yaml +7 -11

inspiremusic.yaml CHANGED Viewed

@@ -9,19 +9,17 @@ sample_rate: 24000
 text_encoder_input_size: 512
 llm_input_size: 896
 llm_output_size: 896
-spk_embed_dim: 192
 # model params
 # for all class/function included in this repo, we use !<name> or !<new> for initialization, so that user may find all corresponding class/function according to one single yaml.
 # for system/third_party class/function, we do not require this.
-llm: !new:inspiremusic.llm.plm.PLM
     text_encoder_input_size: !ref <text_encoder_input_size>
     llm_input_size: !ref <llm_input_size>
     llm_output_size: !ref <llm_output_size>
     audio_token_size: 4096
     length_normalized_loss: True
     lsm_weight: 0
-    spk_embed_dim: !ref <spk_embed_dim>
     text_encoder_conf:
         name: "none"
     llm: !new:inspiremusic.transformer.qwen_encoder.QwenEmbeddingEncoder
@@ -36,11 +34,10 @@ llm: !new:inspiremusic.llm.plm.PLM
         win_size: 10
         tau_r: 0.1
     train_cfg_ratio: 0.2
-    infer_cfg_ratio: 3.0
-flow: !new:inspiremusic.flow.flow.MaskedDiffWithXvec
     input_size: 256
     output_size: 80
-    spk_embed_dim: !ref <spk_embed_dim>
     output_type: 'mel'
     vocab_size: 4096
     input_frame_rate: 75
@@ -65,8 +62,6 @@ flow: !new:inspiremusic.flow.flow.MaskedDiffWithXvec
         sampling_ratios: [1, 1, 1, 1]
     decoder: !new:inspiremusic.flow.flow_matching.ConditionalCFM
         in_channels: 240
-        n_spks: 1
-        spk_emb_dim: 80
         cfm_params: !new:omegaconf.DictConfig
             content:
                 sigma_min: 1e-06
@@ -85,6 +80,7 @@ flow: !new:inspiremusic.flow.flow.MaskedDiffWithXvec
             num_mid_blocks: 8
             num_heads: 8
             act_fn: 'gelu'
 hift: !new:inspiremusic.hifigan.generator.HiFTGenerator
     in_channels: 80
@@ -147,9 +143,8 @@ sort: !name:inspiremusic.dataset.processor.sort
     sort_size: 500  # sort_size should be less than shuffle_size
 batch: !name:inspiremusic.dataset.processor.batch
     batch_type: 'dynamic'
-    max_frames_in_batch: 30000
 padding: !name:inspiremusic.dataset.processor.padding
-    use_spk_embedding: False # change to True during sft
 # dataset processor pipeline
 data_pipeline: [
@@ -162,6 +157,7 @@ data_pipeline: [
     !ref <padding>,
 ]
 # train conf
 train_conf:
     optim: adam
@@ -169,7 +165,7 @@ train_conf:
         lr: 0.00001 # change to 0.001 if you want to train flow from scratch
     scheduler: warmuplr
     scheduler_conf:
-        warmup_steps: 1000
     max_epoch: 200
     grad_clip: 5
     accum_grad: 2

 text_encoder_input_size: 512
 llm_input_size: 896
 llm_output_size: 896
 # model params
 # for all class/function included in this repo, we use !<name> or !<new> for initialization, so that user may find all corresponding class/function according to one single yaml.
 # for system/third_party class/function, we do not require this.
+llm: !new:inspiremusic.llm.llm.LLM
     text_encoder_input_size: !ref <text_encoder_input_size>
     llm_input_size: !ref <llm_input_size>
     llm_output_size: !ref <llm_output_size>
     audio_token_size: 4096
     length_normalized_loss: True
     lsm_weight: 0
     text_encoder_conf:
         name: "none"
     llm: !new:inspiremusic.transformer.qwen_encoder.QwenEmbeddingEncoder
         win_size: 10
         tau_r: 0.1
     train_cfg_ratio: 0.2
+    infer_cfg_ratio: 7.0
+flow: !new:inspiremusic.flow.flow.MaskedDiff
     input_size: 256
     output_size: 80
     output_type: 'mel'
     vocab_size: 4096
     input_frame_rate: 75
         sampling_ratios: [1, 1, 1, 1]
     decoder: !new:inspiremusic.flow.flow_matching.ConditionalCFM
         in_channels: 240
         cfm_params: !new:omegaconf.DictConfig
             content:
                 sigma_min: 1e-06
             num_mid_blocks: 8
             num_heads: 8
             act_fn: 'gelu'
+    generator_model_dir: ../../pretrained_models/InspireMusic-Base/music_tokenizer
 hift: !new:inspiremusic.hifigan.generator.HiFTGenerator
     in_channels: 80
     sort_size: 500  # sort_size should be less than shuffle_size
 batch: !name:inspiremusic.dataset.processor.batch
     batch_type: 'dynamic'
+    max_frames_in_batch: 10000
 padding: !name:inspiremusic.dataset.processor.padding
 # dataset processor pipeline
 data_pipeline: [
     !ref <padding>,
 ]
 # train conf
 train_conf:
     optim: adam
         lr: 0.00001 # change to 0.001 if you want to train flow from scratch
     scheduler: warmuplr
     scheduler_conf:
+        warmup_steps: 500
     max_epoch: 200
     grad_clip: 5
     accum_grad: 2