Chong Zhang
committed on
init
Browse files- inspiremusic.yaml +24 -17
inspiremusic.yaml
CHANGED
@@ -6,10 +6,14 @@ __set_seed4: !apply:torch.cuda.manual_seed_all [1024]
|
|
6 |
|
7 |
# fixed params
|
8 |
sample_rate: 24000
|
|
|
9 |
text_encoder_input_size: 512
|
10 |
llm_input_size: 896
|
11 |
llm_output_size: 896
|
12 |
|
|
|
|
|
|
|
13 |
# model params
|
14 |
# for all class/function included in this repo, we use !<name> or !<new> for initialization, so that user may find all corresponding class/function according to one single yaml.
|
15 |
# for system/third_party class/function, we do not require this.
|
@@ -24,17 +28,12 @@ llm: !new:inspiremusic.llm.llm.LLM
|
|
24 |
name: "none"
|
25 |
llm: !new:inspiremusic.transformer.qwen_encoder.QwenEmbeddingEncoder
|
26 |
input_size: !ref <text_encoder_input_size>
|
27 |
-
pretrain_path:
|
28 |
|
29 |
-
|
30 |
-
|
31 |
-
sampling: !name:inspiremusic.utils.common.ras_sampling
|
32 |
-
top_p: 0.8
|
33 |
-
top_k: 50
|
34 |
-
win_size: 10
|
35 |
-
tau_r: 0.1
|
36 |
train_cfg_ratio: 0.2
|
37 |
-
infer_cfg_ratio:
|
38 |
flow: !new:inspiremusic.flow.flow.MaskedDiff
|
39 |
input_size: 256
|
40 |
output_size: 80
|
@@ -80,7 +79,7 @@ flow: !new:inspiremusic.flow.flow.MaskedDiff
|
|
80 |
num_mid_blocks: 8
|
81 |
num_heads: 8
|
82 |
act_fn: 'gelu'
|
83 |
-
generator_model_dir:
|
84 |
|
85 |
hift: !new:inspiremusic.hifigan.generator.HiFTGenerator
|
86 |
in_channels: 80
|
@@ -111,19 +110,24 @@ wavtokenizer: !new:inspiremusic.hifigan.generator.HiFTGenerator
|
|
111 |
# processor functions
|
112 |
parquet_opener: !name:inspiremusic.dataset.processor.parquet_opener
|
113 |
get_tokenizer: !name:inspiremusic.text.tokenizer.get_tokenizer
|
114 |
-
tokenizer_path:
|
115 |
tokenizer_name: "qwen-2.0"
|
116 |
allowed_special: 'all'
|
117 |
tokenize: !name:inspiremusic.dataset.processor.tokenize
|
118 |
get_tokenizer: !ref <get_tokenizer>
|
119 |
allowed_special: !ref <allowed_special>
|
120 |
filter: !name:inspiremusic.dataset.processor.filter
|
121 |
-
max_length:
|
122 |
-
min_length:
|
123 |
token_max_length: 200
|
124 |
token_min_length: 1
|
|
|
|
|
|
|
|
|
125 |
resample: !name:inspiremusic.dataset.processor.resample
|
126 |
resample_rate: !ref <sample_rate>
|
|
|
127 |
feat_extractor: !name:matcha.utils.audio.mel_spectrogram
|
128 |
n_fft: 1024
|
129 |
num_mels: 128
|
@@ -131,7 +135,7 @@ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
|
|
131 |
hop_size: 256
|
132 |
win_size: 1024
|
133 |
fmin: 0
|
134 |
-
fmax:
|
135 |
center: False
|
136 |
compute_fbank: !name:inspiremusic.dataset.processor.compute_fbank
|
137 |
feat_extractor: !ref <feat_extractor>
|
@@ -143,8 +147,11 @@ sort: !name:inspiremusic.dataset.processor.sort
|
|
143 |
sort_size: 500 # sort_size should be less than shuffle_size
|
144 |
batch: !name:inspiremusic.dataset.processor.batch
|
145 |
batch_type: 'dynamic'
|
146 |
-
max_frames_in_batch:
|
|
|
|
|
147 |
padding: !name:inspiremusic.dataset.processor.padding
|
|
|
148 |
|
149 |
# dataset processor pipeline
|
150 |
data_pipeline: [
|
@@ -162,7 +169,7 @@ data_pipeline: [
|
|
162 |
train_conf:
|
163 |
optim: adam
|
164 |
optim_conf:
|
165 |
-
lr: 0.
|
166 |
scheduler: warmuplr
|
167 |
scheduler_conf:
|
168 |
warmup_steps: 500
|
@@ -170,4 +177,4 @@ train_conf:
|
|
170 |
grad_clip: 5
|
171 |
accum_grad: 2
|
172 |
log_interval: 100
|
173 |
-
save_per_step:
|
|
|
6 |
|
7 |
# fixed params
|
8 |
sample_rate: 24000
|
9 |
+
target_sample_rate: 48000
|
10 |
text_encoder_input_size: 512
|
11 |
llm_input_size: 896
|
12 |
llm_output_size: 896
|
13 |
|
14 |
+
basemodel_path: '../../pretrained_models/InspireMusic-Base/'
|
15 |
+
generator_path: '../../pretrained_models/InspireMusic-Base/music_tokenizer'
|
16 |
+
|
17 |
# model params
|
18 |
# for all class/function included in this repo, we use !<name> or !<new> for initialization, so that user may find all corresponding class/function according to one single yaml.
|
19 |
# for system/third_party class/function, we do not require this.
|
|
|
28 |
name: "none"
|
29 |
llm: !new:inspiremusic.transformer.qwen_encoder.QwenEmbeddingEncoder
|
30 |
input_size: !ref <text_encoder_input_size>
|
31 |
+
pretrain_path: !ref <basemodel_path>
|
32 |
|
33 |
+
sampling: !name:inspiremusic.utils.common.topk_sampling
|
34 |
+
top_k: 350
|
|
|
|
|
|
|
|
|
|
|
35 |
train_cfg_ratio: 0.2
|
36 |
+
infer_cfg_ratio: 3.0
|
37 |
flow: !new:inspiremusic.flow.flow.MaskedDiff
|
38 |
input_size: 256
|
39 |
output_size: 80
|
|
|
79 |
num_mid_blocks: 8
|
80 |
num_heads: 8
|
81 |
act_fn: 'gelu'
|
82 |
+
generator_model_dir: !ref <generator_path>
|
83 |
|
84 |
hift: !new:inspiremusic.hifigan.generator.HiFTGenerator
|
85 |
in_channels: 80
|
|
|
110 |
# processor functions
|
111 |
parquet_opener: !name:inspiremusic.dataset.processor.parquet_opener
|
112 |
get_tokenizer: !name:inspiremusic.text.tokenizer.get_tokenizer
|
113 |
+
tokenizer_path: !ref <basemodel_path>
|
114 |
tokenizer_name: "qwen-2.0"
|
115 |
allowed_special: 'all'
|
116 |
tokenize: !name:inspiremusic.dataset.processor.tokenize
|
117 |
get_tokenizer: !ref <get_tokenizer>
|
118 |
allowed_special: !ref <allowed_special>
|
119 |
filter: !name:inspiremusic.dataset.processor.filter
|
120 |
+
max_length: 20000
|
121 |
+
min_length: 1
|
122 |
token_max_length: 200
|
123 |
token_min_length: 1
|
124 |
+
max_acoustic_length: 20000
|
125 |
+
min_acoustic_length: 1800
|
126 |
+
mode: 'train_flow'
|
127 |
+
|
128 |
resample: !name:inspiremusic.dataset.processor.resample
|
129 |
resample_rate: !ref <sample_rate>
|
130 |
+
|
131 |
feat_extractor: !name:matcha.utils.audio.mel_spectrogram
|
132 |
n_fft: 1024
|
133 |
num_mels: 128
|
|
|
135 |
hop_size: 256
|
136 |
win_size: 1024
|
137 |
fmin: 0
|
138 |
+
fmax: 24000
|
139 |
center: False
|
140 |
compute_fbank: !name:inspiremusic.dataset.processor.compute_fbank
|
141 |
feat_extractor: !ref <feat_extractor>
|
|
|
147 |
sort_size: 500 # sort_size should be less than shuffle_size
|
148 |
batch: !name:inspiremusic.dataset.processor.batch
|
149 |
batch_type: 'dynamic'
|
150 |
+
max_frames_in_batch: 15500 # llm 12000
|
151 |
+
# batch_type: 'static'
|
152 |
+
# batch_size: 2 # llm 12000
|
153 |
padding: !name:inspiremusic.dataset.processor.padding
|
154 |
+
mode: 'train'
|
155 |
|
156 |
# dataset processor pipeline
|
157 |
data_pipeline: [
|
|
|
169 |
train_conf:
|
170 |
optim: adam
|
171 |
optim_conf:
|
172 |
+
lr: 0.0001 # change to 0.001 if you want to train flow from scratch
|
173 |
scheduler: warmuplr
|
174 |
scheduler_conf:
|
175 |
warmup_steps: 500
|
|
|
177 |
grad_clip: 5
|
178 |
accum_grad: 2
|
179 |
log_interval: 100
|
180 |
+
save_per_step: 500
|