Chong Zhang committed on
Commit
3240095
·
verified ·
1 Parent(s): f8621c8

Upload inspiremusic.yaml

Browse files
Files changed (1) hide show
  1. inspiremusic.yaml +7 -11
inspiremusic.yaml CHANGED
@@ -9,19 +9,17 @@ sample_rate: 24000
9
  text_encoder_input_size: 512
10
  llm_input_size: 896
11
  llm_output_size: 896
12
- spk_embed_dim: 192
13
 
14
  # model params
15
  # for all classes/functions included in this repo, we use !<name> or !<new> for initialization, so that users can find every corresponding class/function from this single yaml.
16
  # for system/third_party class/function, we do not require this.
17
- llm: !new:inspiremusic.llm.plm.PLM
18
  text_encoder_input_size: !ref <text_encoder_input_size>
19
  llm_input_size: !ref <llm_input_size>
20
  llm_output_size: !ref <llm_output_size>
21
  audio_token_size: 4096
22
  length_normalized_loss: True
23
  lsm_weight: 0
24
- spk_embed_dim: !ref <spk_embed_dim>
25
  text_encoder_conf:
26
  name: "none"
27
  llm: !new:inspiremusic.transformer.qwen_encoder.QwenEmbeddingEncoder
@@ -36,11 +34,10 @@ llm: !new:inspiremusic.llm.plm.PLM
36
  win_size: 10
37
  tau_r: 0.1
38
  train_cfg_ratio: 0.2
39
- infer_cfg_ratio: 3.0
40
- flow: !new:inspiremusic.flow.flow.MaskedDiffWithXvec
41
  input_size: 256
42
  output_size: 80
43
- spk_embed_dim: !ref <spk_embed_dim>
44
  output_type: 'mel'
45
  vocab_size: 4096
46
  input_frame_rate: 75
@@ -65,8 +62,6 @@ flow: !new:inspiremusic.flow.flow.MaskedDiffWithXvec
65
  sampling_ratios: [1, 1, 1, 1]
66
  decoder: !new:inspiremusic.flow.flow_matching.ConditionalCFM
67
  in_channels: 240
68
- n_spks: 1
69
- spk_emb_dim: 80
70
  cfm_params: !new:omegaconf.DictConfig
71
  content:
72
  sigma_min: 1e-06
@@ -85,6 +80,7 @@ flow: !new:inspiremusic.flow.flow.MaskedDiffWithXvec
85
  num_mid_blocks: 8
86
  num_heads: 8
87
  act_fn: 'gelu'
 
88
 
89
  hift: !new:inspiremusic.hifigan.generator.HiFTGenerator
90
  in_channels: 80
@@ -147,9 +143,8 @@ sort: !name:inspiremusic.dataset.processor.sort
147
  sort_size: 500 # sort_size should be less than shuffle_size
148
  batch: !name:inspiremusic.dataset.processor.batch
149
  batch_type: 'dynamic'
150
- max_frames_in_batch: 30000
151
  padding: !name:inspiremusic.dataset.processor.padding
152
- use_spk_embedding: False # change to True during sft
153
 
154
  # dataset processor pipeline
155
  data_pipeline: [
@@ -162,6 +157,7 @@ data_pipeline: [
162
  !ref <padding>,
163
  ]
164
 
 
165
  # train conf
166
  train_conf:
167
  optim: adam
@@ -169,7 +165,7 @@ train_conf:
169
  lr: 0.00001 # change to 0.001 if you want to train flow from scratch
170
  scheduler: warmuplr
171
  scheduler_conf:
172
- warmup_steps: 1000
173
  max_epoch: 200
174
  grad_clip: 5
175
  accum_grad: 2
 
9
  text_encoder_input_size: 512
10
  llm_input_size: 896
11
  llm_output_size: 896
 
12
 
13
  # model params
14
  # for all classes/functions included in this repo, we use !<name> or !<new> for initialization, so that users can find every corresponding class/function from this single yaml.
15
  # for system/third_party class/function, we do not require this.
16
+ llm: !new:inspiremusic.llm.llm.LLM
17
  text_encoder_input_size: !ref <text_encoder_input_size>
18
  llm_input_size: !ref <llm_input_size>
19
  llm_output_size: !ref <llm_output_size>
20
  audio_token_size: 4096
21
  length_normalized_loss: True
22
  lsm_weight: 0
 
23
  text_encoder_conf:
24
  name: "none"
25
  llm: !new:inspiremusic.transformer.qwen_encoder.QwenEmbeddingEncoder
 
34
  win_size: 10
35
  tau_r: 0.1
36
  train_cfg_ratio: 0.2
37
+ infer_cfg_ratio: 7.0
38
+ flow: !new:inspiremusic.flow.flow.MaskedDiff
39
  input_size: 256
40
  output_size: 80
 
41
  output_type: 'mel'
42
  vocab_size: 4096
43
  input_frame_rate: 75
 
62
  sampling_ratios: [1, 1, 1, 1]
63
  decoder: !new:inspiremusic.flow.flow_matching.ConditionalCFM
64
  in_channels: 240
 
 
65
  cfm_params: !new:omegaconf.DictConfig
66
  content:
67
  sigma_min: 1e-06
 
80
  num_mid_blocks: 8
81
  num_heads: 8
82
  act_fn: 'gelu'
83
+ generator_model_dir: ../../pretrained_models/InspireMusic-Base/music_tokenizer
84
 
85
  hift: !new:inspiremusic.hifigan.generator.HiFTGenerator
86
  in_channels: 80
 
143
  sort_size: 500 # sort_size should be less than shuffle_size
144
  batch: !name:inspiremusic.dataset.processor.batch
145
  batch_type: 'dynamic'
146
+ max_frames_in_batch: 10000
147
  padding: !name:inspiremusic.dataset.processor.padding
 
148
 
149
  # dataset processor pipeline
150
  data_pipeline: [
 
157
  !ref <padding>,
158
  ]
159
 
160
+
161
  # train conf
162
  train_conf:
163
  optim: adam
 
165
  lr: 0.00001 # change to 0.001 if you want to train flow from scratch
166
  scheduler: warmuplr
167
  scheduler_conf:
168
+ warmup_steps: 500
169
  max_epoch: 200
170
  grad_clip: 5
171
  accum_grad: 2