Chong Zhang committed on
Commit
3bdbe06
·
verified ·
1 Parent(s): 1b65ad1
Files changed (1) hide show
  1. inspiremusic.yaml +24 -17
inspiremusic.yaml CHANGED
@@ -6,10 +6,14 @@ __set_seed4: !apply:torch.cuda.manual_seed_all [1024]
6
 
7
  # fixed params
8
  sample_rate: 24000
 
9
  text_encoder_input_size: 512
10
  llm_input_size: 896
11
  llm_output_size: 896
12
 
 
 
 
13
  # model params
14
  # for all class/function included in this repo, we use !<name> or !<new> for initialization, so that user may find all corresponding class/function according to one single yaml.
15
  # for system/third_party class/function, we do not require this.
@@ -24,17 +28,12 @@ llm: !new:inspiremusic.llm.llm.LLM
24
  name: "none"
25
  llm: !new:inspiremusic.transformer.qwen_encoder.QwenEmbeddingEncoder
26
  input_size: !ref <text_encoder_input_size>
27
- pretrain_path: ../../pretrained_models/InspireMusic-Base/
28
 
29
- # sampling: !name:inspiremusic.utils.common.topk_sampling
30
- # top_k: 350
31
- sampling: !name:inspiremusic.utils.common.ras_sampling
32
- top_p: 0.8
33
- top_k: 50
34
- win_size: 10
35
- tau_r: 0.1
36
  train_cfg_ratio: 0.2
37
- infer_cfg_ratio: 7.0
38
  flow: !new:inspiremusic.flow.flow.MaskedDiff
39
  input_size: 256
40
  output_size: 80
@@ -80,7 +79,7 @@ flow: !new:inspiremusic.flow.flow.MaskedDiff
80
  num_mid_blocks: 8
81
  num_heads: 8
82
  act_fn: 'gelu'
83
- generator_model_dir: ../../pretrained_models/InspireMusic-Base/music_tokenizer
84
 
85
  hift: !new:inspiremusic.hifigan.generator.HiFTGenerator
86
  in_channels: 80
@@ -111,19 +110,24 @@ wavtokenizer: !new:inspiremusic.hifigan.generator.HiFTGenerator
111
  # processor functions
112
  parquet_opener: !name:inspiremusic.dataset.processor.parquet_opener
113
  get_tokenizer: !name:inspiremusic.text.tokenizer.get_tokenizer
114
- tokenizer_path: "../../pretrained_models/InspireMusic-Base/"
115
  tokenizer_name: "qwen-2.0"
116
  allowed_special: 'all'
117
  tokenize: !name:inspiremusic.dataset.processor.tokenize
118
  get_tokenizer: !ref <get_tokenizer>
119
  allowed_special: !ref <allowed_special>
120
  filter: !name:inspiremusic.dataset.processor.filter
121
- max_length: 28000
122
- min_length: 0
123
  token_max_length: 200
124
  token_min_length: 1
 
 
 
 
125
  resample: !name:inspiremusic.dataset.processor.resample
126
  resample_rate: !ref <sample_rate>
 
127
  feat_extractor: !name:matcha.utils.audio.mel_spectrogram
128
  n_fft: 1024
129
  num_mels: 128
@@ -131,7 +135,7 @@ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
131
  hop_size: 256
132
  win_size: 1024
133
  fmin: 0
134
- fmax: 12000
135
  center: False
136
  compute_fbank: !name:inspiremusic.dataset.processor.compute_fbank
137
  feat_extractor: !ref <feat_extractor>
@@ -143,8 +147,11 @@ sort: !name:inspiremusic.dataset.processor.sort
143
  sort_size: 500 # sort_size should be less than shuffle_size
144
  batch: !name:inspiremusic.dataset.processor.batch
145
  batch_type: 'dynamic'
146
- max_frames_in_batch: 10000
 
 
147
  padding: !name:inspiremusic.dataset.processor.padding
 
148
 
149
  # dataset processor pipeline
150
  data_pipeline: [
@@ -162,7 +169,7 @@ data_pipeline: [
162
  train_conf:
163
  optim: adam
164
  optim_conf:
165
- lr: 0.00001 # change to 0.001 if you want to train flow from scratch
166
  scheduler: warmuplr
167
  scheduler_conf:
168
  warmup_steps: 500
@@ -170,4 +177,4 @@ train_conf:
170
  grad_clip: 5
171
  accum_grad: 2
172
  log_interval: 100
173
- save_per_step: 10000
 
6
 
7
  # fixed params
8
  sample_rate: 24000
9
+ target_sample_rate: 48000
10
  text_encoder_input_size: 512
11
  llm_input_size: 896
12
  llm_output_size: 896
13
 
14
+ basemodel_path: '../../pretrained_models/InspireMusic-Base/'
15
+ generator_path: '../../pretrained_models/InspireMusic-Base/music_tokenizer'
16
+
17
  # model params
18
  # for all class/function included in this repo, we use !<name> or !<new> for initialization, so that user may find all corresponding class/function according to one single yaml.
19
  # for system/third_party class/function, we do not require this.
 
28
  name: "none"
29
  llm: !new:inspiremusic.transformer.qwen_encoder.QwenEmbeddingEncoder
30
  input_size: !ref <text_encoder_input_size>
31
+ pretrain_path: !ref <basemodel_path>
32
 
33
+ sampling: !name:inspiremusic.utils.common.topk_sampling
34
+ top_k: 350
 
 
 
 
 
35
  train_cfg_ratio: 0.2
36
+ infer_cfg_ratio: 3.0
37
  flow: !new:inspiremusic.flow.flow.MaskedDiff
38
  input_size: 256
39
  output_size: 80
 
79
  num_mid_blocks: 8
80
  num_heads: 8
81
  act_fn: 'gelu'
82
+ generator_model_dir: !ref <generator_path>
83
 
84
  hift: !new:inspiremusic.hifigan.generator.HiFTGenerator
85
  in_channels: 80
 
110
  # processor functions
111
  parquet_opener: !name:inspiremusic.dataset.processor.parquet_opener
112
  get_tokenizer: !name:inspiremusic.text.tokenizer.get_tokenizer
113
+ tokenizer_path: !ref <basemodel_path>
114
  tokenizer_name: "qwen-2.0"
115
  allowed_special: 'all'
116
  tokenize: !name:inspiremusic.dataset.processor.tokenize
117
  get_tokenizer: !ref <get_tokenizer>
118
  allowed_special: !ref <allowed_special>
119
  filter: !name:inspiremusic.dataset.processor.filter
120
+ max_length: 20000
121
+ min_length: 1
122
  token_max_length: 200
123
  token_min_length: 1
124
+ max_acoustic_length: 20000
125
+ min_acoustic_length: 1800
126
+ mode: 'train_flow'
127
+
128
  resample: !name:inspiremusic.dataset.processor.resample
129
  resample_rate: !ref <sample_rate>
130
+
131
  feat_extractor: !name:matcha.utils.audio.mel_spectrogram
132
  n_fft: 1024
133
  num_mels: 128
 
135
  hop_size: 256
136
  win_size: 1024
137
  fmin: 0
138
+ fmax: 24000
139
  center: False
140
  compute_fbank: !name:inspiremusic.dataset.processor.compute_fbank
141
  feat_extractor: !ref <feat_extractor>
 
147
  sort_size: 500 # sort_size should be less than shuffle_size
148
  batch: !name:inspiremusic.dataset.processor.batch
149
  batch_type: 'dynamic'
150
+ max_frames_in_batch: 15500 # llm 12000
151
+ # batch_type: 'static'
152
+ # batch_size: 2 # llm 12000
153
  padding: !name:inspiremusic.dataset.processor.padding
154
+ mode: 'train'
155
 
156
  # dataset processor pipeline
157
  data_pipeline: [
 
169
  train_conf:
170
  optim: adam
171
  optim_conf:
172
+ lr: 0.0001 # change to 0.001 if you want to train flow from scratch
173
  scheduler: warmuplr
174
  scheduler_conf:
175
  warmup_steps: 500
 
177
  grad_clip: 5
178
  accum_grad: 2
179
  log_interval: 100
180
+ save_per_step: 500