jamierpond committed on
Commit
a721832
·
1 Parent(s): 4fb705d

update pycache and model paths

Browse files
Files changed (2) hide show
  1. .gitignore +1 -0
  2. models/soundstream_hubert_new.py +28 -24
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ **__pycache__**
models/soundstream_hubert_new.py CHANGED
@@ -1,4 +1,4 @@
1
-
2
 
3
  from typing import Sequence, Optional, Union
4
  import sys
@@ -28,19 +28,19 @@ import descriptaudiocodec.dac.model.dac as dac2
28
  def get_model_size(model):
29
  # 计算总参数数
30
  total_params = sum(p.numel() for p in model.parameters())
31
-
32
  # 假设每个参数都是32位浮点数,计算模型大小(以字节为单位)
33
  model_size_bytes = total_params # 每个参数4字节
34
-
35
  # 转换为更易读的单位(例如,MB)
36
  model_size_mb = model_size_bytes / (1024 ** 2)
37
-
38
  return total_params, model_size_mb
39
 
40
 
41
  class SoundStream(nn.Module):
42
  """ SoundStream model or EnCodec model.
43
-
44
  Args:
45
  n_filters (int): n_filters (int): Base width for the model.
46
  D (int): Intermediate representation dimension.
@@ -82,7 +82,7 @@ class SoundStream(nn.Module):
82
  # out_D=D+768
83
  self.quantizer = ResidualVectorQuantizer(dimension=D+768, n_q=n_q, bins=bins)
84
  # Decoder model
85
-
86
  # self.decoder = SEANetDecoder(n_filters= n_filters, dimension=D, ratios=ratios, causal=causal)
87
  self.decoder_2 = dac2.Decoder( D,1024,ratios,)
88
 
@@ -92,19 +92,23 @@ class SoundStream(nn.Module):
92
  # )#.to(self.args.device)
93
  # self.upstream.model = self.upstream.model.to(self.device)
94
  c=1
95
- # self.upstream(wavs)
96
  # self.processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
97
 
98
- self.is_semantic= True
99
  if self.is_semantic:
100
- # self.semantic_model = AutoModel.from_pretrained("/aifs4su/data/zheny/DiT_TTS/ckpts/yz_2")
101
  # self.semantic_model = AutoModel.from_pretrained("/aifs4su/data/zheny/fairseq/outputs/2024-05-11/13-27-56/hf15")
102
- self.semantic_model = AutoModel.from_pretrained("./xcodec_mini_infer/semantic_ckpts/hf_1_325000")
 
 
 
 
103
  self.semantic_model.eval()
104
  # self.transform_linear = nn.Linear(1024, 768)
105
 
106
 
107
-
108
  # processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
109
  # self.semantic_model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
110
  self.fc_prior = nn.Linear(D+768, D+768 )
@@ -114,9 +118,9 @@ class SoundStream(nn.Module):
114
 
115
  def get_last_layer(self):
116
  return self.decoder.layers[-1].weight
117
-
118
- def calculate_rec_loss(self, rec, target):
119
-
120
  target = target / target.norm(dim=-1, keepdim=True)
121
  rec = rec / rec.norm(dim=-1, keepdim=True)
122
  rec_loss = (1 - (target * rec).sum(-1)).mean()
@@ -131,32 +135,32 @@ class SoundStream(nn.Module):
131
  x = F.pad(x, (160, 160))
132
  target = self.semantic_model(x, output_hidden_states=True) .hidden_states
133
  target = torch.stack(target, dim=1)#.transpose(-1, -2)#.flatten(start_dim=1, end_dim=2)
134
-
135
- target = target.mean(1)
136
  # target = target[9]
137
  return target
138
 
139
-
140
  def forward(self, x: torch.Tensor, bw: int):
141
 
142
  e_semantic_input = self.get_regress_target_whisper(x).detach()
143
 
144
  e_semantic = self.encoder_semantic(e_semantic_input.transpose(1, 2))
145
  e_acoustic = self.encoder(x)
146
-
147
-
148
  e= torch.cat([e_acoustic, e_semantic], dim=1)
149
 
150
  e = self.fc_prior(e.transpose(1, 2)).transpose(1, 2)
151
 
152
-
153
  quantized, codes, bandwidth, commit_loss = self.quantizer(e, self.frame_rate, bw)
154
 
155
  quantized_semantic = self.fc_post1(quantized.transpose(1, 2)).transpose(1, 2)
156
  quantized_acoustic = self.fc_post2(quantized.transpose(1, 2)).transpose(1, 2)
157
 
158
  o = self.decoder_2(quantized_acoustic)
159
-
160
  o_semantic = self.decoder_semantic(quantized_semantic )
161
  semantic_recon_loss = F.mse_loss(e_semantic_input.transpose(1, 2).detach(),o_semantic)
162
 
@@ -171,7 +175,7 @@ class SoundStream(nn.Module):
171
  bw = target_bw
172
  # codes = self.quantizer.encode(e, self.frame_rate, bw)
173
 
174
-
175
  # if e_acoustic.shape[2] != e_semantic.shape[2]:
176
  # print(f"e_acoustic {e_acoustic.shape} e_semantic{e_semantic.shape}")
177
 
@@ -182,9 +186,9 @@ class SoundStream(nn.Module):
182
 
183
 
184
  if e_acoustic.shape[2] != e_semantic.shape[2]:
185
- # e_acoustic = self.encoder(F.pad(x[:,0,:], (160, 160)).unsqueeze(0))
186
  e_acoustic = self.encoder(torch.transpose(F.pad(x[:,0,:], (160, 160)).unsqueeze(0), 0, 1))
187
-
188
  e= torch.cat([e_acoustic, e_semantic], dim=1)
189
 
190
  e = self.fc_prior(e.transpose(1, 2)).transpose(1, 2)
 
1
+
2
 
3
  from typing import Sequence, Optional, Union
4
  import sys
 
28
  def get_model_size(model):
29
  # 计算总参数数
30
  total_params = sum(p.numel() for p in model.parameters())
31
+
32
  # 假设每个参数都是32位浮点数,计算模型大小(以字节为单位)
33
  model_size_bytes = total_params # 每个参数4字节
34
+
35
  # 转换为更易读的单位(例如,MB)
36
  model_size_mb = model_size_bytes / (1024 ** 2)
37
+
38
  return total_params, model_size_mb
39
 
40
 
41
  class SoundStream(nn.Module):
42
  """ SoundStream model or EnCodec model.
43
+
44
  Args:
45
  n_filters (int): n_filters (int): Base width for the model.
46
  D (int): Intermediate representation dimension.
 
82
  # out_D=D+768
83
  self.quantizer = ResidualVectorQuantizer(dimension=D+768, n_q=n_q, bins=bins)
84
  # Decoder model
85
+
86
  # self.decoder = SEANetDecoder(n_filters= n_filters, dimension=D, ratios=ratios, causal=causal)
87
  self.decoder_2 = dac2.Decoder( D,1024,ratios,)
88
 
 
92
  # )#.to(self.args.device)
93
  # self.upstream.model = self.upstream.model.to(self.device)
94
  c=1
95
+ # self.upstream(wavs)
96
  # self.processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
97
 
98
+ self.is_semantic= True
99
  if self.is_semantic:
100
+ # self.semantic_model = AutoModel.from_pretrained("/aifs4su/data/zheny/DiT_TTS/ckpts/yz_2")
101
  # self.semantic_model = AutoModel.from_pretrained("/aifs4su/data/zheny/fairseq/outputs/2024-05-11/13-27-56/hf15")
102
+ import os
103
+ this_dir = os.path.dirname(os.path.abspath(__file__)) # models
104
+ parent_dir = os.path.dirname(this_dir) # xcodec_mini_infer
105
+ model_dir = os.path.join(parent_dir, 'semantic_ckpts/hf_1_325000')
106
+ self.semantic_model = AutoModel.from_pretrained(model_dir)
107
  self.semantic_model.eval()
108
  # self.transform_linear = nn.Linear(1024, 768)
109
 
110
 
111
+
112
  # processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
113
  # self.semantic_model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
114
  self.fc_prior = nn.Linear(D+768, D+768 )
 
118
 
119
  def get_last_layer(self):
120
  return self.decoder.layers[-1].weight
121
+
122
+ def calculate_rec_loss(self, rec, target):
123
+
124
  target = target / target.norm(dim=-1, keepdim=True)
125
  rec = rec / rec.norm(dim=-1, keepdim=True)
126
  rec_loss = (1 - (target * rec).sum(-1)).mean()
 
135
  x = F.pad(x, (160, 160))
136
  target = self.semantic_model(x, output_hidden_states=True) .hidden_states
137
  target = torch.stack(target, dim=1)#.transpose(-1, -2)#.flatten(start_dim=1, end_dim=2)
138
+
139
+ target = target.mean(1)
140
  # target = target[9]
141
  return target
142
 
143
+
144
  def forward(self, x: torch.Tensor, bw: int):
145
 
146
  e_semantic_input = self.get_regress_target_whisper(x).detach()
147
 
148
  e_semantic = self.encoder_semantic(e_semantic_input.transpose(1, 2))
149
  e_acoustic = self.encoder(x)
150
+
151
+
152
  e= torch.cat([e_acoustic, e_semantic], dim=1)
153
 
154
  e = self.fc_prior(e.transpose(1, 2)).transpose(1, 2)
155
 
156
+
157
  quantized, codes, bandwidth, commit_loss = self.quantizer(e, self.frame_rate, bw)
158
 
159
  quantized_semantic = self.fc_post1(quantized.transpose(1, 2)).transpose(1, 2)
160
  quantized_acoustic = self.fc_post2(quantized.transpose(1, 2)).transpose(1, 2)
161
 
162
  o = self.decoder_2(quantized_acoustic)
163
+
164
  o_semantic = self.decoder_semantic(quantized_semantic )
165
  semantic_recon_loss = F.mse_loss(e_semantic_input.transpose(1, 2).detach(),o_semantic)
166
 
 
175
  bw = target_bw
176
  # codes = self.quantizer.encode(e, self.frame_rate, bw)
177
 
178
+
179
  # if e_acoustic.shape[2] != e_semantic.shape[2]:
180
  # print(f"e_acoustic {e_acoustic.shape} e_semantic{e_semantic.shape}")
181
 
 
186
 
187
 
188
  if e_acoustic.shape[2] != e_semantic.shape[2]:
189
+ # e_acoustic = self.encoder(F.pad(x[:,0,:], (160, 160)).unsqueeze(0))
190
  e_acoustic = self.encoder(torch.transpose(F.pad(x[:,0,:], (160, 160)).unsqueeze(0), 0, 1))
191
+
192
  e= torch.cat([e_acoustic, e_semantic], dim=1)
193
 
194
  e = self.fc_prior(e.transpose(1, 2)).transpose(1, 2)