Commit a721832
Parent(s): 4fb705d

update pycache and model paths

Files changed:
- .gitignore +1 -0
- models/soundstream_hubert_new.py +28 -24
.gitignore ADDED

@@ -0,0 +1 @@
+__pycache__
models/soundstream_hubert_new.py CHANGED

@@ -1,4 +1,4 @@
-
+
 
 from typing import Sequence, Optional, Union
 import sys
@@ -28,19 +28,19 @@ import descriptaudiocodec.dac.model.dac as dac2
 def get_model_size(model):
     # Count the total number of parameters
     total_params = sum(p.numel() for p in model.parameters())
-
+
     # Assume each parameter is a 32-bit float and compute the model size in bytes
     model_size_bytes = total_params  # 4 bytes per parameter
-
+
     # Convert to a more readable unit (e.g. MB)
     model_size_mb = model_size_bytes / (1024 ** 2)
-
+
     return total_params, model_size_mb
 
 
 class SoundStream(nn.Module):
     """ SoundStream model or EnCodec model.
-
+
     Args:
         n_filters (int): Base width for the model.
         D (int): Intermediate representation dimension.
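Note on the context lines above: the comment in get_model_size assumes 4 bytes per 32-bit parameter, but model_size_bytes is assigned the raw parameter count, so the reported MB figure is off by a factor of four. A minimal corrected sketch under that 4-bytes-per-float32 assumption (illustrative, not part of this commit):

import torch.nn as nn

def get_model_size(model: nn.Module):
    # Total number of parameters across all tensors.
    total_params = sum(p.numel() for p in model.parameters())
    # float32 parameters occupy 4 bytes each.
    model_size_bytes = total_params * 4
    # Convert bytes to mebibytes for readability.
    model_size_mb = model_size_bytes / (1024 ** 2)
    return total_params, model_size_mb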
@@ -82,7 +82,7 @@ class SoundStream(nn.Module):
         # out_D=D+768
         self.quantizer = ResidualVectorQuantizer(dimension=D+768, n_q=n_q, bins=bins)
         # Decoder model
-
+
         # self.decoder = SEANetDecoder(n_filters=n_filters, dimension=D, ratios=ratios, causal=causal)
         self.decoder_2 = dac2.Decoder(D, 1024, ratios)
 
@@ -92,19 +92,23 @@ class SoundStream(nn.Module):
         # )#.to(self.args.device)
         # self.upstream.model = self.upstream.model.to(self.device)
         c = 1
-        # self.upstream(wavs)
+        # self.upstream(wavs)
         # self.processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
 
-        self.is_semantic = True
+        self.is_semantic = True
         if self.is_semantic:
-            # self.semantic_model = AutoModel.from_pretrained("/aifs4su/data/zheny/DiT_TTS/ckpts/yz_2")
+            # self.semantic_model = AutoModel.from_pretrained("/aifs4su/data/zheny/DiT_TTS/ckpts/yz_2")
             # self.semantic_model = AutoModel.from_pretrained("/aifs4su/data/zheny/fairseq/outputs/2024-05-11/13-27-56/hf15")
-
+            import os
+            this_dir = os.path.dirname(os.path.abspath(__file__))  # models
+            parent_dir = os.path.dirname(this_dir)  # xcodec_mini_infer
+            model_dir = os.path.join(parent_dir, 'semantic_ckpts/hf_1_325000')
+            self.semantic_model = AutoModel.from_pretrained(model_dir)
             self.semantic_model.eval()
             # self.transform_linear = nn.Linear(1024, 768)
 
 
-
+
         # processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
         # self.semantic_model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
         self.fc_prior = nn.Linear(D+768, D+768)
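This hunk carries the substantive change of the commit: the semantic model checkpoint is now resolved relative to the source file instead of a hardcoded absolute path. A standalone sketch of the added lookup (the directory comments reflect the xcodec_mini_infer/models layout the diff assumes):

import os

# Locate the checkpoint directory relative to this file, as the added lines do.
this_dir = os.path.dirname(os.path.abspath(__file__))   # .../xcodec_mini_infer/models
parent_dir = os.path.dirname(this_dir)                  # .../xcodec_mini_infer
model_dir = os.path.join(parent_dir, 'semantic_ckpts/hf_1_325000')
print(model_dir)

Because the path is derived from __file__, AutoModel.from_pretrained(model_dir) now works from any working directory, provided the checkpoint ships alongside the code.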
@@ -114,9 +118,9 @@ class SoundStream(nn.Module):
 
     def get_last_layer(self):
         return self.decoder.layers[-1].weight
-
-    def calculate_rec_loss(self, rec, target):
-
+
+    def calculate_rec_loss(self, rec, target):
+
         target = target / target.norm(dim=-1, keepdim=True)
         rec = rec / rec.norm(dim=-1, keepdim=True)
         rec_loss = (1 - (target * rec).sum(-1)).mean()
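For reference, calculate_rec_loss above (touched only by whitespace here) is a mean cosine-distance loss; an equivalent formulation (a sketch, not part of the commit):

import torch
import torch.nn.functional as F

def calculate_rec_loss(rec: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
    # Mean of (1 - cosine similarity) along the last dimension,
    # matching the normalize-then-dot version in the diff.
    return (1 - F.cosine_similarity(rec, target, dim=-1)).mean()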
@@ -131,32 +135,32 @@ class SoundStream(nn.Module):
         x = F.pad(x, (160, 160))
         target = self.semantic_model(x, output_hidden_states=True).hidden_states
         target = torch.stack(target, dim=1)  #.transpose(-1, -2)#.flatten(start_dim=1, end_dim=2)
-
-        target = target.mean(1)
+
+        target = target.mean(1)
         # target = target[9]
         return target
 
-
+
     def forward(self, x: torch.Tensor, bw: int):
 
         e_semantic_input = self.get_regress_target_whisper(x).detach()
 
         e_semantic = self.encoder_semantic(e_semantic_input.transpose(1, 2))
         e_acoustic = self.encoder(x)
-
-
+
+
         e = torch.cat([e_acoustic, e_semantic], dim=1)
 
         e = self.fc_prior(e.transpose(1, 2)).transpose(1, 2)
 
-
+
         quantized, codes, bandwidth, commit_loss = self.quantizer(e, self.frame_rate, bw)
 
         quantized_semantic = self.fc_post1(quantized.transpose(1, 2)).transpose(1, 2)
         quantized_acoustic = self.fc_post2(quantized.transpose(1, 2)).transpose(1, 2)
 
         o = self.decoder_2(quantized_acoustic)
-
+
         o_semantic = self.decoder_semantic(quantized_semantic)
         semantic_recon_loss = F.mse_loss(e_semantic_input.transpose(1, 2).detach(), o_semantic)
 

@@ -171,7 +175,7 @@ class SoundStream(nn.Module):
             bw = target_bw
         # codes = self.quantizer.encode(e, self.frame_rate, bw)
 
-
+
         # if e_acoustic.shape[2] != e_semantic.shape[2]:
         #     print(f"e_acoustic {e_acoustic.shape} e_semantic {e_semantic.shape}")
 

@@ -182,9 +186,9 @@ class SoundStream(nn.Module):
 
 
         if e_acoustic.shape[2] != e_semantic.shape[2]:
-            # e_acoustic = self.encoder(F.pad(x[:,0,:], (160, 160)).unsqueeze(0))
+            # e_acoustic = self.encoder(F.pad(x[:,0,:], (160, 160)).unsqueeze(0))
             e_acoustic = self.encoder(torch.transpose(F.pad(x[:,0,:], (160, 160)).unsqueeze(0), 0, 1))
-
+
         e = torch.cat([e_acoustic, e_semantic], dim=1)
 
         e = self.fc_prior(e.transpose(1, 2)).transpose(1, 2)