update
- examples/nx_clean_unet/yaml/config.yaml +5 -5
- toolbox/torchaudio/models/nx_clean_unet/modeling_nx_clean_unet.py +16 -9
- toolbox/torchaudio/models/nx_clean_unet/{transformer → transformers}/__init__.py +0 -0
- toolbox/torchaudio/models/nx_clean_unet/{transformer → transformers}/attention.py +0 -0
- toolbox/torchaudio/models/nx_clean_unet/{transformer → transformers}/mask.py +0 -0
- toolbox/torchaudio/models/nx_clean_unet/{transformer/transformer.py → transformers/transformers.py} +4 -6
examples/nx_clean_unet/yaml/config.yaml
CHANGED
@@ -6,7 +6,7 @@ n_fft: 512
 win_size: 200
 hop_size: 80
 
-down_sampling_num_layers:
+down_sampling_num_layers: 6
 down_sampling_in_channels: 1
 down_sampling_hidden_channels: 64
 down_sampling_kernel_size: 4
@@ -18,16 +18,16 @@ causal_kernel_size: 3
 causal_bias: false
 causal_separable: true
 causal_f_stride: 1
-causal_num_layers:
+causal_num_layers: 5
 
 tsfm_hidden_size: 256
 tsfm_attention_heads: 8
 tsfm_num_blocks: 6
 tsfm_dropout_rate: 0.1
 tsfm_max_length: 512
-tsfm_chunk_size:
-tsfm_num_left_chunks:
-tsfm_num_right_chunks:
+tsfm_chunk_size: 1
+tsfm_num_left_chunks: 128
+tsfm_num_right_chunks: 4
 
 discriminator_dim: 32
 discriminator_in_channel: 2
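The newly pinned tsfm_chunk_size, tsfm_num_left_chunks and tsfm_num_right_chunks values parameterize chunk-based self-attention in the transformer bottleneck; the renamed transformers module imports subsequent_chunk_mask for exactly this purpose. The sketch below is a hypothetical stand-in, not the repository's function (the exact mask semantics of subsequent_chunk_mask here are an assumption): each position may attend to a bounded number of past chunks and a small number of future chunks.

import torch

def chunk_attention_mask(size: int,
                         chunk_size: int,
                         num_left_chunks: int = -1,
                         num_right_chunks: int = 0) -> torch.Tensor:
    # mask[i, j] is True where position i is allowed to attend to position j
    mask = torch.zeros(size, size, dtype=torch.bool)
    for i in range(size):
        chunk_idx = i // chunk_size
        if num_left_chunks < 0:
            start = 0  # unlimited left context
        else:
            start = max((chunk_idx - num_left_chunks) * chunk_size, 0)
        end = min((chunk_idx + 1 + num_right_chunks) * chunk_size, size)
        mask[i, start:end] = True
    return mask

# e.g. with chunk_size=2, one left chunk, no lookahead: positions 2-3 may
# attend to positions 0-3, positions 4-5 to positions 2-5, and so on.
print(chunk_attention_mask(6, 2, num_left_chunks=1, num_right_chunks=0))

Under this reading, the committed values (chunk size 1, 128 left chunks, 4 right chunks) would give each time step a 128-step history window plus 4 steps of lookahead, keeping the bottleneck streamable with a fixed latency.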
toolbox/torchaudio/models/nx_clean_unet/modeling_nx_clean_unet.py
CHANGED
@@ -1,7 +1,7 @@
 #!/usr/bin/python3
 # -*- coding: utf-8 -*-
 import os
-from typing import Optional, Union
+from typing import List, Optional, Union
 
 import numpy as np
 import torch
@@ -10,7 +10,7 @@ from torch.nn import functional as F
 
 from toolbox.torchaudio.configuration_utils import CONFIG_FILE
 from toolbox.torchaudio.models.nx_clean_unet.configuration_nx_clean_unet import NXCleanUNetConfig
-from toolbox.torchaudio.models.nx_clean_unet.transformer.transformer import TransformerEncoder
+from toolbox.torchaudio.models.nx_clean_unet.transformers.transformers import TransformerEncoder
 from toolbox.torchaudio.models.nx_clean_unet.causal_convolution.causal_conv2d import CausalConv2dEncoder
 
 
@@ -66,10 +66,12 @@ class DownSampling(nn.Module):
 
     def forward(self, x: torch.Tensor):
         # x shape: [batch_size, channels, num_samples]
+        skip_connection_list = list()
         for down_sampling_block in self.down_sampling_block_list:
             x = down_sampling_block.forward(x)
+            skip_connection_list.append(x)
         # x shape: [batch_size, hidden_channels, num_samples**]
-        return x
+        return x, skip_connection_list
 
 
 class UpSamplingBlock(nn.Module):
@@ -134,9 +136,14 @@ class UpSampling(nn.Module):
         up_sampling_block_list.append(up_sampling_block)
         self.up_sampling_block_list = nn.ModuleList(modules=up_sampling_block_list)
 
-    def forward(self, x: torch.Tensor):
+    def forward(self, x: torch.Tensor, skip_connection_list: List[torch.Tensor]):
+        skip_connection_list = skip_connection_list[::-1]
+
         # x shape: [batch_size, channels, num_samples]
-        for up_sampling_block in self.up_sampling_block_list:
+        for idx, up_sampling_block in enumerate(self.up_sampling_block_list):
+            skip_x = skip_connection_list[idx]
+            x = x + skip_x
+            # x = x + skip_x[:, :, :x.shape[-1]]
             x = up_sampling_block.forward(x)
         return x
 
@@ -209,7 +216,7 @@ class NXCleanUNet(nn.Module):
         )
         noisy_audios_padded = F.pad(input=noisy_audios, pad=(0, padded_length - n_samples), mode="constant", value=0)
 
-        bottle_neck = self.down_sampling.forward(noisy_audios_padded)
+        bottle_neck, skip_connection_list = self.down_sampling.forward(noisy_audios_padded)
         # bottle_neck shape: [batch_size, channels, time_steps]
 
         bottle_neck = torch.transpose(bottle_neck, dim0=-2, dim1=-1)
@@ -226,7 +233,7 @@ class NXCleanUNet(nn.Module):
         bottle_neck = torch.transpose(bottle_neck, dim0=-2, dim1=-1)
         # bottle_neck shape: [batch_size, channels, time_steps]
 
-        enhanced_audios = self.up_sampling.forward(bottle_neck)
+        enhanced_audios = self.up_sampling.forward(bottle_neck, skip_connection_list)
 
         enhanced_audios = enhanced_audios[:, :, :n_samples]
         # enhanced_audios shape: [batch_size, 1, n_samples]
@@ -250,7 +257,7 @@ class NXCleanUNet(nn.Module):
         )
         noisy_audios_padded = F.pad(input=noisy_audios, pad=(0, padded_length - n_samples), mode="constant", value=0)
 
-        bottle_neck = self.down_sampling.forward(noisy_audios_padded)
+        bottle_neck, skip_connection_list = self.down_sampling.forward(noisy_audios_padded)
         # bottle_neck shape: [batch_size, channels, time_steps]
 
        bottle_neck = torch.transpose(bottle_neck, dim0=-2, dim1=-1)
@@ -267,7 +274,7 @@ class NXCleanUNet(nn.Module):
         bottle_neck = torch.transpose(bottle_neck, dim0=-2, dim1=-1)
         # bottle_neck shape: [batch_size, channels, time_steps]
 
-        enhanced_audios = self.up_sampling.forward(bottle_neck)
+        enhanced_audios = self.up_sampling.forward(bottle_neck, skip_connection_list)
 
         enhanced_audios = enhanced_audios[:, :, :n_samples]
         # enhanced_audios shape: [batch_size, 1, n_samples]
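The substance of this change is a U-Net style skip path: DownSampling now collects each block's output and returns it alongside the bottleneck, and UpSampling walks that list in reverse, adding each saved activation to the decoder input before upsampling (a length-cropping variant is left commented out). Below is a minimal, self-contained sketch of the same pattern; plain Conv1d / ConvTranspose1d layers stand in for the repository's DownSamplingBlock and UpSamplingBlock, and all names and sizes are illustrative only.

import torch
import torch.nn as nn

class TinyUNet(nn.Module):
    def __init__(self, hidden_channels: int = 8, num_layers: int = 3):
        super().__init__()
        self.down_blocks = nn.ModuleList([
            nn.Conv1d(1 if i == 0 else hidden_channels, hidden_channels,
                      kernel_size=4, stride=2, padding=1)
            for i in range(num_layers)
        ])
        self.up_blocks = nn.ModuleList([
            nn.ConvTranspose1d(hidden_channels,
                               1 if i == num_layers - 1 else hidden_channels,
                               kernel_size=4, stride=2, padding=1)
            for i in range(num_layers)
        ])

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        skip_connection_list = []
        for block in self.down_blocks:
            x = block(x)
            skip_connection_list.append(x)             # save each encoder output
        skip_connection_list = skip_connection_list[::-1]  # deepest output first
        for idx, block in enumerate(self.up_blocks):
            x = x + skip_connection_list[idx]          # additive skip, then upsample
            x = block(x)
        return x

x = torch.randn(2, 1, 64)        # [batch_size, channels, num_samples]
print(TinyUNet()(x).shape)       # torch.Size([2, 1, 64])

Addition, rather than concatenation, keeps channel counts unchanged, so the saved tensors can be summed directly as long as encoder and decoder lengths line up; the commented-out skip_x[:, :, :x.shape[-1]] line in the diff hints at cropping for the case where they do not.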
toolbox/torchaudio/models/nx_clean_unet/{transformer → transformers}/__init__.py
RENAMED
File without changes

toolbox/torchaudio/models/nx_clean_unet/{transformer → transformers}/attention.py
RENAMED
File without changes

toolbox/torchaudio/models/nx_clean_unet/{transformer → transformers}/mask.py
RENAMED
File without changes
toolbox/torchaudio/models/nx_clean_unet/{transformer/transformer.py → transformers/transformers.py}
RENAMED
@@ -1,14 +1,12 @@
 #!/usr/bin/python3
 # -*- coding: utf-8 -*-
-import math
 from typing import Dict, Optional, Tuple, List, Union
 
 import torch
 import torch.nn as nn
-from fontTools.subset import prune_post_subset
 
-from toolbox.torchaudio.models.nx_clean_unet.transformer.mask import subsequent_chunk_mask
-from toolbox.torchaudio.models.nx_clean_unet.transformer.attention import MultiHeadSelfAttention, RelativeMultiHeadSelfAttention
+from toolbox.torchaudio.models.nx_clean_unet.transformers.mask import subsequent_chunk_mask
+from toolbox.torchaudio.models.nx_clean_unet.transformers.attention import MultiHeadSelfAttention, RelativeMultiHeadSelfAttention
 
 
 class PositionwiseFeedForward(nn.Module):
@@ -41,7 +39,7 @@ class PositionwiseFeedForward(nn.Module):
         return self.w_2(self.dropout(self.activation(self.w_1(xs))))
 
 
-class
+class TransformerBlock(nn.Module):
     def __init__(self,
                  input_dim: int,
                  dropout_rate: float = 0.1,
@@ -129,7 +127,7 @@ class TransformerEncoder(nn.Module):
         )
 
         self.encoder_layer_list = torch.nn.ModuleList([
-
+            TransformerBlock(
                 input_dim=hidden_size,
                 n_heads=attention_heads,
                 dropout_rate=dropout_rate,
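The diff shows only the new class name TransformerBlock and its constructor arguments (input_dim, n_heads, dropout_rate); the block body is untouched by this commit and not visible here. For orientation, the sketch below is a generic pre-norm self-attention block with that interface. It is an assumption, not the repository's implementation, and it uses torch.nn.MultiheadAttention in place of the imported MultiHeadSelfAttention / RelativeMultiHeadSelfAttention modules.

from typing import Optional

import torch
import torch.nn as nn

class TransformerBlockSketch(nn.Module):
    def __init__(self, input_dim: int, n_heads: int = 8, dropout_rate: float = 0.1):
        super().__init__()
        self.norm1 = nn.LayerNorm(input_dim)
        self.attention = nn.MultiheadAttention(input_dim, n_heads,
                                               dropout=dropout_rate,
                                               batch_first=True)
        self.norm2 = nn.LayerNorm(input_dim)
        self.ffn = nn.Sequential(                 # position-wise feed-forward
            nn.Linear(input_dim, 4 * input_dim),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(4 * input_dim, input_dim),
        )
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x: torch.Tensor,
                attn_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        # self-attention sub-layer with residual connection
        h = self.norm1(x)
        h, _ = self.attention(h, h, h, attn_mask=attn_mask)
        x = x + self.dropout(h)
        # feed-forward sub-layer with residual connection
        x = x + self.dropout(self.ffn(self.norm2(x)))
        return x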