Spaces:
Running
Running
update
Browse files
examples/nx_denoise/run.sh
CHANGED
@@ -3,27 +3,11 @@
|
|
3 |
: <<'END'
|
4 |
|
5 |
|
6 |
-
sh run.sh --stage 2 --stop_stage 2 --system_version
|
7 |
-
--noise_dir "E:/Users/tianx/HuggingDatasets/nx_noise/data/noise" \
|
8 |
-
--speech_dir "E:/programmer/asr_datasets/aishell/data_aishell/wav/train"
|
9 |
-
|
10 |
-
|
11 |
-
sh run.sh --stage 3 --stop_stage 3 --system_version centos --file_folder_name file_dir --final_model_name mpnet-aishell-20250224 \
|
12 |
-
--noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise" \
|
13 |
-
--speech_dir "/data/tianxing/HuggingDatasets/aishell/data_aishell/wav/train"
|
14 |
-
|
15 |
-
sh run.sh --stage 2 --stop_stage 2 --system_version centos --file_folder_name file_dir --final_model_name nx-clean-unet-aishell-20250228 \
|
16 |
--noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise" \
|
17 |
--speech_dir "/data/tianxing/HuggingDatasets/aishell/data_aishell/wav/train" \
|
18 |
--max_epochs 100
|
19 |
|
20 |
-
|
21 |
-
sh run.sh --stage 1 --stop_stage 2 --system_version centos --file_folder_name file_dir --final_model_name mpnet-nx-speech-20250224 \
|
22 |
-
--noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise" \
|
23 |
-
--speech_dir "/data/tianxing/HuggingDatasets/nx_noise/data/speech" \
|
24 |
-
--max_epochs 100 --max_count 10000
|
25 |
-
|
26 |
-
|
27 |
END
|
28 |
|
29 |
|
|
|
3 |
: <<'END'
|
4 |
|
5 |
|
6 |
+
sh run.sh --stage 2 --stop_stage 2 --system_version centos --file_folder_name file_dir --final_model_name nx-denoise-aishell-20250228 \
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
--noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise" \
|
8 |
--speech_dir "/data/tianxing/HuggingDatasets/aishell/data_aishell/wav/train" \
|
9 |
--max_epochs 100
|
10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
END
|
12 |
|
13 |
|
examples/nx_denoise/yaml/config.yaml
CHANGED
@@ -5,16 +5,7 @@ segment_size: 16000
|
|
5 |
n_fft: 512
|
6 |
win_size: 200
|
7 |
hop_size: 80
|
8 |
-
# 因为 hop_size 取 80,则相当于 stft 的时间步是 10ms 一步,所以降采样也考虑到差不多的分辨率。
|
9 |
|
10 |
-
# 2**down_sampling_num_layers,
|
11 |
-
# 例如 2**6=64 就意味着 64 个值在降采样之后是一个时间步,
|
12 |
-
# 则一步是 64/sample_rate = 0.008秒。
|
13 |
-
# 那么 tsfm_chunk_size=2 则为16ms,tsfm_chunk_size=4 则为32ms
|
14 |
-
# 假设每次向左看1秒,向右看30ms,则:
|
15 |
-
# tsfm_chunk_size=1,tsfm_num_left_chunks=128,tsfm_num_right_chunks=4
|
16 |
-
# tsfm_chunk_size=2,tsfm_num_left_chunks=64,tsfm_num_right_chunks=2
|
17 |
-
# tsfm_chunk_size=4,tsfm_num_left_chunks=32,tsfm_num_right_chunks=1
|
18 |
down_sampling_num_layers: 6
|
19 |
down_sampling_in_channels: 1
|
20 |
down_sampling_hidden_channels: 64
|
|
|
5 |
n_fft: 512
|
6 |
win_size: 200
|
7 |
hop_size: 80
|
|
|
8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
down_sampling_num_layers: 6
|
10 |
down_sampling_in_channels: 1
|
11 |
down_sampling_hidden_channels: 64
|
toolbox/torchaudio/models/nx_denoise/modeling_nx_denoise.py
CHANGED
@@ -244,6 +244,8 @@ class NXDenoise(nn.Module):
|
|
244 |
|
245 |
# ts transformer
|
246 |
# bottle_neck shape: [batch_size, channels, time_steps, freq_dim]
|
|
|
|
|
247 |
|
248 |
# causal conv out
|
249 |
bottle_neck = self.causal_conv_out.forward(bottle_neck)
|
|
|
244 |
|
245 |
# ts transformer
|
246 |
# bottle_neck shape: [batch_size, channels, time_steps, freq_dim]
|
247 |
+
bottle_neck = self.ts_transformer.forward(bottle_neck)
|
248 |
+
# bottle_neck shape: [batch_size, channels, time_steps, freq_dim]
|
249 |
|
250 |
# causal conv out
|
251 |
bottle_neck = self.causal_conv_out.forward(bottle_neck)
|