File size: 5,400 Bytes
4efe6b5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ec44fe4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import os
import sys
import tempfile
import time
from multiprocessing import Pool, cpu_count

import librosa
import numpy as np
from pydub import AudioSegment
from scipy import signal
from scipy.io import wavfile

now_directory = os.getcwd()
sys.path.append(now_directory)

from rvc.lib.utils import load_audio
from rvc.train.slicer import Slicer

# Parse command line arguments (positional). This runs at import time, so the
# arguments must already be present in sys.argv whenever the module is loaded.
#   1: experiment directory (output root for the sliced audio folders)
#   2: input directory containing the source audio files
#   3: sample rate used for loading and for the full-rate output
#   4: chunk length; it is multiplied by the sample rate to obtain a length in
#      samples, so despite the name it is a duration in seconds
#   5: (optional) number of worker processes, defaults to cpu_count()
experiment_directory = str(sys.argv[1])
input_root = str(sys.argv[2])
sample_rate = int(sys.argv[3])
percentage = float(sys.argv[4])
num_processes = int(sys.argv[5]) if len(sys.argv) > 5 else cpu_count()

# Define constants
# Seconds shared by consecutive chunks: chunk starts advance by (length - OVERLAP).
OVERLAP = 0.3
# Remainder threshold in seconds: once no more than this is left of a slice,
# the rest is emitted as a single final chunk.
TAIL = percentage + OVERLAP
# Peak level the normalized component of a chunk is scaled to.
MAX_AMPLITUDE = 0.9
# Blend weight: ALPHA of the peak-normalized signal plus (1 - ALPHA) of the input.
ALPHA = 0.75
# High-pass cutoff handed to signal.butter together with fs=<sample rate>, i.e. in Hz.
HIGH_PASS_CUTOFF = 48
# Sample rate of the resampled copy of every chunk.
SAMPLE_RATE_16K = 16000

# Define directory paths
# Chunks at the original sample rate.
GT_WAVS_DIR = os.path.join(experiment_directory, "sliced_audios")
# The same chunks resampled to SAMPLE_RATE_16K.
WAVS16K_DIR = os.path.join(experiment_directory, "sliced_audios_16k")


class PreProcess:
    """Cut input audio files into normalized, overlapping chunks.

    Each file is loaded at ``sr``, high-pass filtered, split by ``Slicer`` and
    then cut into chunks of ``per`` seconds. Every chunk is written twice:
    at ``sr`` into ``GT_WAVS_DIR`` and resampled to 16 kHz into
    ``WAVS16K_DIR``, both as float32 WAV files named ``{idx0}_{idx1}.wav``.
    """

    def __init__(self, sr: int, exp_dir: str, per: float):
        """Set up the slicer and the high-pass filter coefficients.

        Args:
            sr: Sample rate used for loading and for the full-rate output.
            exp_dir: Experiment directory (stored on the instance).
            per: Chunk length in seconds.
        """
        self.slicer = Slicer(
            sr=sr,
            threshold=-42,
            min_length=1500,
            min_interval=400,
            hop_size=15,
            max_sil_kept=500,
        )
        self.sr = sr
        # 5th-order Butterworth high-pass; the cutoff is in Hz because fs is given.
        self.b_high, self.a_high = signal.butter(
            N=5, Wn=HIGH_PASS_CUTOFF, btype="high", fs=self.sr
        )
        self.per = per
        self.exp_dir = exp_dir

    def _normalize_audio(self, audio: np.ndarray):
        """Blend the peak-normalized signal with the original one.

        Args:
            audio: Samples of a single chunk.

        Returns:
            The normalized samples, or ``None`` when the chunk must be
            skipped: it is empty, its peak is zero (dividing by it would
            produce NaN samples), its peak is not finite, or its peak exceeds
            2.5.
        """
        if audio.size == 0:
            return None
        tmp_max = np.abs(audio).max()
        # A zero peak would turn the division below into NaN; a NaN/inf peak
        # means the chunk is already corrupt.
        if not np.isfinite(tmp_max) or tmp_max == 0 or tmp_max > 2.5:
            return None  # Indicate audio should be filtered out
        return (audio / tmp_max * (MAX_AMPLITUDE * ALPHA)) + (1 - ALPHA) * audio

    def _write_audio(self, audio: np.ndarray, filename: str, sr: int):
        """Write ``audio`` to ``filename`` as a float32 WAV file at rate ``sr``."""
        wavfile.write(filename, sr, audio.astype(np.float32))

    def process_audio_segment(self, audio_segment: np.ndarray, idx0: int, idx1: int):
        """Normalize one chunk and write its full-rate and 16 kHz versions.

        Args:
            audio_segment: Samples of the chunk at ``self.sr``.
            idx0: Index of the source file, first part of the output name.
            idx1: Index of the chunk within the file, second part of the name.
        """
        normalized_audio = self._normalize_audio(audio_segment)
        if normalized_audio is None:
            print(f"{idx0}-{idx1}-filtered")
            return

        # Write original sample rate audio
        gt_wav_path = os.path.join(GT_WAVS_DIR, f"{idx0}_{idx1}.wav")
        self._write_audio(normalized_audio, gt_wav_path, self.sr)

        # Resample and write 16kHz audio
        audio_16k = librosa.resample(
            normalized_audio, orig_sr=self.sr, target_sr=SAMPLE_RATE_16K
        )
        wav_16k_path = os.path.join(WAVS16K_DIR, f"{idx0}_{idx1}.wav")
        self._write_audio(audio_16k, wav_16k_path, SAMPLE_RATE_16K)

    def process_audio(self, path: str, idx0: int):
        """Load, filter, slice and chunk a single audio file.

        Every slice produced by the slicer is cut into chunks of ``self.per``
        seconds whose start positions advance by ``self.per - OVERLAP``
        seconds. As soon as the remainder of a slice is no longer than
        ``self.per + OVERLAP`` seconds it is emitted whole as the last chunk.
        Chunk indices keep counting across the slices of one file.

        Any exception is reported on stdout and ends processing of this file
        only, so one bad input does not stop the other files.

        Args:
            path: Path of the audio file to load.
            idx0: Index of the file, used in the output file names.
        """
        try:
            audio = load_audio(path, self.sr)
            audio = signal.lfilter(self.b_high, self.a_high, audio)

            # Loop-invariant sizes. The tail threshold is derived from
            # self.per so it always matches the chunk length used below.
            hop = self.sr * (self.per - OVERLAP)
            window = int(self.per * self.sr)
            tail = (self.per + OVERLAP) * self.sr

            idx1 = 0
            for audio_segment in self.slicer.slice(audio):
                i = 0
                while True:
                    start = int(hop * i)
                    i += 1
                    remaining = audio_segment[start:]
                    is_last = len(remaining) <= tail
                    chunk = remaining if is_last else remaining[:window]
                    self.process_audio_segment(chunk, idx0, idx1)
                    idx1 += 1
                    if is_last:
                        break
        except Exception as error:
            print(f"An error occurred on {path} path: {error}")

    def process_audio_file(self, file_path_idx):
        """Process one ``(file_path, idx0)`` work item.

        Files that are not ``.wav`` are first converted to WAV with pydub
        inside a private temporary directory, which is removed again once the
        file has been processed. A failed conversion is reported on stdout and
        the file is skipped, matching the per-file error handling of
        ``process_audio``.

        Args:
            file_path_idx: Tuple of the input file path and its index.
        """
        file_path, idx0 = file_path_idx
        ext = os.path.splitext(file_path)[1].lower()
        if ext == ".wav":
            self.process_audio(file_path, idx0)
            return

        # Convert the audio file to WAV format using pydub
        try:
            with tempfile.TemporaryDirectory() as tmp_dir:
                wav_path = os.path.join(tmp_dir, f"{idx0}.wav")
                audio = AudioSegment.from_file(file_path)
                # export() hands back the open output file; close it so the
                # data is flushed and the directory can be removed afterwards.
                audio.export(wav_path, format="wav").close()
                self.process_audio(wav_path, idx0)
        except Exception as error:
            print(f"An error occurred on {file_path} path: {error}")

    def process_audio_multiprocessing_input_directory(
        self, input_root: str, num_processes: int
    ):
        """Process every supported audio file of a directory in parallel.

        Files ending in .wav, .mp3, .flac or .ogg (case-insensitive) are
        picked up. The directory listing is sorted so that the index assigned
        to each file is the same on every run.

        Args:
            input_root: Directory containing the input audio files.
            num_processes: Number of worker processes in the pool.
        """
        # Get list of files; the index is the position in the full listing.
        files = [
            (os.path.join(input_root, name), idx)
            for idx, name in enumerate(sorted(os.listdir(input_root)))
            if name.lower().endswith((".wav", ".mp3", ".flac", ".ogg"))
        ]

        # Create the directories if they don't exist
        os.makedirs(GT_WAVS_DIR, exist_ok=True)
        os.makedirs(WAVS16K_DIR, exist_ok=True)

        # Use multiprocessing to process files
        with Pool(processes=num_processes) as pool:
            pool.map(self.process_audio_file, files)


def preprocess_training_set(
    input_root: str, sr: int, num_processes: int, exp_dir: str, per: float
):
    """Run the whole preprocessing pass and report how long it took.

    Args:
        input_root: Directory containing the input audio files.
        sr: Sample rate handed to ``PreProcess``.
        num_processes: Number of worker processes to use.
        exp_dir: Experiment directory handed to ``PreProcess``.
        per: Chunk length handed to ``PreProcess``.
    """
    began_at = time.time()
    preprocessor = PreProcess(sr, exp_dir, per)
    print(f"Starting preprocess with {num_processes} cores...")
    preprocessor.process_audio_multiprocessing_input_directory(
        input_root, num_processes
    )
    print(f"Preprocess completed in {time.time() - began_at:.2f} seconds.")


# Script entry point: feed the values parsed from the command line into the
# preprocessing run.
if __name__ == "__main__":
    preprocess_training_set(
        input_root=input_root,
        sr=sample_rate,
        num_processes=num_processes,
        exp_dir=experiment_directory,
        per=percentage,
    )