# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
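"""Preprocess the Libri-light corpus for Amphion: cut long recordings with VAD,
filter segments by length, transcribe them with Whisper, force-align the
transcripts with MFA, split speakers into train/dev/eval, and save JSON metadata."""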
import json
from tqdm import tqdm
import os
import torchaudio
import torch
from utils.mfa_prepare import (
process_wav_files,
get_wav_files,
filter_wav_files_by_length,
)
from utils.cut_by_vad import cut_segments
from utils.whisper_transcription import asr_main
from utils.util import has_existed
import subprocess
import random
from collections import defaultdict
from glob import glob
import shutil
def librilight_statistics(data_dir):
"""Get statistics for librilight dataset"""
distribution2speakers2utts = defaultdict(lambda: defaultdict(list))
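    # Nested mapping: split name (train/dev/eval) -> speaker id -> list of utterance ids.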
distribution_infos = glob(data_dir + "/*")
for distribution_info in distribution_infos:
distribution = distribution_info.split("/")[-1]
print(distribution)
speaker_infos = glob(distribution_info + "/*")
if len(speaker_infos) == 0:
continue
for speaker_info in speaker_infos:
speaker = speaker_info.split("/")[-1]
utts = glob(speaker_info + "/*.wav")
for utt in utts:
uid = utt.split("/")[-1].split(".")[0]
distribution2speakers2utts[distribution][speaker].append(uid)
return distribution2speakers2utts
def get_speakers_from_directory(directory):
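    """Return the names of the immediate subdirectories of `directory`, one per speaker."""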
return [
d for d in os.listdir(directory) if os.path.isdir(os.path.join(directory, d))
]
def split_dataset_by_speaker(base_dir, train_ratio=0.8, dev_ratio=0.1):
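    """Shuffle the speaker directories under base_dir, move them into train/dev/eval
    subdirectories, and return the sorted list of all speaker names."""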
train_dir = os.path.join(base_dir, "train")
dev_dir = os.path.join(base_dir, "dev")
eval_dir = os.path.join(base_dir, "eval")
# Check if dataset is already split
if has_existed(train_dir) or has_existed(dev_dir) or has_existed(eval_dir):
print("Dataset already split. Calculating speakers...")
train_speakers = get_speakers_from_directory(train_dir)
dev_speakers = get_speakers_from_directory(dev_dir)
eval_speakers = get_speakers_from_directory(eval_dir)
all_speakers = train_speakers + dev_speakers + eval_speakers
unique_speakers = list(set(all_speakers))
unique_speakers.sort()
return unique_speakers
# List all directories in the base directory
all_speakers = [
d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))
]
random.shuffle(all_speakers)
# Calculate split sizes
total_speakers = len(all_speakers)
train_size = int(total_speakers * train_ratio)
dev_size = int(total_speakers * dev_ratio)
eval_size = total_speakers - train_size - dev_size
print("Total speakers:", total_speakers)
print("Train speakers:", train_size)
print("Dev speakers:", dev_size)
print("Eval speakers:", eval_size)
# Split directories
train_speakers = all_speakers[:train_size]
dev_speakers = all_speakers[train_size : train_size + dev_size]
eval_speakers = all_speakers[train_size + dev_size :]
# Function to move directories
    def move_speakers(speakers, target_dir):
        # Create the split directory first so shutil.move has a valid destination.
        os.makedirs(target_dir, exist_ok=True)
        for speaker in speakers:
            shutil.move(
                os.path.join(base_dir, speaker), os.path.join(target_dir, speaker)
            )
# Move directories
print("Moving directories...")
print("Moving Train speakers...")
move_speakers(train_speakers, train_dir)
print("Moving Dev speakers...")
move_speakers(dev_speakers, dev_dir)
print("Moving Eval speakers...")
move_speakers(eval_speakers, eval_dir)
unique_speakers = list(set(all_speakers))
unique_speakers.sort()
return unique_speakers
def save_meta_data(save_dir, processed_dir, distribution2speakers2utts, speakers):
"""Save metadata for librilight dataset"""
os.makedirs(save_dir, exist_ok=True)
train_output_file = os.path.join(save_dir, "train.json")
valid_output_file = os.path.join(save_dir, "dev.json")
test_output_file = os.path.join(save_dir, "eval.json")
singer_dict_file = os.path.join(save_dir, "singers.json")
utt2singer_file = os.path.join(save_dir, "utt2singer")
    if has_existed(train_output_file):
        print("Metadata already exists. Skipping...")
        return
    # Open utt2singer only after confirming the metadata must be regenerated,
    # so an existing mapping file is not truncated by the early return above.
    utt2singer = open(utt2singer_file, "w")
train = []
test = []
valid = []
train_index_count = 0
test_index_count = 0
valid_index_count = 0
train_total_duration = 0
test_total_duration = 0
valid_total_duration = 0
# Save metadata
for distribution, speakers2utts in tqdm(distribution2speakers2utts.items()):
for speaker, utts in tqdm(speakers2utts.items()):
for chosen_uid in utts:
res = {
"Dataset": "librilight",
"Singer": speaker,
"Uid": "{}#{}#{}".format(distribution, speaker, chosen_uid),
}
res["Path"] = "{}/{}/{}.wav".format(distribution, speaker, chosen_uid)
res["Path"] = os.path.join(processed_dir, res["Path"])
assert os.path.exists(res["Path"])
text_file_path = os.path.join(
processed_dir,
distribution,
speaker,
chosen_uid + ".txt",
)
with open(text_file_path, "r") as f:
lines = f.readlines()
assert len(lines) == 1
text = lines[0].strip()
res["Text"] = text
waveform, sample_rate = torchaudio.load(res["Path"])
duration = waveform.size(-1) / sample_rate
res["Duration"] = duration
if "train" in distribution:
res["index"] = train_index_count
train_total_duration += duration
train.append(res)
train_index_count += 1
elif "dev" in distribution:
res["index"] = valid_index_count
valid_total_duration += duration
valid.append(res)
valid_index_count += 1
elif "eval" in distribution:
res["index"] = test_index_count
test_total_duration += duration
test.append(res)
test_index_count += 1
utt2singer.write("{}\t{}\n".format(res["Uid"], res["Singer"]))
print("Done!")
print(
"Utterance count: train = {}, dev = {}, eval = {}".format(
len(train), len(valid), len(test)
)
)
    print(
        "Total duration (hours): train = {:.2f}, dev = {:.2f}, eval = {:.2f}".format(
            train_total_duration / 3600,
            valid_total_duration / 3600,
            test_total_duration / 3600,
        )
    )
with open(train_output_file, "w") as f:
json.dump(train, f, indent=4, ensure_ascii=False)
with open(test_output_file, "w") as f:
json.dump(test, f, indent=4, ensure_ascii=False)
with open(valid_output_file, "w") as f:
json.dump(valid, f, indent=4, ensure_ascii=False)
utt2singer.close()
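    # Build a speaker-name -> integer-id lookup table and save it alongside the metadata.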
singer_lut = {name: i for i, name in enumerate(speakers)}
with open(singer_dict_file, "w") as f:
json.dump(singer_lut, f, indent=4, ensure_ascii=False)
print("Metadata saved to", save_dir)
def main(output_path, dataset_path, cfg):
"""Preprocess librilight dataset"""
n_cpus = cfg.n_cpus # number of cpus to use for preprocessing
n_gpus = cfg.n_gpus # number of gpus to use for transcription
cut_length = cfg.cut_length # target length of utterance in seconds
max_length = cfg.max_length # max length of utterance in seconds
# MFA files
mfa_config_path = cfg.mfa_config_path # path to mfa config file
mfa_dict_path = cfg.mfa_dict_path # path to mfa dict file
mfa_model_path = cfg.mfa_model_path # path to mfa model file
# check if mfa files exist
if (
not os.path.exists(mfa_dict_path)
or not os.path.exists(mfa_model_path)
or not os.path.exists(mfa_config_path)
):
raise Exception("MFA files not found.")
# Whisper model id
model_id = cfg.whisper_model_id # id of whisper model to use for transcription
subsets = [
d
for d in os.listdir(dataset_path)
if (
os.path.isdir(os.path.join(dataset_path, d))
and d in ["tiny", "small", "medium", "large"]
)
]
print("Found subsets:", subsets)
if len(subsets) == 0:
print("No subsets found. Exiting...")
return
# Preprocess each subset
for subset in subsets:
# Construct paths based on the base path
print("Pre-proccessing Libri-light subset:", subset)
raw_dir = f"{dataset_path}/{subset}"
save_dir = f"{output_path}/{subset}"
processed_dir = f"{dataset_path}/processed/{subset}"
os.makedirs(processed_dir, exist_ok=True)
os.makedirs(save_dir, exist_ok=True)
# Step 1: Segmentation
print("-" * 10)
print("Step 1: Segmentation")
print("Cutting audio files...")
cut_segments(raw_dir, processed_dir, cut_length, n_cpus)
# Steps 2 & 3: Filter and Preprocess
print("-" * 10)
print("Step 2 & 3: Filter and Preprocess")
print("Filtering and preprocessing audio files...")
wav_files = get_wav_files(processed_dir)
filtered_wav_files = filter_wav_files_by_length(wav_files, max_length)
process_wav_files(filtered_wav_files, processed_dir, n_cpus)
# Step 4 & 5: Transcription & Text-preprocess
print("-" * 10)
print("Step 4 & 5: Transcription & Text-preprocess")
print("Transcribing audio files...")
n_gpus = min(n_gpus, torch.cuda.device_count())
asr_main(processed_dir, n_gpus, model_id)
# Step 6: MFA Align
print("-" * 10)
print("Step 6: MFA Align")
print("Aligning audio files...")
command = [
"mfa",
"align",
"-v",
"-j",
str(n_cpus),
"-c",
mfa_config_path,
processed_dir,
mfa_dict_path,
mfa_model_path,
processed_dir,
"--output_format",
"long_textgrid",
"--clean",
"--overwrite",
]
        subprocess.run(command, text=True, check=True)  # raise if MFA alignment fails
# Step 7: train/dev/eval split
print("-" * 10)
print("Step 7: train/dev/eval split")
print("Splitting dataset by speaker...")
speakers = split_dataset_by_speaker(processed_dir)
# Step 8: Statistics
print("-" * 10)
print("Step 8: Statistics")
print("Calculating statistics...")
distribution2speakers2utts = librilight_statistics(processed_dir)
# Step 9: Save metadata
print("-" * 10)
print("Step 9: Save metadata")
print("Preparing Metadata for Librilight...")
save_meta_data(save_dir, processed_dir, distribution2speakers2utts, speakers)
print("Preprocessing subset", subset, "done!")
print("-" * 10)
if __name__ == "__main__":
    dataset_path = "/path/to/dataset/librilight"
    output_path = "/path/to/output"
    # NOTE: main() also expects a config object exposing n_cpus, n_gpus, cut_length,
    # max_length, mfa_config_path, mfa_dict_path, mfa_model_path, and whisper_model_id;
    # build or load it here before running the pipeline.
    cfg = None  # placeholder: replace with the actual configuration object
    main(output_path, dataset_path, cfg)