|
import os |
|
import glob |
|
import numpy |
|
import argparse |
|
import torchaudio |
|
from speechbrain.pretrained import EncoderClassifier |
|
import torch |
|
from tqdm import tqdm |
|
import torch.nn.functional as F |
|
|
|
# Supported SpeechBrain speaker encoders, keyed by model source name. Each
# value is the embedding length that the extracted vector is checked against.
spk_model = {
    "speechbrain/spkrec-xvect-voxceleb": 512,
    "speechbrain/spkrec-ecapa-voxceleb": 192,
}
|
|
|
def f2embed(wav_file, classifier, size_embed):
    """Compute a normalized speaker embedding for a single audio file.

    Args:
        wav_file: Path of the audio file, read with ``torchaudio.load``.
        classifier: Object exposing ``encode_batch``; ``process`` passes a
            SpeechBrain ``EncoderClassifier``.
        size_embed: Expected length of the resulting embedding vector.

    Returns:
        The embedding as a NumPy array, normalized along dim 2 and with all
        singleton dimensions removed.

    Raises:
        AssertionError: If the file's sample rate is not 16000, or if the
            first dimension of the result differs from ``size_embed``.
    """
    waveform, sample_rate = torchaudio.load(wav_file)
    assert sample_rate == 16000, sample_rate

    # Pure inference: skip building an autograd graph.
    with torch.no_grad():
        normalized = F.normalize(classifier.encode_batch(waveform), dim=2)
        vector = normalized.squeeze().cpu().numpy()

    assert vector.shape[0] == size_embed, vector.shape[0]
    return vector
|
|
|
def process(args):
    """Extract one speaker embedding per utterance and save it as ``.npy``.

    For every comma-separated entry of ``args.splits`` the files matching
    ``<arctic_root>/<split>/wav/*.wav`` are collected. Each file is embedded
    with the pretrained model named by ``args.speaker_embed`` and written to
    ``<output_root>/<utt_id>.npy``, where ``utt_id`` is the last three path
    components joined by ``-`` with the file extension removed.

    Args:
        args: Parsed command-line namespace providing ``arctic_root``,
            ``output_root``, ``speaker_embed`` and ``splits``.

    Raises:
        KeyError: If ``args.speaker_embed`` is not a key of ``spk_model``.
        OSError: If the output directory cannot be created.
    """
    wav_paths = []
    for split in args.splits.split(","):
        pattern = os.path.join(args.arctic_root, split, "wav", "*.wav")
        # glob order depends on the filesystem; sort for reproducible runs.
        split_paths = sorted(glob.glob(pattern))
        print(f"{split} {len(split_paths)} utterances.")
        wav_paths.extend(split_paths)

    spkemb_root = args.output_root
    if not os.path.isdir(spkemb_root):
        print(f"Create speaker embedding directory: {spkemb_root}")
        # makedirs also creates missing parent directories, and exist_ok
        # tolerates another process creating the directory concurrently.
        os.makedirs(spkemb_root, exist_ok=True)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    classifier = EncoderClassifier.from_hparams(
        source=args.speaker_embed,
        run_opts={"device": device},
        savedir=os.path.join('/tmp', args.speaker_embed),
    )
    size_embed = spk_model[args.speaker_embed]

    for wav_path in tqdm(wav_paths, total=len(wav_paths), desc="Extract"):
        # Build "<split>-wav-<utterance>" from the trailing path components
        # using os.path so it works with any path separator, and strip only
        # the file's own extension rather than every ".wav" substring.
        parts = os.path.normpath(wav_path).split(os.sep)[-3:]
        parts[-1] = os.path.splitext(parts[-1])[0]
        utt_id = "-".join(parts)

        utt_emb = f2embed(wav_path, classifier, size_embed)
        numpy.save(os.path.join(spkemb_root, f"{utt_id}.npy"), utt_emb)
|
|
|
def main():
    """Parse command-line options, print a summary, and run ``process``.

    The accepted ``--speaker-embed`` values are taken from the keys of
    ``spk_model`` so the CLI cannot drift from the table of supported models.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--arctic-root", "-i", required=True, type=str,
        help="CMU ARCTIC root directory.",
    )
    parser.add_argument(
        "--output-root", "-o", required=True, type=str,
        help="Output directory.",
    )
    parser.add_argument(
        "--speaker-embed", "-s", type=str, required=True,
        choices=list(spk_model),
        help="Pretrained model for extracting speaker embedding.",
    )
    parser.add_argument(
        "--splits", type=str,
        help="Comma-separated speaker splits (sub-directories of the ARCTIC root).",
        default="cmu_us_bdl_arctic,cmu_us_clb_arctic,cmu_us_rms_arctic,cmu_us_slt_arctic",
    )
    args = parser.parse_args()

    print(
        f"Loading utterances from {args.arctic_root}/{args.splits}, "
        f"Save speaker embedding 'npy' to {args.output_root}, "
        f"Using speaker model {args.speaker_embed} with {spk_model[args.speaker_embed]} size."
    )
    process(args)
|
|
|
if __name__ == "__main__":
    # Example invocation:
    #
    #   python utils/prep_cmu_arctic_spkemb.py \
    #       -i /root/data/cmu_arctic/CMUARCTIC \
    #       -o /root/data/cmu_arctic/CMUARCTIC/spkrec-xvect \
    #       -s speechbrain/spkrec-xvect-voxceleb
    main()
|
|