# Spaces:
# Running
# Running
import numpy as np
import pyloudnorm as pyln
import torch
import torchaudio
from speechbrain.pretrained import EncoderClassifier

from IMSToucan.Preprocessing.AudioPreprocessor import AudioPreprocessor
VALID_VEC_TYPES = {'xvector', 'ecapa', 'ecapa+xvector'}


class DemoSpeakerEmbeddings:
    """Extract speaker embeddings from audio with pretrained SpeechBrain encoders.

    Depending on ``vec_type`` the ECAPA encoder, the x-vector encoder, or both
    are loaded. When both are loaded, their embeddings are concatenated with
    the ECAPA embedding first.
    """

    def __init__(self, vec_type='xvector', device=torch.device('cpu')):
        """Load the encoder(s) selected by ``vec_type`` and build the audio preprocessor.

        Args:
            vec_type: One of ``VALID_VEC_TYPES``.
            device: Device handed to the encoders via ``run_opts``.

        Raises:
            ValueError: If ``vec_type`` is not in ``VALID_VEC_TYPES``.
        """
        # Validate with a real exception (asserts vanish under ``python -O``)
        # and do it before any model is downloaded or loaded.
        if vec_type not in VALID_VEC_TYPES:
            raise ValueError(f'Invalid vec_type {vec_type}, must be one of {VALID_VEC_TYPES}')

        self.vec_type = vec_type
        self.device = device
        self.encoders = []

        # Substring checks: 'ecapa+xvector' matches both branches, so both
        # encoders are loaded, ECAPA first.
        if 'ecapa' in self.vec_type:
            self.encoders.append(EncoderClassifier.from_hparams(
                source='speechbrain/spkrec-ecapa-voxceleb',
                savedir='models/speaker_embeddings/spkrec-ecapa-voxceleb',
                run_opts={'device': self.device}))

        if 'xvector' in self.vec_type:
            self.encoders.append(EncoderClassifier.from_hparams(
                source='speechbrain/spkrec-xvect-voxceleb',
                savedir='models/speaker_embeddings/spkrec-xvect-voxceleb',
                run_opts={'device': self.device}))

        self.ap = AudioPreprocessor(input_sr=48000, output_sr=16000, melspec_buckets=80, hop_length=256,
                                    n_fft=1024, cut_silence=False)

    def extract_vector_from_audio(self, wave, sr):
        """Compute the speaker embedding of one waveform.

        Args:
            wave: Audio samples, passed to ``AudioPreprocessor.audio_to_wave_tensor``.
            sr: Sample rate of ``wave``. Currently not used by this method.

        Returns:
            The squeezed embedding of the single loaded encoder, or the
            concatenation (dim 0) of all encoders' squeezed embeddings.
        """
        # NOTE(review): ``sr`` is ignored here; the preprocessor is constructed
        # with input_sr=48000. ``_normalize_wave`` is an sr-aware alternative
        # that is currently not called -- confirm callers always pass 48 kHz audio.
        norm_wave = self.ap.audio_to_wave_tensor(normalize=True, audio=wave)
        # Drop leading and trailing exact-zero samples before encoding.
        norm_wave = torch.tensor(np.trim_zeros(norm_wave.numpy()))

        batch = norm_wave.unsqueeze(0)
        spk_embs = [encoder.encode_batch(wavs=batch).squeeze() for encoder in self.encoders]

        if len(spk_embs) == 1:
            return spk_embs[0]
        return torch.cat(spk_embs, dim=0)

    def _normalize_wave(self, wave, sr):
        """Loudness-normalize, peak-normalize and resample a waveform to 16 kHz.

        Args:
            wave: Audio samples (array-like or tensor).
            sr: Sample rate of ``wave`` in Hz.

        Returns:
            The normalized waveform as a CPU tensor, resampled to 16 kHz
            when ``sr`` differs from 16000.
        """
        # adapted from IMSToucan/Preprocessing/AudioPreprocessor
        # Squeeze BEFORE measuring the duration so that an input with a leading
        # singleton dimension, e.g. (1, N), is not mistaken for a 1-sample clip.
        wave = torch.as_tensor(wave).detach().squeeze().cpu().numpy()
        dur = wave.shape[0] / sr

        # normalize loudness; shrink the measurement block for clips shorter than 0.4 s
        if dur < 0.4:
            block_size = min(dur - 0.0001, abs(dur - 0.1))
        else:
            block_size = 0.4
        meter = pyln.Meter(sr, block_size=block_size)
        loudness = meter.integrated_loudness(wave)
        loud_normed = pyln.normalize.loudness(wave, loudness, -30.0)

        # scale so that the largest absolute sample becomes 1
        peak = np.amax(np.abs(loud_normed))
        wave = np.divide(loud_normed, peak)

        wave = torch.Tensor(wave).to(self.device)
        if sr != 16000:
            wave = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000).to(self.device)(wave)
        return wave.cpu()