# File size: 2,833 Bytes
# 574ab7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import numpy as np
import torch
import torchaudio
import pyloudnorm as pyln
from speechbrain.pretrained import EncoderClassifier

from IMSToucan.Preprocessing.AudioPreprocessor import AudioPreprocessor

# Accepted values for the `vec_type` argument of DemoSpeakerEmbeddings.
# 'ecapa+xvector' selects both encoders at once.
VALID_VEC_TYPES = {'xvector', 'ecapa', 'ecapa+xvector'}


class DemoSpeakerEmbeddings:
    """Extract speaker embeddings from raw audio with pretrained SpeechBrain encoders.

    Depending on ``vec_type`` the ECAPA encoder, the x-vector encoder, or both are
    loaded. When both are active their embeddings are concatenated, ECAPA first.
    """

    # Sample rate the default preprocessor (``self.ap``) is built for.
    _DEFAULT_INPUT_SR = 48000
    # Sample rate the audio is converted to before it is handed to the encoders.
    _TARGET_SR = 16000

    def __init__(self, vec_type='xvector', device=torch.device('cpu')):
        """Load the requested encoder(s) and build the default audio preprocessor.

        :param vec_type: one of ``VALID_VEC_TYPES``
        :param device: device passed to the SpeechBrain encoders via ``run_opts``
        :raises AssertionError: if ``vec_type`` is not one of ``VALID_VEC_TYPES``
        """
        self.vec_type = vec_type
        assert self.vec_type in VALID_VEC_TYPES, f'Invalid vec_type {self.vec_type}, must be one of {VALID_VEC_TYPES}'
        self.device = device

        # Substring checks are intentional: 'ecapa+xvector' matches both branches.
        # The append order (ECAPA, then x-vector) fixes the order of the concatenated embedding.
        self.encoders = []
        if 'ecapa' in self.vec_type:
            self.encoders.append(EncoderClassifier.from_hparams(source='speechbrain/spkrec-ecapa-voxceleb',
                                                                savedir='models/speaker_embeddings/spkrec-ecapa-voxceleb',
                                                                run_opts={'device': self.device}))
        if 'xvector' in self.vec_type:
            self.encoders.append(EncoderClassifier.from_hparams(source='speechbrain/spkrec-xvect-voxceleb',
                                                                savedir='models/speaker_embeddings/spkrec-xvect-voxceleb',
                                                                run_opts={'device': self.device}))

        self.ap = AudioPreprocessor(input_sr=self._DEFAULT_INPUT_SR, output_sr=self._TARGET_SR, melspec_buckets=80,
                                    hop_length=256, n_fft=1024, cut_silence=False)
        # One preprocessor per input sample rate, created lazily for rates other than the default.
        self._preprocessors = {self._DEFAULT_INPUT_SR: self.ap}

    def _get_preprocessor(self, sr):
        """Return the preprocessor configured for input sample rate ``sr``, building it on first use.

        ``None`` selects the default preprocessor (``self.ap``).
        """
        sr = self._DEFAULT_INPUT_SR if sr is None else int(sr)
        if sr not in self._preprocessors:
            self._preprocessors[sr] = AudioPreprocessor(input_sr=sr, output_sr=self._TARGET_SR, melspec_buckets=80,
                                                        hop_length=256, n_fft=1024, cut_silence=False)
        return self._preprocessors[sr]

    def extract_vector_from_audio(self, wave, sr):
        """Compute the speaker embedding of a single utterance.

        :param wave: raw audio samples, in whatever form ``AudioPreprocessor.audio_to_wave_tensor`` accepts
        :param sr: sample rate of ``wave`` in Hz
        :return: 1-D embedding tensor; with several encoders their embeddings are concatenated along dim 0
        """
        # Use a preprocessor that matches the actual sample rate of the input. Always using the
        # 48 kHz one would resample audio recorded at any other rate by the wrong ratio.
        preprocessor = self._get_preprocessor(sr)
        # NOTE(review): audio_to_wave_tensor(normalize=True) presumably loudness-normalizes and
        # resamples to the output rate and returns a CPU tensor - confirm against AudioPreprocessor.
        norm_wave = preprocessor.audio_to_wave_tensor(normalize=True, audio=wave)
        # Strip exact-zero samples from both ends so digital silence does not reach the encoders.
        norm_wave = torch.tensor(np.trim_zeros(norm_wave.numpy()))

        batch = norm_wave.unsqueeze(0)  # the encoders expect a leading batch dimension
        spk_embs = [encoder.encode_batch(wavs=batch).squeeze() for encoder in self.encoders]
        if len(spk_embs) == 1:
            return spk_embs[0]
        return torch.cat(spk_embs, dim=0)

    def _normalize_wave(self, wave, sr):
        """Loudness-normalize, peak-normalize and resample ``wave`` to 16 kHz.

        Stand-alone alternative to the AudioPreprocessor pipeline (adapted from
        IMSToucan/Preprocessing/AudioPreprocessor); ``extract_vector_from_audio`` does not call it.

        :param wave: audio samples (array-like or tensor)
        :param sr: sample rate of ``wave`` in Hz
        :return: CPU tensor holding the normalized audio at 16 kHz
        """
        # torch.tensor() on an existing tensor warns about the copy, so only convert non-tensors.
        wave = wave.detach() if isinstance(wave, torch.Tensor) else torch.tensor(wave)
        wave = wave.squeeze().cpu().numpy()
        # Measure the duration after squeezing, so a (1, N) input counts N samples rather than 1.
        dur = wave.shape[0] / sr

        # The loudness meter needs a block shorter than the signal, so shrink it for very short clips.
        block_size = 0.4 if dur >= 0.4 else min(dur - 0.0001, abs(dur - 0.1))
        meter = pyln.Meter(sr, block_size=block_size)
        loudness = meter.integrated_loudness(wave)
        # Silent input has no finite loudness; applying the gain would turn every sample into NaN.
        if np.isfinite(loudness):
            wave = pyln.normalize.loudness(wave, loudness, -30.0)
        peak = np.amax(np.abs(wave))
        if peak > 0:  # avoid 0 / 0 for all-zero audio
            wave = np.divide(wave, peak)

        wave = torch.Tensor(wave).to(self.device)

        if sr != self._TARGET_SR:
            resample = torchaudio.transforms.Resample(orig_freq=sr, new_freq=self._TARGET_SR).to(self.device)
            wave = resample(wave)

        return wave.cpu()