File size: 5,933 Bytes
574ab7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import os
import gradio as gr
import numpy as np
import torch
from pathlib import Path

# Reinstall gradio pinned to version 3.2 at start-up by shelling out to pip.
# NOTE(review): gradio has already been imported above, so this process presumably keeps using
# the previously installed version - confirm that the pin has the intended effect.
# The exit status of both commands is discarded, so a failed install goes unnoticed here.
os.system("pip uninstall -y gradio")
os.system("pip install gradio==3.2")

from demo_inference.demo_tts import DemoTTS
from demo_inference.demo_asr import DemoASR
from demo_inference.demo_anonymization import DemoAnonymizer


def pcm2float(sig, dtype='float32'):
    """
    Convert an integer PCM signal into floating point samples in the range [-1.0, 1.0).

    The samples are shifted so that the integer type's midpoint becomes 0.0 and are then divided
    by half of the integer type's value range.

    Adapted from https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182

    :param sig: array-like of signed or unsigned integer samples
    :param dtype: floating point type of the result (anything ``np.dtype`` accepts)
    :return: numpy array of type ``dtype``
    :raises TypeError: if ``sig`` is not integer-typed or ``dtype`` is not a floating point type
    """
    samples = np.asarray(sig)
    if samples.dtype.kind not in 'iu':
        raise TypeError("'sig' must be an array of integers")

    target = np.dtype(dtype)
    if target.kind != 'f':
        raise TypeError("'dtype' must be a floating point type")

    int_info = np.iinfo(samples.dtype)
    half_range = 2 ** (int_info.bits - 1)
    # 0 for signed types, half_range for unsigned types: the integer value that maps to 0.0.
    midpoint = int_info.min + half_range
    centered = samples.astype(target) - midpoint
    return centered / half_range


def float2pcm(sig, dtype='int16'):
    """
    Convert a floating point signal into integer PCM samples.

    The samples are scaled by half of the target type's value range, shifted by the type's
    midpoint, clipped to the representable range and finally cast to the target type.

    Adapted from https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182

    :param sig: array-like of floating point samples
    :param dtype: integer type of the result (anything ``np.dtype`` accepts)
    :return: numpy array of type ``dtype``
    :raises TypeError: if ``sig`` is not float-typed or ``dtype`` is not an integer type
    """
    samples = np.asarray(sig)
    if samples.dtype.kind != 'f':
        raise TypeError("'sig' must be a float array")

    target = np.dtype(dtype)
    if target.kind not in 'iu':
        raise TypeError("'dtype' must be an integer type")

    int_info = np.iinfo(target)
    half_range = 2 ** (int_info.bits - 1)
    # 0 for signed types, half_range for unsigned types: the integer value that 0.0 maps to.
    midpoint = int_info.min + half_range
    scaled = samples * half_range + midpoint
    # Clip before casting so that out-of-range values saturate at the type's limits.
    bounded = scaled.clip(int_info.min, int_info.max)
    return bounded.astype(target)


class VPInterface:
    """
    Demo pipeline that chains speech recognition, speaker anonymization and speech synthesis.

    One model of each kind is held at a time. The models are created with default tags in the
    constructor and are replaced on demand when ``read`` is called with different tags.
    """

    def __init__(self):
        # Run on the GPU when torch reports one, otherwise on the CPU.
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        # Model locations, relative to the current working directory.
        self.path_to_tts_models = Path('models', 'tts')
        self.path_to_asr_model = Path('models', 'asr')
        self.path_to_anon_model = Path('models', 'anonymization')

        # Default models; these tags match the defaults offered by the UI dropdowns.
        self.synthesis_model = DemoTTS(model_paths=self.path_to_tts_models, model_tag='Libri100',
                                       device=self.device)
        self.asr_model = DemoASR(model_path=self.path_to_asr_model, model_tag='phones', device=self.device)
        self.anon_model = DemoAnonymizer(model_path=self.path_to_anon_model, model_tag='pool',
                                         device=self.device)

    def read(self, recording, asr_model_tag, anon_model_tag, tts_model_tag):
        """
        Anonymize a recording: transcribe it, anonymize the speaker embedding, and re-synthesize it.

        :param recording: ``(sample_rate, samples)`` tuple; ``samples`` must be integer PCM because
            it is converted with ``pcm2float``
        :param asr_model_tag: tag of the ASR model to use
        :param anon_model_tag: tag of the anonymization model to use
        :param tts_model_tag: tag of the TTS model to use
        :return: ``(48000, samples)`` where ``samples`` is the synthesized audio as int16 PCM
        :raises TypeError: if ``recording`` is ``None`` or its samples are not integer-typed
        """
        # Fail with a readable message instead of an opaque tuple-unpacking error when nothing
        # was recorded (e.g. submit was pressed without any audio).
        if recording is None:
            raise TypeError('No audio received: please record a sentence before submitting.')

        sr, audio = recording
        audio = pcm2float(audio)

        # Swap in other models first if the requested tags differ from the loaded ones.
        self._check_models(asr_model_tag, anon_model_tag, tts_model_tag)

        # The 'phones' ASR model is flagged to the TTS as producing phonemes rather than plain text.
        text_is_phonemes = (self.asr_model.model_tag == 'phones')
        text = self.asr_model.recognize_speech(audio, sr)
        print(text)
        speaker_embedding = self.anon_model.anonymize_embedding(audio, sr)
        print(speaker_embedding)
        syn_audio = self.synthesis_model.read_text(transcription=text, speaker_embedding=speaker_embedding,
                                                   text_is_phonemes=text_is_phonemes)

        # NOTE(review): the output rate is hard-coded to 48 kHz - presumably the TTS model's
        # native sampling rate; confirm against DemoTTS.
        return 48000, float2pcm(syn_audio.cpu().numpy())

    def _check_models(self, asr_model_tag, anon_model_tag, tts_model_tag):
        """
        Replace every model whose current ``model_tag`` differs from the requested tag.

        Models whose tag already matches are left untouched.
        """
        if asr_model_tag != self.asr_model.model_tag:
            self.asr_model = DemoASR(model_path=self.path_to_asr_model, model_tag=asr_model_tag, device=self.device)
        if anon_model_tag != self.anon_model.model_tag:
            self.anon_model = DemoAnonymizer(model_path=self.path_to_anon_model, model_tag=anon_model_tag,
                                             device=self.device)
        if tts_model_tag != self.synthesis_model.model_tag:
            self.synthesis_model = DemoTTS(model_paths=self.path_to_tts_models, model_tag=tts_model_tag,
                                           device=self.device)


# Build the pipeline once at import time; its constructor creates the default ASR,
# anonymization and TTS models.
model = VPInterface()

# Long-form explanatory text (contains Markdown links); passed to gr.Interface as `article`.
article = """
This demo allows you to anonymize your input speech by defining the single models for ASR, anonymization and TTS. If 
you want to know more about each model, please read the paper linked above. Every time you click the *submit* button, 
you should receive a new voice.

Note that for *pool* anonymization in this demo, we are using a different scaling approach (
sklearn.preprocessing.StandardScaler instead of sklearn.preprocessing.MinMaxScaler) because we are processing only 
one sample at a time and would otherwise always end up with the same voice.

This demo is still work in progress, so please be lenient with possible low quality and errors. Also, be aware that 
this Huggingface space runs on CPU which makes the demo quite slow.

For more information about this system, visit our Github page: [https://github.com/DigitalPhonetics/speaker-anonymization](https://github.com/DigitalPhonetics/speaker-anonymization)
"""

# Short heading with a link to the paper; passed to gr.Interface as `description`.
description = """
## Test demo corresponding to the models in our paper [Speaker Anonymization with Phonetic Intermediate Representations](https://arxiv.org/abs/2207.04834)
"""

# NOTE(review): `css` is not passed to gr.Interface in the visible code, so it appears to be
# unused. The two declarations are also separated by ',' where CSS expects ';' - confirm the
# intent before wiring it in.
css = """
.gr-button-primary {background-color: green !important, border-color: green}
"""

# Assemble the web UI: a microphone recording plus three model selectors are handed to
# `model.read`, whose result is rendered as audio.
iface = gr.Interface(
    fn=model.read,
    inputs=[
        gr.inputs.Audio(
            source='microphone',
            type='numpy',
            label='Say a sentence in English.',
        ),
        gr.inputs.Dropdown(
            ['phones', 'STT', 'TTS'],
            type='value',
            default='phones',
            label='ASR model',
        ),
        gr.inputs.Dropdown(
            ['pool', 'random', 'pool raw'],
            type='value',
            default='pool',
            label='Anonymization',
        ),
        gr.inputs.Dropdown(
            ['Libri100', 'Libri100 + finetuned', 'Libri600', 'Libri600 + finetuned'],
            type='value',
            default='Libri100',
            label='TTS model',
        ),
    ],
    outputs=gr.outputs.Audio(type='numpy', label=None),
    layout='vertical',
    title='IMS Speaker Anonymization',
    description=description,
    theme='default',
    allow_flagging='never',
    article=article,
    allow_screenshot=False,
)

# Start serving the interface with request queueing enabled.
iface.launch(enable_queue=True)