Florian Lux committed
Commit e8958d3 · Parent(s): 562e6d0
implement enjambements demo
Files changed:
- InferenceInterfaces/Meta_FastSpeech2.py +7 -3
- app.py +161 -139
- run_utterance_cloner.py +0 -2
InferenceInterfaces/Meta_FastSpeech2.py
CHANGED
@@ -40,9 +40,13 @@ class Meta_FastSpeech2(torch.nn.Module):
         self.text2phone = ArticulatoryCombinedTextFrontend(language=lang_id, add_silence_to_end=True, silent=True)
         self.lang_id = get_language_id(lang_id).to(self.device)
 
-    def forward(self, text, view=False, durations=None, pitch=None, energy=None):
-        with torch.inference_mode():
-            phones = self.text2phone.string_to_tensor(text).to(torch.device(self.device))
+    def forward(self, text, view=False, durations=None, pitch=None, energy=None, phones=False):
+        with torch.inference_mode():
+            if phones is False:
+                phones = self.text2phone.string_to_tensor(text).to(torch.device(self.device))
+            else:
+                phones = self.text2phone.string_to_tensor(text, input_phonemes=True).to(torch.device(self.device))
+
             mel, durations, pitch, energy = self.phone2mel(phones,
                                                            return_duration_pitch_energy=True,
                                                            utterance_embedding=self.default_utterance_embedding,
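The new phones flag lets forward() accept a string that is already phonemized instead of raw text, which is what allows the demo to re-synthesize the exact phone sequence of a reference reading with cloned durations, pitch, and energy. A minimal sketch of the intended call pattern (tts and cloner stand for a Meta_FastSpeech2 and an UtteranceCloner set up as in app.py below; this is an illustration, not part of the commit):

# Sketch: reuse reference prosody through the new phones flag.
duration, pitch, energy, _, _ = cloner.extract_prosody(text, "reference_audios/2.wav",
                                                       lang="de", on_line_fine_tune=False)
phone_string = tts.text2phone.get_phone_string(text)
# phones=True tells forward() that the input is already a phone string,
# so string_to_tensor is called with input_phonemes=True.
wave = tts(phone_string, durations=duration, pitch=pitch, energy=energy, phones=True)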
app.py
CHANGED
@@ -1,18 +1,15 @@
-import os
-
 import gradio as gr
 import numpy as np
-import soundfile as sf
 import torch
-
+import math
 from Preprocessing.ArticulatoryCombinedTextFrontend import ArticulatoryCombinedTextFrontend
-from Preprocessing.AudioPreprocessor import AudioPreprocessor
 from TrainingInterfaces.Text_to_Spectrogram.AutoAligner.Aligner import Aligner
 from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.DurationCalculator import DurationCalculator
 from run_utterance_cloner import UtteranceCloner
 
-
-os.system("pip …
+
+# os.system("pip uninstall -y gradio")
+# os.system("pip install gradio==2.7.5.2")
 
 
 def float2pcm(sig, dtype='int16'):

@@ -36,148 +33,173 @@ class TTS_Interface:
     def __init__(self):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.utterance_cloner = UtteranceCloner(device=self.device)
-        self.…
-            "Voice 1": "reference_audios/voice_1.flac",
-            "Voice 2": "reference_audios/voice_2.wav",
-            "Voice 3": "reference_audios/voice_3.wav",
-            }
+        self.utterance_cloner.tts.set_language("de")
         self.acoustic_model = Aligner()
         self.acoustic_model.load_state_dict(torch.load("Models/Aligner/aligner.pt", map_location='cpu')["asr_model"])
         self.acoustic_model = self.acoustic_model.to(self.device)
         self.dc = DurationCalculator(reduction_factor=1)
         self.tf = ArticulatoryCombinedTextFrontend(language="en")
-        […] (remaining removed lines are truncated in the source view)
+        self.text = "Quellen hattest du ihm, hattest dem Flüchtigen, kühle Schatten geschenkt, und die Gestade sahen, all ihm nach, und es bebte, aus den Wellen ihr lieblich Bild."
+        reference_audio = "reference_audios/2.wav"
+        self.duration, self.pitch, self.energy, _, _ = self.utterance_cloner.extract_prosody(self.text, reference_audio, lang="de", on_line_fine_tune=False)
+        self.phones = self.utterance_cloner.tts.text2phone.get_phone_string(self.text)
+
+        #######
+        self.utterance_cloner.tts.set_utterance_embedding("reference_audios/german_male.wav")
+        self.current_voice = "male"
+        self.cloned_speech_male = self.utterance_cloner.tts(self.phones,
+                                                            view=False,
+                                                            durations=self.duration,
+                                                            pitch=self.pitch,
+                                                            energy=self.energy,
+                                                            phones=True).cpu().numpy()
+        self.utterance_cloner.tts.set_utterance_embedding("reference_audios/german_female.wav")
+        self.current_voice = "female"
+        self.cloned_speech_female = self.utterance_cloner.tts(self.phones,
+                                                              view=False,
+                                                              durations=self.duration,
+                                                              pitch=self.pitch,
+                                                              energy=self.energy,
+                                                              phones=True).cpu().numpy()
+
+        #######
+        self.utterance_cloner.tts.set_utterance_embedding("reference_audios/german_male.wav")
+        self.current_voice = "male"
+        self.reg_speech_male = self.utterance_cloner.tts(
+            "Quellen hattest du ihm, hattest dem Flüchtigen kühle Schatten geschenkt, und die Gestade sahen all ihm nach, und es bebte aus den Wellen ihr lieblich Bild.",
+            view=False).cpu().numpy()
+        self.utterance_cloner.tts.set_utterance_embedding("reference_audios/german_female.wav")
+        self.current_voice = "female"
+        self.reg_speech_female = self.utterance_cloner.tts(
+            "Quellen hattest du ihm, hattest dem Flüchtigen kühle Schatten geschenkt, und die Gestade sahen all ihm nach, und es bebte aus den Wellen ihr lieblich Bild.",
+            view=False).cpu().numpy()
+
+    def read(self, _, speaker, lengthening, pause_dur, pitch_up):
+
+        if speaker == "Female Voice" and self.current_voice != "female":
+            self.utterance_cloner.tts.set_utterance_embedding("reference_audios/german_female.wav")
+            self.current_voice = "female"
+        elif speaker == "Male Voice" and self.current_voice != "male":
+            self.utterance_cloner.tts.set_utterance_embedding("reference_audios/german_male.wav")
+            self.current_voice = "male"
+
+        duration = self.duration.clone()
+        # lengthening
+        lengthening_candidates = [  # ('f', 27),
+                                    # ('l', 28),
+                                    ('ʏ', 29),
+                                    ('ç', 30),
+                                    # ('t', 31),
+                                    ('ɪ', 32),
+                                    # ('ɡ', 33),
+                                    ('ə', 34),
+                                    ('n', 35),
+
+                                    # ('z', 66),
+                                    ('ɑ', 67),
+                                    # ('ə', 68),
+                                    ('n', 69),
+
+                                    # ('b', 84),
+                                    ('e', 85),
+                                    # ('p', 86),
+                                    # ('t', 87),
+                                    ('ə', 88)
+                                    ]
+
+        for lengthening_candidate in lengthening_candidates:
+            duration[lengthening_candidate[1]] = duration[lengthening_candidate[1]] + lengthening
+
+        # pauses
+        pause_candidates = [('~', 36),
+                            ('~', 70),
+                            ('~', 89)]
+
+        for pause_candidate in pause_candidates:
+            duration[pause_candidate[1]] = duration[pause_candidate[1]] + pause_dur
+
+        pitch = self.pitch.clone()
+        # pitch raise
+
+        pitch_candidates = [  # ('k', 37),
+                              ('y', 38),
+                              ('l', 39),
+                              ('ə', 40),
+                              ('ʃ', 41),
+                              ('a', 42),
+                              ('t', 43),
+                              # ('ə', 44),
+                              # ('n', 45),
+
+                              ('a', 71),
+                              ('l', 72),
+
+                              ('v', 96),
+                              ('ɛ', 97),
+                              ('l', 98),
+                              # ('ə', 99),
+                              # ('n', 100)
+                              ]
+
+        for pitch_candidate in pitch_candidates:
+            pitch[pitch_candidate[1]] = pitch[pitch_candidate[1]] + pitch_up
+
+        fixme = [('f', 27),
+                 ('l', 28),
+                 ('ʏ', 29),
+                 ('ç', 30),
+                 ('t', 31),
+                 ('ɪ', 32),
+                 ('ɡ', 33),
+                 ('ə', 34),
+                 ('n', 35)
+                 ]
+        for pitch_candidate in fixme:
+            pitch[pitch_candidate[1]] = pitch[pitch_candidate[1]] - abs(pitch_up)
+
+        manipulated_speech = self.utterance_cloner.tts(self.phones,
+                                                       view=False,
+                                                       durations=duration,
+                                                       pitch=pitch,
+                                                       energy=self.energy,
+                                                       phones=True).cpu()
+
+        if self.current_voice == "female":
+            cloned_speech = self.cloned_speech_female
+            reg_speech = self.reg_speech_female
+        else:
+            cloned_speech = self.cloned_speech_male
+            reg_speech = self.reg_speech_male
+
+        return (48000, float2pcm(reg_speech)), (48000, float2pcm(cloned_speech)), (48000, float2pcm(manipulated_speech.numpy()))
+
+
+poem_model = TTS_Interface()
+article = "<p style='text-align: left'>This is still a work in progress; models will be exchanged for better ones as soon as they are done. More diverse training data can help with more exact cloning and more controllability. For example, we are still trying to incorporate more singing data.</p><p style='text-align: center'><a href='https://github.com/DigitalPhonetics/IMS-Toucan' target='_blank'>Click here to learn more about the IMS Toucan Speech Synthesis Toolkit</a></p>"
+
+iface = gr.Interface(fn=poem_model.read,
+                     inputs=[gr.inputs.Dropdown([
+                         "Quellen hattest du ihm, hattest dem Flüchtigen // kühle Schatten geschenkt, und die Gestade sahn // all ihm nach, und es bebte // aus den Wellen ihr lieblich Bild."],
                      type="value",
-                     default="…
-                     label="…
-                     gr.inputs.Dropdown(["Voice…
-                     gr.inputs.…
-                     gr.outputs.Audio(type="…
-                     gr.outputs.Audio(type="numpy", label="Customized Audio")],
+                         default="Quellen hattest du ihm, hattest dem Flüchtigen // kühle Schatten geschenkt, und die Gestade sahn // all ihm nach, und es bebte // aus den Wellen ihr lieblich Bild.",
+                         label="Poem Transcript"),
+                     gr.inputs.Dropdown(["Female Voice", "Male Voice"],
+                                        type="value",
+                                        default="Female Voice",
+                                        label="Select a Speaker"),
+                     gr.inputs.Slider(minimum=0, maximum=4, step=1, default=2, label="Lengthening on verse end"),
+                     gr.inputs.Slider(minimum=0, maximum=20, step=1, default=8, label="Length of Pause after verse end"),
+                     gr.inputs.Slider(minimum=-0.4, maximum=0.4, step=0.01, default=0.2, label="Raise Pitch on new verse")
+                     ],
+                     outputs=[gr.outputs.Audio(type="numpy", label="Poem read with prose reading"),
+                              gr.outputs.Audio(type="numpy", label="Poem cloned from a reference"),
+                              gr.outputs.Audio(type="numpy", label="Poem after human-in-the-loop adjustments")],
                      layout="vertical",
-                     title="…
+                     title="PoeticTTS - Customizing Poetry for Literary Studies",
                      thumbnail="Utility/toucan.png",
                      theme="default",
                      allow_flagging="never",
                      allow_screenshot=False,
-                     description="…
+                     description="Customize how a poem is read by a text-to-speech system with intuitive high-level controls. You can control markers of syntactic phrasing.",
                      article=article)
 iface.launch(enable_queue=True)
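The float2pcm helper appears in this diff only by its signature; its body is collapsed in this view. It is presumably the standard float-to-integer PCM conversion, since Gradio's numpy audio outputs expect integer samples. A sketch of a typical implementation (the exact body in app.py may differ):

import numpy as np

def float2pcm(sig, dtype='int16'):
    # Map float audio in [-1.0, 1.0] onto the full integer range and clip.
    sig = np.asarray(sig)
    i = np.iinfo(dtype)
    abs_max = 2 ** (i.bits - 1)
    offset = i.min + abs_max
    return (sig * abs_max + offset).clip(i.min, i.max).astype(dtype)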
run_utterance_cloner.py
CHANGED
@@ -82,8 +82,6 @@ class UtteranceCloner:
         torch.nn.utils.clip_grad_norm_(acoustic_model.parameters(), 1.0)
         optim_asr.step()
         acoustic_model.eval()
-        torch.save({"asr_model": acoustic_model.state_dict()},
-                   os.path.join(os.path.join("Models", "Aligner", "aligner.pt")))
 
         alignment_path = acoustic_model.inference(mel=melspec.to(self.device),
                                                   tokens=text.to(self.device),