Shivam Mehta committed on
Commit
c77ff04
·
1 Parent(s): 4afa303

Initial commit

Browse files
Files changed (4) hide show
  1. README.md +4 -3
  2. app.py +207 -0
  3. packages.txt +2 -0
  4. requirements.txt +1 -0
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
  title: Matcha TTS
3
- emoji: 👁
4
- colorFrom: red
5
- colorTo: pink
6
  sdk: gradio
7
  sdk_version: 3.44.3
8
  app_file: app.py
@@ -11,3 +11,4 @@ license: mit
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
1
  ---
2
  title: Matcha TTS
3
+ emoji: 🍵
4
+ colorFrom: yellow
5
+ colorTo: green
6
  sdk: gradio
7
  sdk_version: 3.44.3
8
  app_file: app.py
 
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
14
+
app.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tempfile
2
+ from argparse import Namespace
3
+ from pathlib import Path
4
+
5
+ import gradio as gr
6
+ import soundfile as sf
7
+ import torch
8
+ from matcha.cli import (MATCHA_URLS, VOCODER_URL, assert_model_downloaded,
9
+ get_device, load_matcha, load_vocoder, process_text,
10
+ to_waveform)
11
+ from matcha.utils.utils import get_user_data_dir, plot_tensor
12
+
13
# Base directory for checkpoint files, as reported by matcha's
# get_user_data_dir().
LOCATION = Path(get_user_data_dir())

# Fixed settings for this demo, packed into a Namespace because get_device()
# below is called with the whole object.
args = Namespace(cpu=False, model="matcha_ljspeech", vocoder="hifigan_T2_v1", spk=None)

# Logo displayed next to the description at the top of the page.
LOGO_URL = "https://shivammehta25.github.io/Matcha-TTS/images/logo.png"

# Checkpoint locations for the acoustic model and the vocoder.
MATCHA_TTS_LOC = LOCATION.joinpath(f"{args.model}.ckpt")
VOCODER_LOC = LOCATION.joinpath(args.vocoder)

# Presumably fetches each checkpoint from its URL when it is not already on
# disk -- confirm against matcha.cli.assert_model_downloaded.
assert_model_downloaded(MATCHA_TTS_LOC, MATCHA_URLS[args.model])
assert_model_downloaded(VOCODER_LOC, VOCODER_URL[args.vocoder])

# Everything below is loaded once at import time and shared by all requests.
device = get_device(args)
model = load_matcha(args.model, MATCHA_TTS_LOC, device)
vocoder, denoiser = load_vocoder(args.vocoder, VOCODER_LOC, device)
31
+
32
+
33
@torch.inference_mode()
def process_text_gradio(text):
    """Run matcha's text front end on ``text`` for the Gradio UI.

    Args:
        text: Raw text entered by the user.

    Returns:
        A 3-tuple ``(phones, x, x_lengths)`` taken from the dictionary that
        ``process_text`` returns. ``phones`` is ``x_phones`` restricted to its
        odd positions (``[1::2]``).
    """
    processed = process_text(1, text, device)
    # Keep every second entry starting at index 1 -- presumably this drops
    # separator symbols interleaved between phones; confirm in matcha.cli.
    phones = processed["x_phones"][1::2]
    return phones, processed["x"], processed["x_lengths"]
37
+
38
+
39
@torch.inference_mode()
def synthesise_mel(text, text_length, n_timesteps, temperature, length_scale):
    """Synthesise speech for already-processed text and save it as a WAV file.

    Args:
        text: Processed text, passed to ``model.synthesise`` unchanged.
        text_length: Length information matching ``text``.
        n_timesteps: Value forwarded as ``n_timesteps`` to the model.
        temperature: Value forwarded as ``temperature`` to the model.
        length_scale: Value forwarded as ``length_scale`` to the model.

    Returns:
        A 2-tuple ``(wav_path, mel_plot)``: the name of a temporary ``.wav``
        file holding the generated waveform, and the result of
        ``plot_tensor`` applied to the generated mel spectrogram.
    """
    synthesis_options = {
        "n_timesteps": n_timesteps,
        "temperature": temperature,
        "spks": args.spk,
        "length_scale": length_scale,
    }
    result = model.synthesise(text, text_length, **synthesis_options)
    mel = result["mel"]
    result["waveform"] = to_waveform(mel, vocoder, denoiser)

    # delete=False: the file has to outlive this function so that the caller
    # can still read it by name after the handle is closed.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav_file:
        sf.write(wav_file.name, result["waveform"], 22050, "PCM_24")
    wav_path = wav_file.name

    mel_plot = plot_tensor(mel.squeeze().cpu().numpy())
    return wav_path, mel_plot
54
+
55
+
56
def run_full_synthesis(text, n_timesteps, mel_temp, length_scale):
    """Run text processing and synthesis back to back.

    Args:
        text: Raw input text.
        n_timesteps: Forwarded to ``synthesise_mel`` as ``n_timesteps``.
        mel_temp: Forwarded to ``synthesise_mel`` as ``temperature``.
        length_scale: Forwarded to ``synthesise_mel`` as ``length_scale``.

    Returns:
        A 3-tuple ``(phones, audio, mel_spectrogram)`` where ``phones`` comes
        from ``process_text_gradio`` and the other two from ``synthesise_mel``.
    """
    phones, processed, processed_lengths = process_text_gradio(text)
    wav_path, mel_plot = synthesise_mel(
        processed,
        processed_lengths,
        n_timesteps,
        mel_temp,
        length_scale,
    )
    return phones, wav_path, mel_plot
60
+
61
+
62
def _example_inputs():
    """Return the example rows as ``[text, n_timesteps, mel_temp, length_scale]``."""
    matcha_sentence = "We propose Matcha-TTS, a new approach to non-autoregressive neural TTS, that uses conditional flow matching (similar to rectified flows) to speed up O D E-based speech synthesis."
    secret_service_sentence = "The Secret Service believed that it was very doubtful that any President would ride regularly in a vehicle with a fixed top, even though transparent."
    narrative_sentence = "The narrative of these events is based largely on the recollections of the participants."
    jury_sentence = "The jury did not believe him, and the verdict was for the defendants."

    sentence_and_steps = [(matcha_sentence, 50)]
    # The same sentence at an increasing number of ODE steps.
    sentence_and_steps += [(secret_service_sentence, steps) for steps in (2, 4, 10, 50)]
    sentence_and_steps += [(narrative_sentence, 10), (jury_sentence, 10)]

    # NOTE(review): the examples use temperature 0.677 while the slider default
    # is 0.667 -- confirm which value is intended.
    return [[sentence, steps, 0.677, 1.0] for sentence, steps in sentence_and_steps]


def main():
    """Build the Gradio demo page and launch it.

    The page has a description header, a text box with three synthesis
    sliders and a "Synthesise" button, output widgets for the phonetised
    text, the mel spectrogram and the audio, and a table of cached examples.
    Clicking the button runs ``process_text_gradio`` and then feeds its
    results to ``synthesise_mel``.

    NOTE(review): the nesting of the layout containers was reconstructed from
    a listing without indentation -- confirm against the intended layout.
    """
    description = """# 🍵 Matcha-TTS: A fast TTS architecture with conditional flow matching
### [Shivam Mehta](https://www.kth.se/profile/smehta), [Ruibo Tu](https://www.kth.se/profile/ruibo), [Jonas Beskow](https://www.kth.se/profile/beskow), [Éva Székely](https://www.kth.se/profile/szekely), and [Gustav Eje Henter](https://people.kth.se/~ghe/)
We propose 🍵 Matcha-TTS, a new approach to non-autoregressive neural TTS, that uses conditional flow matching (similar to rectified flows) to speed up ODE-based speech synthesis. Our method:


* Is probabilistic
* Has compact memory footprint
* Sounds highly natural
* Is very fast to synthesise from


Check out our [demo page](https://shivammehta25.github.io/Matcha-TTS). Read our [arXiv preprint for more details](https://arxiv.org/abs/2309.03199).
Code is available in our [GitHub repository](https://github.com/shivammehta25/Matcha-TTS), along with pre-trained models.

Cached examples are available at the bottom of the page.
"""

    with gr.Blocks(title="🍵 Matcha-TTS: A fast TTS architecture with conditional flow matching") as demo:
        # Per-session slots that carry the output of the text-processing step
        # into the synthesis step of the click chain below.
        processed_text = gr.State(value=None)
        processed_text_len = gr.State(value=None)

        with gr.Box():
            with gr.Row():
                gr.Markdown(description, scale=3)
                gr.Image(LOGO_URL, label="Matcha-TTS logo", height=150, width=150, scale=1, show_label=False)

        with gr.Box():
            with gr.Row():
                gr.Markdown("# Text Input")
            with gr.Row():
                text = gr.Textbox(value="", lines=2, label="Text to synthesise")

            with gr.Row():
                gr.Markdown("### Hyper parameters")
            with gr.Row():
                n_timesteps = gr.Slider(
                    label="Number of ODE steps",
                    # At least one solver step is required; a request with 0
                    # steps has nothing to integrate (presumably fails inside
                    # the model's ODE solver -- confirm).
                    minimum=1,
                    maximum=100,
                    step=1,
                    value=10,
                    interactive=True,
                )
                length_scale = gr.Slider(
                    label="Length scale (Speaking rate)",
                    minimum=0.5,
                    maximum=1.5,
                    step=0.05,
                    value=1.0,
                    interactive=True,
                )
                mel_temp = gr.Slider(
                    label="Sampling temperature",
                    minimum=0.00,
                    maximum=2.001,
                    # 4 * 0.16675 == 0.667, so the default sits on the grid.
                    step=0.16675,
                    value=0.667,
                    interactive=True,
                )

                synth_btn = gr.Button("Synthesise")

        with gr.Box():
            with gr.Row():
                gr.Markdown("### Phonetised text")
                phonetised_text = gr.Textbox(interactive=False, scale=10, label="Phonetised text")

        with gr.Box():
            with gr.Row():
                mel_spectrogram = gr.Image(interactive=False, label="mel spectrogram")
                audio = gr.Audio(interactive=False, label="Audio")

        with gr.Row():
            # cache_examples=True asks Gradio to cache the example outputs.
            gr.Examples(
                examples=_example_inputs(),
                fn=run_full_synthesis,
                inputs=[text, n_timesteps, mel_temp, length_scale],
                outputs=[phonetised_text, audio, mel_spectrogram],
                cache_examples=True,
            )

        # Two-stage chain: show the phonetised text first, then run synthesis
        # on the processed text stored in the State components.
        synth_btn.click(
            fn=process_text_gradio,
            inputs=[
                text,
            ],
            outputs=[phonetised_text, processed_text, processed_text_len],
            api_name="matcha_tts",
            queue=True,
        ).then(
            fn=synthesise_mel,
            inputs=[processed_text, processed_text_len, n_timesteps, mel_temp, length_scale],
            outputs=[audio, mel_spectrogram],
        )

        demo.queue(concurrency_count=5).launch(share=True)


if __name__ == "__main__":
    main()
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ libsndfile1
2
+ espeak-ng
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ matcha-tts