hexgrad committed
Commit 9b9ab2a · verified · Parent: 480e1dc

Upload 2 files

Files changed (2):
  1. app.py +33 -45
  2. requirements.txt +0 -1
app.py CHANGED
@@ -2,7 +2,6 @@ from huggingface_hub import snapshot_download
 from katsu import Katsu
 from models import build_model
 import gradio as gr
-import noisereduce as nr
 import numpy as np
 import os
 import phonemizer
@@ -112,33 +111,15 @@ def tokenize(ps):
 # ⭐ Starred voices are averages of similar voices. 🧪 Experimental voices may be unstable.
 CHOICES = {
     '🇺🇸 🚺 American Female ⭐': 'af',
-    '🇺🇸 🚺 American Female 1': 'af_1',
-    '🇺🇸 🚺 Alloy 🧪': 'af_alloy',
     '🇺🇸 🚺 Bella': 'af_bella',
-    '🇺🇸 🚺 Jessica 🧪': 'af_jessica',
-    '🇺🇸 🚺 Nicole': 'af_nicole',
-    '🇺🇸 🚺 Nova 🧪': 'af_nova',
-    '🇺🇸 🚺 River 🧪': 'af_river',
     '🇺🇸 🚺 Sarah': 'af_sarah',
     '🇺🇸 🚺 Sky 🧪': 'af_sky',
-    '🇺🇸 🚹 Adam': 'am_adam',
-    '🇺🇸 🚹 Echo 🧪': 'am_echo',
-    '🇺🇸 🚹 Eric 🧪': 'am_eric',
-    '🇺🇸 🚹 Liam 🧪': 'am_liam',
+    '🇺🇸 🚹 Adam 🧪': 'am_adam',
     '🇺🇸 🚹 Michael': 'am_michael',
-    '🇺🇸 🚹 Onyx 🧪': 'am_onyx',
-    '🇬🇧 🚺 British Female 0': 'bf_0',
-    '🇬🇧 🚺 Alice 🧪': 'bf_alice',
-    '🇬🇧 🚺 Lily 🧪': 'bf_lily',
-    '🇬🇧 🚹 British Male 0': 'bm_0',
-    '🇬🇧 🚹 British Male 1': 'bm_1',
-    '🇬🇧 🚹 British Male 2': 'bm_2',
-    '🇬🇧 🚹 Daniel 🧪': 'bm_daniel',
-    '🇬🇧 🚹 Fable 🧪': 'bm_fable',
-    '🇬🇧 🚹 George 🧪': 'bm_george',
-    '🇯🇵 🚺 Japanese Female 0': 'jf_0',
+    '🇬🇧 🚹 Lewis': 'bm_lewis',
+    '🇯🇵 🚺 Japanese Female 🧪': 'jf_0',
 }
-VOICES = {k: torch.load(os.path.join(snapshot, 'voices', f'{k}.pt'), weights_only=True).to(device) for k in CHOICES.values()}
+VOICES = {k: torch.load(os.path.join(snapshot, 'voicepacks', f'{k}.pt'), weights_only=True).to(device) for k in CHOICES.values()}
 
 np_log_99 = np.log(99)
 def s_curve(p):
@@ -155,7 +136,7 @@ SAMPLE_RATE = 24000
 @spaces.GPU(duration=10)
 @torch.no_grad()
 def forward(tokens, voice, speed):
-    ref_s = VOICES[voice]
+    ref_s = VOICES[voice][len(tokens)]
     tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
     input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
     text_mask = length_to_mask(input_lengths).to(device)
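Note on the change above: with the move from `voices/` to `voicepacks/`, `VOICES[voice]` no longer holds a single reference-style tensor; each voicepack is indexed by token count, so `forward` selects a length-conditioned style via `VOICES[voice][len(tokens)]`. A minimal sketch of the lookup, where the tensor shape is an assumption inferred from this diff (the split into `ref_s[:, :128]` for the decoder and `ref_s[:, 128:]` for prosody appears in the hunks below):

```python
# Assumed voicepack shape: [max_token_count + 1, 1, 256]; inferred from this diff, not documented.
voicepack = VOICES['af']        # loaded above from voicepacks/af.pt
tokens = [12, 34, 56]           # hypothetical token ids for one utterance
ref_s = voicepack[len(tokens)]  # [1, 256] style vector conditioned on utterance length
decoder_style = ref_s[:, :128]  # fed to model.decoder(...) in forward()
prosody_style = ref_s[:, 128:]  # the `s` used for duration/F0 prediction in lf_forward
```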
@@ -178,7 +159,7 @@ def forward(tokens, voice, speed):
     asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
     return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
 
-def generate(text, voice, ps=None, speed=1.0, reduce_noise=0.5, opening_cut=4000, closing_cut=2000, ease_in=3000, ease_out=1000, pad_before=5000, pad_after=5000):
+def generate(text, voice, ps=None, speed=1.0, opening_cut=4000, closing_cut=2000, ease_in=3000, ease_out=1000, pad_before=5000, pad_after=5000):
     if voice not in VOICES:
         # Ensure stability for https://huggingface.co/spaces/Pendrokar/TTS-Spaces-Arena
         voice = 'af'
@@ -194,8 +175,6 @@ def generate(text, voice, ps=None, speed=1.0, reduce_noise=0.5, opening_cut=4000
     except gr.exceptions.Error as e:
         raise gr.Error(e)
         return (None, '')
-    if reduce_noise > 0:
-        out = nr.reduce_noise(y=out, sr=SAMPLE_RATE, prop_decrease=reduce_noise, n_fft=512)
     opening_cut = int(opening_cut / speed)
     if opening_cut > 0:
         out = out[opening_cut:]
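With this hunk, `generate` no longer denoises its output; the waveform comes back exactly as the decoder produced it. Since the removed call only post-processed the returned audio, anyone wanting the old behavior can run it themselves. A sketch reusing the deleted line outside the Space, assuming the now-dropped `noisereduce` package is installed and `out` is the waveform returned by `generate`:

```python
import noisereduce as nr

SAMPLE_RATE = 24000  # the Space's output rate
# `out` is assumed to be the float waveform from generate(); prop_decrease=0.5
# and n_fft=512 mirror the removed call and the old Reduce Noise slider default.
out = nr.reduce_noise(y=out, sr=SAMPLE_RATE, prop_decrease=0.5, n_fft=512)
```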
@@ -216,6 +195,9 @@ def generate(text, voice, ps=None, speed=1.0, reduce_noise=0.5, opening_cut=4000
         out = np.concatenate([out, np.zeros(pad_after)])
     return ((SAMPLE_RATE, out), ps)
 
+def toggle_autoplay(autoplay):
+    return gr.Audio(interactive=False, label='Output Audio', autoplay=autoplay)
+
 with gr.Blocks() as basic_tts:
     with gr.Row():
         gr.Markdown('Generate speech for one segment of text (up to 510 tokens) using Kokoro, a TTS model with 80 million parameters.')
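The new `toggle_autoplay` helper uses a standard Gradio pattern: an event handler returns a freshly configured component, and Gradio applies it to the bound output in place. The checkbox and event wiring that complete the pattern land in the next two hunks; pulled together, the mechanism looks like this (a condensed sketch of lines from this commit, not extra code in the Space):

```python
import gradio as gr

def toggle_autoplay(autoplay):
    # Returning a component updates the matching output component in place.
    return gr.Audio(interactive=False, label='Output Audio', autoplay=autoplay)

with gr.Blocks() as demo:
    audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
    autoplay = gr.Checkbox(value=True, label='Autoplay')
    autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
```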
@@ -234,12 +216,12 @@ with gr.Blocks() as basic_tts:
             phonemize_btn = gr.Button('Tokenize Input Text', variant='primary')
             phonemize_btn.click(phonemize, inputs=[text, voice], outputs=[in_ps])
         with gr.Column():
-            audio = gr.Audio(interactive=False, label='Output Audio')
+            audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
             with gr.Accordion('Output Tokens', open=True):
                 out_ps = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the audio, up to 510 allowed. Same as input tokens if supplied, excluding unknowns.')
     with gr.Accordion('Audio Settings', open=False):
         with gr.Row():
-            reduce_noise = gr.Slider(minimum=0, maximum=1, value=0.5, label='Reduce Noise', info='👻 Fix it in post: non-stationary noise reduction via spectral gating.')
+            autoplay = gr.Checkbox(value=True, label='Autoplay')
         with gr.Row():
             speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label='Speed', info='⚡️ Adjust the speed of the audio. The settings below are auto-scaled by speed.')
         with gr.Row():
@@ -257,15 +239,18 @@ with gr.Blocks() as basic_tts:
                 pad_before = gr.Slider(minimum=0, maximum=24000, value=5000, step=1000, label='Pad Before', info='🔇 How many samples of silence to insert before the start.')
             with gr.Column():
                 pad_after = gr.Slider(minimum=0, maximum=24000, value=5000, step=1000, label='Pad After', info='🔇 How many samples of silence to append after the end.')
-    generate_btn.click(generate, inputs=[text, voice, in_ps, speed, reduce_noise, opening_cut, closing_cut, ease_in, ease_out, pad_before, pad_after], outputs=[audio, out_ps])
+    autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
+    text.submit(generate, inputs=[text, voice, in_ps, speed, opening_cut, closing_cut, ease_in, ease_out, pad_before, pad_after], outputs=[audio, out_ps])
+    generate_btn.click(generate, inputs=[text, voice, in_ps, speed, opening_cut, closing_cut, ease_in, ease_out, pad_before, pad_after], outputs=[audio, out_ps])
 
 @spaces.GPU
 @torch.no_grad()
 def lf_forward(token_lists, voice, speed):
-    ref_s = VOICES[voice]
-    s = ref_s[:, 128:]
+    voicepack = VOICES[voice]
     outs = []
     for tokens in token_lists:
+        ref_s = voicepack[len(tokens)]
+        s = ref_s[:, 128:]
         tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
         input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
         text_mask = length_to_mask(input_lengths).to(device)
@@ -340,7 +325,7 @@ def segment_and_tokenize(text, voice, skip_square_brackets=True, newline_split=2
     segments = [row for t in texts for row in recursive_split(t, voice)]
     return [(i, *row) for i, row in enumerate(segments)]
 
-def lf_generate(segments, voice, speed=1.0, reduce_noise=0.5, opening_cut=4000, closing_cut=2000, ease_in=3000, ease_out=1000, pad_before=5000, pad_after=5000, pad_between=10000):
+def lf_generate(segments, voice, speed=1.0, opening_cut=4000, closing_cut=2000, ease_in=3000, ease_out=1000, pad_before=5000, pad_after=5000, pad_between=10000):
     token_lists = list(map(tokenize, segments['Tokens']))
     wavs = []
     opening_cut = int(opening_cut / speed)
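`lf_generate` keeps `pad_between=10000`, the silence inserted between consecutive segments when the per-segment waveforms collected in `wavs` are joined. That joining step falls outside this hunk; a hedged sketch of what it plausibly does, assuming `wavs` holds the trimmed per-segment float arrays:

```python
import numpy as np

def join_segments(wavs, pad_between=10000):
    # Insert pad_between samples of silence between consecutive segments,
    # then concatenate everything into one waveform.
    pieces = []
    for i, w in enumerate(wavs):
        if i > 0 and pad_between > 0:
            pieces.append(np.zeros(pad_between))
        pieces.append(w)
    return np.concatenate(pieces)
```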
@@ -357,8 +342,6 @@ def lf_generate(segments, voice, speed=1.0, reduce_noise=0.5, opening_cut=4000,
             raise gr.Error(e)
             break
         for out in outs:
-            if reduce_noise > 0:
-                out = nr.reduce_noise(y=out, sr=SAMPLE_RATE, prop_decrease=reduce_noise, n_fft=512)
             if opening_cut > 0:
                 out = out[opening_cut:]
             if closing_cut > 0:
@@ -415,8 +398,6 @@ with gr.Blocks() as lf_tts:
         with gr.Column():
             audio = gr.Audio(interactive=False, label='Output Audio')
     with gr.Accordion('Audio Settings', open=False):
-        with gr.Row():
-            reduce_noise = gr.Slider(minimum=0, maximum=1, value=0.5, label='Reduce Noise', info='👻 Fix it in post: non-stationary noise reduction via spectral gating.')
         with gr.Row():
             speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label='Speed', info='⚡️ Adjust the speed of the audio. The settings below are auto-scaled by speed.')
         with gr.Row():
@@ -440,7 +421,7 @@ with gr.Blocks() as lf_tts:
     segments = gr.Dataframe(headers=['#', 'Text', 'Tokens', 'Length'], row_count=(1, 'dynamic'), col_count=(4, 'fixed'), label='Segments', interactive=False, wrap=True)
     segments.change(fn=did_change_segments, inputs=[segments], outputs=[segment_btn, generate_btn])
     segment_btn.click(segment_and_tokenize, inputs=[text, voice, skip_square_brackets, newline_split], outputs=[segments])
-    generate_btn.click(lf_generate, inputs=[segments, voice, speed, reduce_noise, opening_cut, closing_cut, ease_in, ease_out, pad_before, pad_after, pad_between], outputs=[audio])
+    generate_btn.click(lf_generate, inputs=[segments, voice, speed, opening_cut, closing_cut, ease_in, ease_out, pad_before, pad_after, pad_between], outputs=[audio])
 
 with gr.Blocks() as about:
     gr.Markdown("""
@@ -453,11 +434,6 @@ The model was trained on 1x A100-class 80GB instances rented from [Vast.ai](http
 Vast was chosen over other compute providers due to its competitive on-demand hourly rates.<br/>
 The average hourly cost for the 1x A100-class 80GB VRAM instances used for training was below $1/hr — around half the quoted rates from other providers.
 
-### Updates
-This Space and the underlying Kokoro model are both under development and subject to change.<br/>
-Last model update: 2024 Nov 15<br/>
-Model trained by: Raven (@rzvzn on Discord)
-
 ### Licenses
 Inference code: MIT<br/>
 espeak-ng dependency: GPL-3.0<sup>[4]</sup><br/>
@@ -471,6 +447,9 @@ Random Japanese texts: CC0 public domain<sup>[6]</sup>
 4. eSpeak NG | https://github.com/espeak-ng/espeak-ng
 5. Quotable Data | https://github.com/quotable-io/data/blob/master/data/quotes.json
 6. Common Voice Japanese sentences | https://github.com/common-voice/common-voice/tree/main/server/data/ja
+
+### Contact
+@rzvzn on Discord
 """)
 
 with gr.Blocks() as api_info:
@@ -499,10 +478,19 @@ print(out_ps)
 Note that this Space and the underlying Kokoro model are both under development and subject to change. Reliability is not guaranteed. Hugging Face and/or Gradio might enforce their own rate limits.
 """)
 
+with gr.Blocks() as version_info:
+    gr.Markdown("""
+| Model Version | Date | Validation losses (mel/dur/f0) |
+| ------- | ---- | ------------------------------ |
+| v0.19 | 2024 Nov 22 | 0.261 / 0.627 / 1.897 |
+| v0.16 | 2024 Nov 15 | 0.263 / 0.646 / 1.934 |
+| v0.14 | 2024 Nov 12 | 0.262 / 0.642 / 1.889 |
+""")
+
 with gr.Blocks() as app:
     gr.TabbedInterface(
-        [basic_tts, lf_tts, about, api_info],
-        ['🗣️ Basic TTS', '📖 Long-Form', 'ℹ️ About', '🚀 Gradio API'],
+        [basic_tts, lf_tts, about, api_info, version_info],
+        ['🗣️ Basic TTS', '📖 Long-Form', 'ℹ️ About', '🚀 Gradio API', '📝 Version History'],
     )
 
 if __name__ == '__main__':
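For reference, the `🚀 Gradio API` tab that gains a sibling `📝 Version History` tab here documents programmatic access; the `print(out_ps)` context in the hunk above comes from its example. A hedged sketch of such a call with `gradio_client`, where the Space id and endpoint name are assumptions (the tab itself shows the authoritative snippet):

```python
from gradio_client import Client

client = Client('hexgrad/Kokoro-TTS')  # Space id is an assumption
# Trailing audio-settings parameters are omitted and fall back to their defaults.
audio, out_ps = client.predict(
    'Hello there!',        # text
    'af',                  # voice
    api_name='/generate',  # endpoint name is an assumption
)
print(out_ps)
```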
 
requirements.txt CHANGED
@@ -2,7 +2,6 @@ fugashi
 gradio
 mojimoji
 munch
-noisereduce
 phonemizer
 pypdf
 scipy
 