Pendrokar committed on
Commit
79f2035
·
2 Parent(s): ac301d1 74e078c

Merge branch 'more_radio'

Browse files
Files changed (2) hide show
  1. app.py +105 -513
  2. gr_client.py +394 -287
app.py CHANGED
@@ -1,16 +1,14 @@
1
  import os
2
  import sys
3
- import time
4
  import requests
5
  import json
6
- from subprocess import Popen, PIPE
7
- import threading
8
  from huggingface_hub import HfApi
9
- import gradio as gr
10
 
11
  # start xVASynth service (no HTTP)
12
  import resources.app.no_server as xvaserver
13
 
 
 
14
  # model
15
  hf_model_name = "Pendrokar/xvapitch_nvidia"
16
  model_repo = HfApi()
@@ -19,117 +17,9 @@ latest_commit_sha = commits[0].commit_id
19
  hf_cache_models_path = f'/home/user/.cache/huggingface/hub/models--Pendrokar--xvapitch_nvidia/snapshots/{latest_commit_sha}/'
20
  models_path = hf_cache_models_path
21
 
22
- # ordered from most emotional and respects pauses to ones that do the least
23
- voice_models = [
24
- ("๐Ÿ‘จโ€๐Ÿฆณ #6671", "ccby_nvidia_hifi_6671_M"),
25
- ("๐Ÿ‘ฑโ€โ™€๏ธ ๐Ÿ‡ฌ๐Ÿ‡ง #92", "ccby_nvidia_hifi_92_F"),
26
- ("๐Ÿง” #6670", "ccby_nvidia_hifi_6670_M"),
27
- ("Male #9017", "ccby_nvidia_hifi_9017_M"),
28
- ("Male #6097", "ccby_nvidia_hifi_6097_M"),
29
- ("๐Ÿ‘ฉโ€๐Ÿฆฑ #12787", "ccby_nvidia_hifi_12787_F"),
30
- ("๐Ÿ‘ต #11614", "ccby_nv_hifi_11614_F"),
31
- ("Female #8051", "ccby_nvidia_hifi_8051_F"),
32
- ("๐Ÿ‘ฉโ€๐Ÿฆณ #11697", "ccby_nvidia_hifi_11697_F"),
33
- ("Female #9136", "ccby_nvidia_hifi_9136_F"),
34
- ]
35
-
36
  current_voice_model = None
37
  base_speaker_emb = ''
38
 
39
- # order ranked by similarity to English due to the xVASynth's use of ARPAbet instead of IPA
40
- languages = [
41
- ("๐Ÿ‡บ๐Ÿ‡ธ EN", "en"),
42
- ("๐Ÿ‡ฉ๐Ÿ‡ช DE", "de"),
43
- ("๐Ÿ‡ช๐Ÿ‡ธ ES", "es"),
44
- ("๐Ÿ‡ฎ๐Ÿ‡น IT", "it"),
45
- ("๐Ÿ‡ณ๐Ÿ‡ฑ NL", "nl"),
46
- ("๐Ÿ‡ง๐Ÿ‡ท PT", "pt"),
47
- ("๐Ÿ‡ต๐Ÿ‡ฑ PL", "pl"),
48
- ("๐Ÿ‡ท๐Ÿ‡ด RO", "ro"),
49
- ("๐Ÿ‡ธ๐Ÿ‡ช SV", "sv"),
50
- ("๐Ÿ‡ฉ๐Ÿ‡ฐ DA", "da"),
51
- ("๐Ÿ‡ซ๐Ÿ‡ฎ FI", "fi"),
52
- ("๐Ÿ‡ญ๐Ÿ‡บ HU", "hu"),
53
- ("๐Ÿ‡ฌ๐Ÿ‡ท EL", "el"),
54
- ("๐Ÿ‡ซ๐Ÿ‡ท FR", "fr"),
55
- ("๐Ÿ‡ท๐Ÿ‡บ RU", "ru"),
56
- ("๐Ÿ‡บ๐Ÿ‡ฆ UA", "uk"),
57
- ("๐Ÿ‡น๐Ÿ‡ท TR", "tr"),
58
- ("๐Ÿ‡ธ๐Ÿ‡ฆ AR", "ar"),
59
- ("๐Ÿ‡ฎ๐Ÿ‡ณ HI", "hi"),
60
- ("๐Ÿ‡ฏ๐Ÿ‡ต JP", "jp"),
61
- ("๐Ÿ‡ฐ๐Ÿ‡ท KO", "ko"),
62
- ("๐Ÿ‡จ๐Ÿ‡ณ ZH", "zh"),
63
- ("๐Ÿ‡ป๐Ÿ‡ณ VI", "vi"),
64
- ("๐Ÿ‡ป๐Ÿ‡ฆ LA", "la"),
65
- ("๐Ÿ‡ณ๐Ÿ‡ฌ YO", "yo"),
66
- ("Swahili", "sw"),
67
- ("Hausa", "ha"),
68
- ("Wolof", "wo"),
69
- ]
70
-
71
- # Translated from English by DeepMind's Gemini Pro
72
- default_text = {
73
- "ar": "ู‡ุฐุง ู‡ูˆ ุตูˆุชูŠ.",
74
- "da": "Sรฅdan lyder min stemme.",
75
- "de": "So klingt meine Stimme.",
76
- "el": "ฮˆฯ„ฯƒฮน ฮฑฮบฮฟฯฮณฮตฯ„ฮฑฮน ฮท ฯ†ฯ‰ฮฝฮฎ ฮผฮฟฯ….",
77
- "en": "This is what my voice sounds like.",
78
- "es": "Asรญ suena mi voz.",
79
- "fi": "Nรคin รครคneni kuulostaa.",
80
- "fr": "Voici ร  quoi ressemble ma voix.",
81
- "ha": "Wannan ne muryata ke.",
82
- "hi": "เคฏเคน เคฎเฅ‡เคฐเฅ€ เค†เคตเคพเคœเคผ เค•เฅˆเคธเฅ€ เคฒเค—เคคเฅ€ เคนเฅˆเฅค",
83
- "hu": "รgy hangzik a hangom.",
84
- "it": "Cosรฌ suona la mia voce.",
85
- "jp": "ใ“ใ‚ŒใŒ็งใฎๅฃฐใงใ™ใ€‚",
86
- "ko": "์—ฌ๊ธฐ ์ œ ๋ชฉ์†Œ๋ฆฌ๊ฐ€ ์–ด๋–ค์ง€ ๋“ค์–ด๋ณด์„ธ์š”.",
87
- "la": "Haec est vox mea sonans.",
88
- "nl": "Dit is hoe mijn stem klinkt.",
89
- "pl": "Tak brzmi mรณj gล‚os.",
90
- "pt": "ร‰ assim que minha voz soa.",
91
- "ro": "Aศ™a sunฤƒ vocea mea.",
92
- "ru": "ะ’ะพั‚ ะบะฐะบ ะทะฒัƒั‡ะธั‚ ะผะพะน ะณะพะปะพั.",
93
- "sv": "Sรฅhรคr lรฅter min rรถst.",
94
- "sw": "Baba, yetu, yetu, uliye. Mbinguni, yetu, yetu. Amiiinaa!!", #civ4
95
- "tr": "Benim sesimin sesi bรถyle.",
96
- "uk": "ะžััŒ ัะบ ะทะฒัƒั‡ะธั‚ัŒ ะผั–ะน ะณะพะปะพั.",
97
- "vi": "ฤรขy lร  giแปng nรณi cแปงa tรดi.",
98
- "wo": "Ndox li neen xewnaal ma.",
99
- "yo": "รŒyรญ ni ohรนn mi ล„lรก.",
100
- "zh": "่ฟ™ๆ˜ฏๆˆ‘็š„ๅฃฐ้Ÿณใ€‚",
101
- }
102
-
103
- def run_xvaserver():
104
- # start the process without waiting for a response
105
- print('Running xVAServer subprocess...\n')
106
- xvaserver = Popen(['python', f'{os.path.dirname(os.path.abspath(__file__))}/resources/app/server.py'], stdout=PIPE, stderr=PIPE, cwd=f'{os.path.dirname(os.path.abspath(__file__))}/resources/app/')
107
-
108
- # Wait for a moment to ensure the server starts up
109
- time.sleep(10)
110
-
111
- # Check if the server is running
112
- if xvaserver.poll() is not None:
113
- print("Web server failed to start.")
114
- sys.exit(0)
115
-
116
- # contact local xVASynth server
117
- print('Attempting to connect to xVASynth...')
118
- try:
119
- response = requests.get('http://0.0.0.0:8008')
120
- response.raise_for_status() # If the response contains an HTTP error status code, raise an exception
121
- except requests.exceptions.RequestException as err:
122
- print('Failed to connect!')
123
- return
124
-
125
- print('xVAServer running on port 8008')
126
-
127
- # load default model
128
- load_model("ccby_nvidia_hifi_6671_M")
129
-
130
- # Wait for the process to exit
131
- xvaserver.wait()
132
-
133
  def load_model(voice_model_name):
134
  model_path = models_path + voice_model_name
135
 
@@ -160,413 +50,115 @@ def load_model(voice_model_name):
160
 
161
  return embs
162
 
163
- def predict(
164
- input_text,
165
- voice,
166
- lang,
167
- pacing,
168
- pitch,
169
- energy,
170
- anger,
171
- happy,
172
- sad,
173
- surprise,
174
- use_deepmoji
175
- ):
176
- # grab only the first 1000 characters
177
- input_text = input_text[:1000]
178
-
179
- # load voice model if not the current model
180
- if (current_voice_model != voice):
181
- base_speaker_emb = load_model(voice)
182
 
183
- model_type = 'xVAPitch'
184
- pace = pacing if pacing else 1.0
185
- save_path = '/tmp/xvapitch_audio_sample.wav'
186
- language = lang
187
- use_sr = 0
188
- use_cleanup = 0
189
-
190
- pluginsContext = {}
191
- pluginsContext["mantella_settings"] = {
192
- "emAngry": (anger if anger > 0 else 0),
193
- "emHappy": (happy if happy > 0 else 0),
194
- "emSad": (sad if sad > 0 else 0),
195
- "emSurprise": (surprise if surprise > 0 else 0),
196
- "run_model": use_deepmoji
197
- }
198
-
199
-
200
- data = {
201
- 'pluginsContext': json.dumps(pluginsContext),
202
- 'modelType': model_type,
203
- # pad with whitespaces as a workaround to avoid cutoffs
204
- 'sequence': input_text.center(len(input_text) + 2, ' '),
205
- 'pace': pace,
206
- 'outfile': save_path,
207
- 'vocoder': 'n/a',
208
- 'base_lang': language,
209
- 'base_emb': base_speaker_emb,
210
- 'useSR': use_sr,
211
- 'useCleanup': use_cleanup,
212
- }
213
-
214
- print('Synthesizing...')
215
- try:
216
- json_data = xvaserver.synthesize(data)
217
- # response = requests.post('http://0.0.0.0:8008/synthesize', json=data, timeout=60)
218
- # response.raise_for_status() # If the response contains an HTTP error status code, raise an exception
219
- # json_data = json.loads(response.text)
220
- except requests.exceptions.RequestException as err:
221
- print('FAILED to synthesize: {err}')
222
- save_path = ''
223
- response = {'text': '{"message": "Failed"}'}
224
- json_data = {
225
- 'arpabet': ['Failed'],
226
- 'durations': [0],
227
- 'em_anger': anger,
228
- 'em_happy': happy,
229
- 'em_sad': sad,
230
- 'em_surprise': surprise,
231
  }
232
 
233
- # print('server.log contents:')
234
- # with open('resources/app/server.log', 'r') as f:
235
- # print(f.read())
236
-
237
- arpabet_html = '<h6>ARPAbet & Phoneme lengths</h6>'
238
- arpabet_symbols = json_data['arpabet'].split('|')
239
- utter_time = 0
240
- for symb_i in range(len(json_data['durations'])):
241
- # skip PAD symbol
242
- if (arpabet_symbols[symb_i] == '<PAD>'):
243
- continue
244
-
245
- length = float(json_data['durations'][symb_i])
246
- arpa_length = str(round(length/2, 1))
247
- arpabet_html += '<strong\
248
- class="arpabet"\
249
- style="padding: 0 '\
250
- + str(arpa_length)\
251
- +'em"'\
252
- +f" title=\"{utter_time} + {length}\""\
253
- +'>'\
254
- + arpabet_symbols[symb_i]\
255
- + '</strong> '
256
- utter_time += round(length, 1)
257
 
258
- return [
259
- save_path,
260
- arpabet_html,
261
- round(json_data['em_angry'][0], 2),
262
- round(json_data['em_happy'][0], 2),
263
- round(json_data['em_sad'][0], 2),
264
- round(json_data['em_surprise'][0], 2),
265
- json_data
266
- ]
267
-
268
- input_textbox = gr.Textbox(
269
- label="Input Text",
270
- value="This is what my voice sounds like.",
271
- info="Also accepts ARPAbet symbols placed within {} brackets.",
272
- lines=1,
273
- max_lines=5,
274
- autofocus=True
275
- )
276
- pacing_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Duration")
277
- pitch_slider = gr.Slider(0, 1.0, value=0.5, step=0.05, label="Pitch", visible=False)
278
- energy_slider = gr.Slider(0.1, 1.0, value=1.0, step=0.05, label="Energy", visible=False)
279
- anger_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="๐Ÿ˜  Anger", info="Tread lightly beyond 0.9")
280
- happy_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="๐Ÿ˜ƒ Happiness", info="Tread lightly beyond 0.7")
281
- sad_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="๐Ÿ˜ญ Sadness", info="Duration increased when beyond 0.2")
282
- surprise_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="๐Ÿ˜ฎ Surprise", info="Does not play well with Happiness with either being beyond 0.3")
283
- voice_radio = gr.Radio(
284
- voice_models,
285
- value="ccby_nvidia_hifi_6671_M",
286
- label="Voice",
287
- info="NVIDIA HIFI CC-BY-4.0 xVAPitch voice model"
288
- )
289
-
290
- def set_default_text(lang, deepmoji_checked):
291
- # DeepMoji only works on English Text
292
- # checkbox_enabled = True
293
- # if lang != 'en':
294
- # checkbox_enabled = False
295
-
296
- if lang == 'en':
297
- checkbox_enabled = gr.Checkbox(
298
- label="Use DeepMoji",
299
- info="Auto adjust emotional values",
300
- value=deepmoji_checked,
301
- interactive=True
302
- )
303
- else:
304
- checkbox_enabled = gr.Checkbox(
305
- label="Use DeepMoji",
306
- info="Works only with English!",
307
- value=False,
308
- interactive=False
309
- )
310
-
311
- return default_text[lang], checkbox_enabled # Return the modified textbox (important for Blocks)
312
-
313
- en_examples = [
314
- "This is what my voice sounds like.",
315
- "If there is anything else you need, feel free to ask.",
316
- "Amazing! Could you do that again?",
317
- "Why, I would be more than happy to help you!",
318
- "That was unexpected.",
319
- "How dare you! . You have no right.",
320
- "Ahh, well, you see. There is more to it.",
321
- "I can't believe she is gone.",
322
- "Stay out of my way!!!",
323
- # ARPAbet example
324
- "This { IH1 Z } { W AH1 T } { M AY1 } { V OY1 S } { S AW1 N D Z } like.",
325
- ]
326
-
327
- def set_example_as_input(example_text):
328
- return example_text
329
-
330
- def reset_em_sliders(
331
- deepmoji_enabled,
332
- anger,
333
- happy,
334
- sad,
335
- surprise
336
- ):
337
- if (deepmoji_enabled):
338
- return (0, 0, 0, 0)
339
- else:
340
- return (
341
- anger,
342
- happy,
343
- sad,
344
- surprise
345
- )
346
-
347
- def set_default_audio(voice_id):
348
- return models_path + voice_id + '.wav'
349
-
350
- def toggle_deepmoji(
351
- checked,
352
- anger,
353
- happy,
354
- sad,
355
- surprise
356
- ):
357
- if checked:
358
- return (0, 0, 0, 0)
359
- else:
360
- return (
361
- anger,
362
- happy,
363
- sad,
364
- surprise
365
- )
366
-
367
- language_radio = gr.Radio(
368
- languages,
369
- value="en",
370
- label="Language",
371
- info="Will be more monotone and have an English accent. Tested mostly by a native Briton."
372
- )
373
-
374
- _DESCRIPTION = '''
375
- <div>
376
- <a style="display:inline-block;" href="https://github.com/DanRuta/xVA-Synth"><img src='https://img.shields.io/github/stars/DanRuta/xVA-Synth?style=social'/></a>
377
- <a style="display:inline-block;" href="https://www.nexusmods.com/skyrimspecialedition/mods/44184"><img src='https://img.shields.io/badge/Endorsements-3.3k-blue?logo=nexusmods'/></a>
378
- <a style="display:inline-block; margin-left: .5em" href="https://discord.gg/nv7c6E2TzV"><img src='https://img.shields.io/discord/794590496202293278.svg?label=&logo=discord&logoColor=ffffff&color=7389D8&labelColor=6A7EC2'/></a>
379
- <span style="display: inline-block;margin-left: .5em;vertical-align: top;"><a href="https://huggingface.co/spaces/Pendrokar/xVASynth?duplicate=true" style="" target="_blank"><img style="margin-bottom: 0em;display: inline;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> for a personal CPU-run one</span>
380
- </div>
381
- '''
382
-
383
- with gr.Blocks(css=".arpabet {display: inline-block; background-color: gray; border-radius: 5px; font-size: 120%; margin: 0.1em 0}") as demo:
384
- gr.Markdown("# xVASynth TTS")
385
-
386
- gr.HTML(label="description", value=_DESCRIPTION)
387
-
388
- with gr.Row(): # Main row for inputs and language selection
389
- with gr.Column(): # Input column
390
- input_textbox = gr.Textbox(
391
- label="Input Text",
392
- value="This is what my voice sounds like.",
393
- info="Also accepts ARPAbet symbols placed within {} brackets.",
394
- lines=1,
395
- max_lines=5,
396
- autofocus=True
397
- )
398
- language_radio = gr.Radio(
399
- languages,
400
- value="en",
401
- label="Language",
402
- info="Will be more monotone and have an English accent. Tested mostly by a native Briton."
403
- )
404
- with gr.Row():
405
- with gr.Column():
406
- en_examples_dropdown = gr.Dropdown(
407
- en_examples,
408
- value=en_examples[0],
409
- label="Example dropdown",
410
- show_label=False,
411
- info="English Examples",
412
- visible=(language_radio.value == 'en')
413
- )
414
- with gr.Column():
415
- pacing_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Duration")
416
- with gr.Column(): # Control column
417
- voice_radio = gr.Radio(
418
- voice_models,
419
- value="ccby_nvidia_hifi_6671_M",
420
- label="Voice",
421
- info="NVIDIA HIFI CC-BY-4.0 xVAPitch voice model"
422
- )
423
- pitch_slider = gr.Slider(0, 1.0, value=0.5, step=0.05, label="Pitch", visible=False)
424
- energy_slider = gr.Slider(0.1, 1.0, value=1.0, step=0.05, label="Energy", visible=False)
425
- with gr.Row(): # Main row for inputs and language selection
426
- with gr.Column(): # Input column
427
- anger_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="๐Ÿ˜  Anger", info="Tread lightly beyond 0.9")
428
- sad_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="๐Ÿ˜ญ Sadness", info="Duration increased when beyond 0.2")
429
- with gr.Column(): # Input column
430
- happy_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="๐Ÿ˜ƒ Happiness", info="Tread lightly beyond 0.7")
431
- surprise_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="๐Ÿ˜ฎ Surprise", info="Can oversaturate Happiness")
432
- deepmoji_checkbox = gr.Checkbox(label="Use DeepMoji", info="Auto adjust emotional values", value=True)
433
-
434
- # Event handling using click
435
- btn = gr.Button("Generate", variant="primary")
436
-
437
- with gr.Row(): # Main row for inputs and language selection
438
- with gr.Column(): # Input column
439
- output_wav = gr.Audio(
440
- label="22kHz audio output (autoplay enabled)",
441
- type="filepath",
442
- editable=False,
443
- autoplay=True
444
- )
445
- with gr.Column(): # Input column
446
- output_arpabet = gr.HTML(label="ARPAbet")
447
-
448
- btn.click(
449
- fn=predict,
450
- inputs=[
451
- input_textbox,
452
- voice_radio,
453
- language_radio,
454
- pacing_slider,
455
- pitch_slider,
456
- energy_slider,
457
- anger_slider,
458
- happy_slider,
459
- sad_slider,
460
- surprise_slider,
461
- deepmoji_checkbox
462
- ],
463
- outputs=[
464
- output_wav,
465
- output_arpabet,
466
- anger_slider,
467
- happy_slider,
468
- sad_slider,
469
- surprise_slider,
470
- # xVAServer response
471
- gr.Textbox(visible=False)
472
- ]
473
- )
474
- input_textbox.submit(
475
- fn=predict,
476
- inputs=[
477
- input_textbox,
478
- voice_radio,
479
- language_radio,
480
- pacing_slider,
481
- pitch_slider,
482
- energy_slider,
483
- anger_slider,
484
- happy_slider,
485
- sad_slider,
486
- surprise_slider,
487
- deepmoji_checkbox
488
- ],
489
- outputs=[
490
- output_wav,
491
- output_arpabet,
492
- anger_slider,
493
- happy_slider,
494
- sad_slider,
495
- surprise_slider,
496
- # xVAServer response
497
- gr.Textbox(visible=False)
498
- ]
499
- )
500
-
501
- language_radio.change(
502
- set_default_text,
503
- inputs=[language_radio, deepmoji_checkbox],
504
- outputs=[input_textbox, deepmoji_checkbox]
505
- )
506
-
507
- en_examples_dropdown.change(
508
- set_example_as_input,
509
- inputs=[en_examples_dropdown],
510
- outputs=[input_textbox]
511
- )
512
-
513
- deepmoji_checkbox.change(
514
- toggle_deepmoji,
515
- inputs=[
516
- deepmoji_checkbox,
517
- anger_slider,
518
- happy_slider,
519
- sad_slider,
520
- surprise_slider
521
- ],
522
- outputs=[
523
- anger_slider,
524
- happy_slider,
525
- sad_slider,
526
- surprise_slider
527
- ]
528
- )
529
-
530
- input_textbox.change(
531
- reset_em_sliders,
532
- inputs=[
533
- deepmoji_checkbox,
534
- anger_slider,
535
- happy_slider,
536
- sad_slider,
537
- surprise_slider
538
- ],
539
- outputs=[
540
- anger_slider,
541
- happy_slider,
542
- sad_slider,
543
- surprise_slider
544
- ]
545
- )
546
 
547
- voice_radio.change(
548
- reset_em_sliders,
549
- inputs=[
550
- deepmoji_checkbox,
551
- anger_slider,
552
- happy_slider,
553
- sad_slider,
554
- surprise_slider
555
- ],
556
- outputs=[
557
- anger_slider,
558
- happy_slider,
559
- sad_slider,
560
- surprise_slider
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
561
  ]
562
- )
563
-
564
- voice_radio.change(
565
- set_default_audio,
566
- inputs=voice_radio,
567
- outputs=output_wav
568
- )
569
 
570
  if __name__ == "__main__":
571
  print('running custom Gradio interface')
572
- demo.launch()
 
 
1
  import os
2
  import sys
 
3
  import requests
4
  import json
 
 
5
  from huggingface_hub import HfApi
 
6
 
7
  # start xVASynth service (no HTTP)
8
  import resources.app.no_server as xvaserver
9
 
10
+ from gr_client import BlocksDemo
11
+
12
  # model
13
  hf_model_name = "Pendrokar/xvapitch_nvidia"
14
  model_repo = HfApi()
 
17
  hf_cache_models_path = f'/home/user/.cache/huggingface/hub/models--Pendrokar--xvapitch_nvidia/snapshots/{latest_commit_sha}/'
18
  models_path = hf_cache_models_path
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  current_voice_model = None
21
  base_speaker_emb = ''
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  def load_model(voice_model_name):
24
  model_path = models_path + voice_model_name
25
 
 
50
 
51
  return embs
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
+ class LocalBlocksDemo(BlocksDemo):
55
+ def predict(
56
+ self,
57
+ input_text,
58
+ voice,
59
+ lang,
60
+ pacing,
61
+ pitch,
62
+ energy,
63
+ anger,
64
+ happy,
65
+ sad,
66
+ surprise,
67
+ use_deepmoji
68
+ ):
69
+ # grab only the first 1000 characters
70
+ input_text = input_text[:1000]
71
+
72
+ # load voice model if not the current model
73
+ if (current_voice_model != voice):
74
+ base_speaker_emb = load_model(voice)
75
+
76
+ model_type = 'xVAPitch'
77
+ pace = pacing if pacing else 1.0
78
+ save_path = '/tmp/xvapitch_audio_sample.wav'
79
+ language = lang
80
+ use_sr = 0
81
+ use_cleanup = 0
82
+
83
+ pluginsContext = {}
84
+ pluginsContext["mantella_settings"] = {
85
+ "emAngry": (anger if anger > 0 else 0),
86
+ "emHappy": (happy if happy > 0 else 0),
87
+ "emSad": (sad if sad > 0 else 0),
88
+ "emSurprise": (surprise if surprise > 0 else 0),
89
+ "run_model": use_deepmoji
 
 
 
 
 
 
 
 
 
 
 
 
90
  }
91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
+ data = {
94
+ 'pluginsContext': json.dumps(pluginsContext),
95
+ 'modelType': model_type,
96
+ # pad with whitespaces as a workaround to avoid cutoffs
97
+ 'sequence': input_text.center(len(input_text) + 2, ' '),
98
+ 'pace': pace,
99
+ 'outfile': save_path,
100
+ 'vocoder': 'n/a',
101
+ 'base_lang': language,
102
+ 'base_emb': base_speaker_emb,
103
+ 'useSR': use_sr,
104
+ 'useCleanup': use_cleanup,
105
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
+ print('Synthesizing...')
108
+ try:
109
+ json_data = xvaserver.synthesize(data)
110
+ # response = requests.post('http://0.0.0.0:8008/synthesize', json=data, timeout=60)
111
+ # response.raise_for_status() # If the response contains an HTTP error status code, raise an exception
112
+ # json_data = json.loads(response.text)
113
+ except requests.exceptions.RequestException as err:
114
+ print('FAILED to synthesize: {err}')
115
+ save_path = ''
116
+ response = {'text': '{"message": "Failed"}'}
117
+ json_data = {
118
+ 'arpabet': ['Failed'],
119
+ 'durations': [0],
120
+ 'em_anger': anger,
121
+ 'em_happy': happy,
122
+ 'em_sad': sad,
123
+ 'em_surprise': surprise,
124
+ }
125
+
126
+ # print('server.log contents:')
127
+ # with open('resources/app/server.log', 'r') as f:
128
+ # print(f.read())
129
+
130
+ arpabet_html = '<h6>ARPAbet & Phoneme lengths</h6>'
131
+ arpabet_symbols = json_data['arpabet'].split('|')
132
+ utter_time = 0
133
+ for symb_i in range(len(json_data['durations'])):
134
+ # skip PAD symbol
135
+ if (arpabet_symbols[symb_i] == '<PAD>'):
136
+ continue
137
+
138
+ length = float(json_data['durations'][symb_i])
139
+ arpa_length = str(round(length/2, 1))
140
+ arpabet_html += '<strong\
141
+ class="arpabet"\
142
+ style="padding: 0 '\
143
+ + str(arpa_length)\
144
+ +'em"'\
145
+ +f" title=\"{utter_time} + {length}\""\
146
+ +'>'\
147
+ + arpabet_symbols[symb_i]\
148
+ + '</strong> '
149
+ utter_time += round(length, 1)
150
+
151
+ return [
152
+ save_path,
153
+ arpabet_html,
154
+ round(json_data['em_angry'][0], 2),
155
+ round(json_data['em_happy'][0], 2),
156
+ round(json_data['em_sad'][0], 2),
157
+ round(json_data['em_surprise'][0], 2),
158
+ json_data
159
  ]
 
 
 
 
 
 
 
160
 
161
  if __name__ == "__main__":
162
  print('running custom Gradio interface')
163
+ demo = LocalBlocksDemo()
164
+ demo.block.launch()
gr_client.py CHANGED
@@ -1,34 +1,35 @@
1
  import os
2
- import sys
3
- import time
4
- import requests
5
  import json
6
- from huggingface_hub import hf_hub_download
7
  import gradio as gr
8
  from gradio_client import Client
9
 
10
  voice_models = [
11
- ("Male #6671", "ccby_nvidia_hifi_6671_M"),
12
- ("Male #6670", "ccby_nvidia_hifi_6670_M"),
 
 
 
13
  ("Male #9017", "ccby_nvidia_hifi_9017_M"),
14
  ("Male #6097", "ccby_nvidia_hifi_6097_M"),
15
- ("Female #92", "ccby_nvidia_hifi_92_F"),
16
- ("Female #11697", "ccby_nvidia_hifi_11697_F"),
17
- ("Female #12787", "ccby_nvidia_hifi_12787_F"),
18
- ("Female #11614", "ccby_nv_hifi_11614_F"),
19
  ("Female #8051", "ccby_nvidia_hifi_8051_F"),
 
20
  ("Female #9136", "ccby_nvidia_hifi_9136_F"),
21
  ]
22
- current_voice_model = None
23
 
24
  # order ranked by similarity to English due to the xVASynth's use of ARPAbet instead of IPA
25
  languages = [
26
- ("๐Ÿ‡ฌ๐Ÿ‡ง EN", "en"),
27
  ("๐Ÿ‡ฉ๐Ÿ‡ช DE", "de"),
28
  ("๐Ÿ‡ช๐Ÿ‡ธ ES", "es"),
29
- ("๐Ÿ‡ฎ๐Ÿ‡น IT", "it"),
 
 
 
30
  ("๐Ÿ‡ณ๐Ÿ‡ฑ NL", "nl"),
31
- ("๐Ÿ‡ต๐Ÿ‡น PT", "pt"),
 
32
  ("๐Ÿ‡ต๐Ÿ‡ฑ PL", "pl"),
33
  ("๐Ÿ‡ท๐Ÿ‡ด RO", "ro"),
34
  ("๐Ÿ‡ธ๐Ÿ‡ช SV", "sv"),
@@ -38,19 +39,17 @@ languages = [
38
  ("๐Ÿ‡ฌ๐Ÿ‡ท EL", "el"),
39
  ("๐Ÿ‡ซ๐Ÿ‡ท FR", "fr"),
40
  ("๐Ÿ‡ท๐Ÿ‡บ RU", "ru"),
41
- ("๐Ÿ‡บ๐Ÿ‡ฆ UK", "uk"),
42
  ("๐Ÿ‡น๐Ÿ‡ท TR", "tr"),
43
  ("๐Ÿ‡ธ๐Ÿ‡ฆ AR", "ar"),
44
- ("๐Ÿ‡ฎ๐Ÿ‡ณ HI", "hi"),
45
  ("๐Ÿ‡ฏ๐Ÿ‡ต JP", "jp"),
46
  ("๐Ÿ‡ฐ๐Ÿ‡ท KO", "ko"),
47
- ("๐Ÿ‡จ๐Ÿ‡ณ ZH", "zh"),
48
  ("๐Ÿ‡ป๐Ÿ‡ณ VI", "vi"),
49
  ("๐Ÿ‡ป๐Ÿ‡ฆ LA", "la"),
50
- ("HA", "ha"),
51
- ("SW", "sw"),
52
  ("๐Ÿ‡ณ๐Ÿ‡ฌ YO", "yo"),
53
- ("WO", "wo"),
 
 
54
  ]
55
 
56
  # Translated from English by DeepMind's Gemini Pro
@@ -85,112 +84,118 @@ default_text = {
85
  "zh": "่ฟ™ๆ˜ฏๆˆ‘็š„ๅฃฐ้Ÿณใ€‚",
86
  }
87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
- def predict(
90
- input_text,
91
- voice,
92
- lang,
93
- pacing,
94
- pitch,
95
- energy,
96
- anger,
97
- happy,
98
- sad,
99
- surprise,
100
- deepmoji_checked
101
- ):
102
- wav_path, arpabet_html, angry, happy, sad, surprise, response = client.predict(
103
- input_text, # str in 'Input Text' Textbox component
104
- voice, # Literal['ccby_nvidia_hifi_6670_M', 'ccby_nv_hifi_11614_F', 'ccby_nvidia_hifi_11697_F', 'ccby_nvidia_hifi_12787_F', 'ccby_nvidia_hifi_6097_M', 'ccby_nvidia_hifi_6671_M', 'ccby_nvidia_hifi_8051_F', 'ccby_nvidia_hifi_9017_M', 'ccby_nvidia_hifi_9136_F', 'ccby_nvidia_hifi_92_F'] in 'Voice' Radio component
105
- lang, # Literal['en', 'de', 'es', 'it', 'fr', 'ru', 'tr', 'la', 'ro', 'da', 'vi', 'ha', 'nl', 'zh', 'ar', 'uk', 'hi', 'ko', 'pl', 'sw', 'fi', 'hu', 'pt', 'yo', 'sv', 'el', 'wo', 'jp'] in 'Language' Radio component
106
- pacing, # float (numeric value between 0.5 and 2.0) in 'Duration' Slider component
107
- pitch, # float (numeric value between 0 and 1.0) in 'Pitch' Slider component
108
- energy, # float (numeric value between 0.1 and 1.0) in 'Energy' Slider component
109
- anger, # float (numeric value between 0 and 1.0) in '๐Ÿ˜  Anger' Slider component
110
- happy, # float (numeric value between 0 and 1.0) in '๐Ÿ˜ƒ Happiness' Slider component
111
- sad, # float (numeric value between 0 and 1.0) in '๐Ÿ˜ญ Sadness' Slider component
112
- surprise, # float (numeric value between 0 and 1.0) in '๐Ÿ˜ฎ Surprise' Slider component
113
- deepmoji_checked, # bool
114
- api_name="/predict"
115
- )
116
-
117
- json_data = json.loads(response.replace("'", '"'))
118
-
119
- arpabet_html = '<h6>ARPAbet & Durations</h6>'
120
- arpabet_html += '<table style="margin: 0 var(--size-2)"><tbody><tr>'
121
- arpabet_nopad = json_data['arpabet'].split('|PAD|')
122
- arpabet_symbols = json_data['arpabet'].split('|')
123
- wpad_len = len(arpabet_symbols)
124
- nopad_len = len(arpabet_nopad)
125
- total_dur_length = 0
126
- for symb_i in range(wpad_len):
127
- if (arpabet_symbols[symb_i] == '<PAD>'):
128
- continue
129
- total_dur_length += float(json_data['durations'][symb_i])
130
-
131
- for symb_i in range(wpad_len):
132
- if (arpabet_symbols[symb_i] == '<PAD>'):
133
- continue
134
-
135
- arpabet_length = float(json_data['durations'][symb_i])
136
- cell_width = round(arpabet_length / total_dur_length * 100, 2)
137
- arpabet_html += '<td class="arpabet" style="width: '\
138
- + str(cell_width)\
139
- +'%">'\
140
- + arpabet_symbols[symb_i]\
141
- + '</td> '
142
- arpabet_html += '<tr></tbody></table>'
143
-
144
- return [
145
- wav_path,
146
- arpabet_html,
147
- round(json_data['em_angry'][0], 2),
148
- round(json_data['em_happy'][0], 2),
149
- round(json_data['em_sad'][0], 2),
150
- round(json_data['em_surprise'][0], 2)
151
- ]
152
-
153
- input_textbox = gr.Textbox(
154
- label="Input Text",
155
- value="This is what my voice sounds like.",
156
- info="Also accepts ARPAbet symbols placed within {} brackets.",
157
- lines=1,
158
- max_lines=5,
159
- autofocus=True
160
- )
161
- pacing_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Duration")
162
- pitch_slider = gr.Slider(0, 1.0, value=0.5, step=0.05, label="Pitch", visible=False)
163
- energy_slider = gr.Slider(0.1, 1.0, value=1.0, step=0.05, label="Energy", visible=False)
164
- anger_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="๐Ÿ˜  Anger", info="Tread lightly beyond 0.9")
165
- happy_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="๐Ÿ˜ƒ Happiness", info="Tread lightly beyond 0.7")
166
- sad_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="๐Ÿ˜ญ Sadness", info="Duration increased when beyond 0.2")
167
- surprise_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="๐Ÿ˜ฎ Surprise", info="Does not play well with Happiness with either being beyond 0.3")
168
- voice_radio = gr.Radio(
169
- voice_models,
170
- value="ccby_nvidia_hifi_6671_M",
171
- label="Voice",
172
- info="NVIDIA HIFI CC-BY-4.0 xVAPitch voice model"
173
- )
174
 
175
  def set_default_text(lang, deepmoji_checked):
 
 
 
 
 
 
 
 
 
176
  # DeepMoji only works on English Text
 
177
  if lang == 'en':
178
- checkbox_enabled = gr.Checkbox(
179
- label="Use DeepMoji",
180
- info="Auto adjust emotional values",
181
- value=deepmoji_checked,
182
- interactive=True
183
- )
184
  else:
185
- checkbox_enabled = gr.Checkbox(
186
- label="Use DeepMoji",
187
- info="Works only with English!",
188
- value=False,
189
- interactive=False
190
- )
191
 
192
- return default_text[lang], checkbox_enabled # Return the modified textbox (important for Blocks)
193
 
 
194
  en_examples = [
195
  "This is what my voice sounds like.",
196
  "If there is anything else you need, feel free to ask.",
@@ -204,22 +209,37 @@ en_examples = [
204
  # ARPAbet example
205
  "This { IH1 Z } { W AH1 T } { M AY1 } { V OY1 S } { S AW1 N D Z } like.",
206
  ]
 
 
 
 
 
 
 
 
207
 
208
  def set_example_as_input(example_text):
 
209
  return example_text
210
 
211
  def toggle_example_dropdown(lang):
 
 
212
  if lang == 'en':
213
- return gr.Dropdown(
214
- en_examples,
215
- value=en_examples[0],
216
- label="Example dropdown",
217
- show_label=False,
218
- info="English Examples",
219
- visible=True
220
- )
221
  else:
222
- return gr.Dropdown(visible=False)
 
 
 
 
 
 
 
 
 
 
 
223
 
224
  def reset_em_sliders(
225
  deepmoji_enabled,
@@ -228,6 +248,7 @@ def reset_em_sliders(
228
  sad,
229
  surprise
230
  ):
 
231
  if (deepmoji_enabled):
232
  return (0, 0, 0, 0)
233
  else:
@@ -245,6 +266,7 @@ def toggle_deepmoji(
245
  sad,
246
  surprise
247
  ):
 
248
  if checked:
249
  return (0, 0, 0, 0)
250
  else:
@@ -255,183 +277,268 @@ def toggle_deepmoji(
255
  surprise
256
  )
257
 
258
- language_radio = gr.Radio(
259
- languages,
260
- value="en",
261
- label="Language",
262
- info="Will be more monotone and have an English accent. Tested mostly by a native Briton."
263
- )
 
264
 
265
  _DESCRIPTION = '''
266
  <div>
267
  <a style="display:inline-block;" href="https://github.com/DanRuta/xVA-Synth"><img src='https://img.shields.io/github/stars/DanRuta/xVA-Synth?style=social'/></a>
 
268
  <a style="display:inline-block; margin-left: .5em" href="https://discord.gg/nv7c6E2TzV"><img src='https://img.shields.io/discord/794590496202293278.svg?label=&logo=discord&logoColor=ffffff&color=7389D8&labelColor=6A7EC2'/></a>
269
- <span style="display: inline-block;margin-left: .5em;vertical-align: top;"><a href="https://huggingface.co/spaces/Pendrokar/xVASynth?duplicate=true" style="" target="_blank"><img style="margin-bottom: 0em;display: inline;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> for a personal CPU-run</span>
270
  </div>
271
  '''
272
 
273
- with gr.Blocks(css=".arpabet {background-color: gray; border-radius: 5px; font-size: 120%; padding: 0 0.1em; margin: 0 0.1em; text-align: center}") as demo:
274
- gr.Markdown("# xVASynth TTS")
275
 
276
- gr.HTML(label="description", value=_DESCRIPTION)
 
 
 
 
 
277
 
278
- with gr.Row(): # Main row for inputs and language selection
279
- with gr.Column(): # Input column
280
- input_textbox = gr.Textbox(
281
- label="Input Text",
282
- value="This is what my voice sounds like.",
283
- info="Also accepts ARPAbet symbols placed within {} brackets.",
284
- lines=1,
285
- max_lines=5,
286
- autofocus=True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
287
  )
288
- language_radio = gr.Radio(
289
- languages,
290
- value="en",
291
- label="Language",
292
- info="Will be more monotone and have an English accent. Tested mostly by a native Briton."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
293
  )
294
 
295
- with gr.Row():
296
- with gr.Column():
297
- en_examples_dropdown = gr.Dropdown(
298
- en_examples,
299
- value=en_examples[0],
300
- label="Example dropdown",
301
- show_label=False,
302
- info="English Examples"
303
- )
304
- with gr.Column():
305
- pacing_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Duration")
306
- with gr.Column(): # Control column
307
- voice_radio = gr.Radio(
308
- voice_models,
309
- value="ccby_nvidia_hifi_6671_M",
310
- label="Voice",
311
- info="NVIDIA HIFI CC-BY-4.0 xVAPitch voice model"
312
  )
313
- pitch_slider = gr.Slider(0, 1.0, value=0.5, step=0.05, label="Pitch", visible=False)
314
- energy_slider = gr.Slider(0.1, 1.0, value=1.0, step=0.05, label="Energy", visible=False)
315
- with gr.Row(): # Main row for inputs and language selection
316
- with gr.Column(): # Input column
317
- anger_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="๐Ÿ˜  Anger", info="Tread lightly beyond 0.9")
318
- sad_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="๐Ÿ˜ญ Sadness", info="Duration increased when beyond 0.2")
319
- with gr.Column(): # Input column
320
- happy_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="๐Ÿ˜ƒ Happiness", info="Tread lightly beyond 0.7")
321
- surprise_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="๐Ÿ˜ฎ Surprise", info="Can oversaturate Happiness")
322
- deepmoji_checkbox = gr.Checkbox(label="Use DeepMoji", info="Auto adjust emotional values", value=True)
323
-
324
- # Event handling using click
325
- btn = gr.Button("Generate", variant="primary")
326
-
327
- # with gr.Row(): # Main row for inputs and language selection
328
- # with gr.Column(): # Input column
329
- output_wav = gr.Audio(
330
- label="22kHz audio output",
331
- type="filepath",
332
- editable=False,
333
- autoplay=True
334
- )
335
- # with gr.Column(): # Input column
336
- output_arpabet = gr.HTML(label="ARPAbet")
337
-
338
- btn.click(
339
- fn=predict,
340
- inputs=[
341
- input_textbox,
342
- voice_radio,
343
- language_radio,
344
- pacing_slider,
345
- pitch_slider,
346
- energy_slider,
347
- anger_slider,
348
- happy_slider,
349
- sad_slider,
350
- surprise_slider,
351
- deepmoji_checkbox
352
- ],
353
- outputs=[
354
- output_wav,
355
- output_arpabet,
356
- anger_slider,
357
- happy_slider,
358
- sad_slider,
359
- surprise_slider
360
- ]
361
- )
362
-
363
- language_radio.change(
364
- set_default_text,
365
- inputs=[language_radio, deepmoji_checkbox],
366
- outputs=[input_textbox, deepmoji_checkbox]
367
- )
368
-
369
- en_examples_dropdown.change(
370
- set_example_as_input,
371
- inputs=[en_examples_dropdown],
372
- outputs=[input_textbox]
373
- )
374
-
375
- language_radio.change(
376
- toggle_example_dropdown,
377
- inputs=language_radio,
378
- outputs=en_examples_dropdown
379
- )
380
-
381
- deepmoji_checkbox.change(
382
- toggle_deepmoji,
383
- inputs=[
384
- deepmoji_checkbox,
385
- anger_slider,
386
- happy_slider,
387
- sad_slider,
388
- surprise_slider
389
- ],
390
- outputs=[
391
- anger_slider,
392
- happy_slider,
393
- sad_slider,
394
- surprise_slider
395
- ]
396
- )
397
-
398
- input_textbox.change(
399
- reset_em_sliders,
400
- inputs=[
401
- deepmoji_checkbox,
402
- anger_slider,
403
- happy_slider,
404
- sad_slider,
405
- surprise_slider
406
- ],
407
- outputs=[
408
- anger_slider,
409
- happy_slider,
410
- sad_slider,
411
- surprise_slider
412
- ]
413
- )
414
-
415
- voice_radio.change(
416
- reset_em_sliders,
417
- inputs=[
418
- deepmoji_checkbox,
419
- anger_slider,
420
- happy_slider,
421
- sad_slider,
422
- surprise_slider
423
- ],
424
- outputs=[
425
- anger_slider,
426
- happy_slider,
427
- sad_slider,
428
- surprise_slider
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
429
  ]
430
- )
431
 
432
  if __name__ == "__main__":
433
  print('running Gradio interface')
434
- # gradio_app.launch()
435
  client = Client("Pendrokar/xVASynth")
436
 
437
- demo.launch()
 
 
1
  import os
 
 
 
2
  import json
 
3
  import gradio as gr
4
  from gradio_client import Client
5
 
6
  voice_models = [
7
+ ("๐Ÿ‘จโ€๐Ÿฆณ #6671", "ccby_nvidia_hifi_6671_M"),
8
+ ("๐Ÿ‘ฑโ€โ™€๏ธ ๐Ÿ‡ฌ๐Ÿ‡ง #92", "ccby_nvidia_hifi_92_F"),
9
+ ]
10
+ voice_models_more = [
11
+ ("๐Ÿง” #6670", "ccby_nvidia_hifi_6670_M"),
12
  ("Male #9017", "ccby_nvidia_hifi_9017_M"),
13
  ("Male #6097", "ccby_nvidia_hifi_6097_M"),
14
+ ("๐Ÿ‘ฉโ€๐Ÿฆฑ #12787", "ccby_nvidia_hifi_12787_F"),
15
+ ("๐Ÿ‘ต #11614", "ccby_nv_hifi_11614_F"),
 
 
16
  ("Female #8051", "ccby_nvidia_hifi_8051_F"),
17
+ ("๐Ÿ‘ฉโ€๐Ÿฆณ #11697", "ccby_nvidia_hifi_11697_F"),
18
  ("Female #9136", "ccby_nvidia_hifi_9136_F"),
19
  ]
 
20
 
21
  # order ranked by similarity to English due to the xVASynth's use of ARPAbet instead of IPA
22
  languages = [
23
+ ("๐Ÿ‡บ๐Ÿ‡ธ EN", "en"),
24
  ("๐Ÿ‡ฉ๐Ÿ‡ช DE", "de"),
25
  ("๐Ÿ‡ช๐Ÿ‡ธ ES", "es"),
26
+ ("๐Ÿ‡ฎ๐Ÿ‡ณ HI", "hi"),
27
+ ("๐Ÿ‡จ๐Ÿ‡ณ ZH", "zh"),
28
+ ]
29
+ languages_more = [
30
  ("๐Ÿ‡ณ๐Ÿ‡ฑ NL", "nl"),
31
+ ("๐Ÿ‡ง๐Ÿ‡ท PT", "pt"),
32
+ ("๐Ÿ‡ฎ๐Ÿ‡น IT", "it"),
33
  ("๐Ÿ‡ต๐Ÿ‡ฑ PL", "pl"),
34
  ("๐Ÿ‡ท๐Ÿ‡ด RO", "ro"),
35
  ("๐Ÿ‡ธ๐Ÿ‡ช SV", "sv"),
 
39
  ("๐Ÿ‡ฌ๐Ÿ‡ท EL", "el"),
40
  ("๐Ÿ‡ซ๐Ÿ‡ท FR", "fr"),
41
  ("๐Ÿ‡ท๐Ÿ‡บ RU", "ru"),
42
+ ("๐Ÿ‡บ๐Ÿ‡ฆ UA", "uk"),
43
  ("๐Ÿ‡น๐Ÿ‡ท TR", "tr"),
44
  ("๐Ÿ‡ธ๐Ÿ‡ฆ AR", "ar"),
 
45
  ("๐Ÿ‡ฏ๐Ÿ‡ต JP", "jp"),
46
  ("๐Ÿ‡ฐ๐Ÿ‡ท KO", "ko"),
 
47
  ("๐Ÿ‡ป๐Ÿ‡ณ VI", "vi"),
48
  ("๐Ÿ‡ป๐Ÿ‡ฆ LA", "la"),
 
 
49
  ("๐Ÿ‡ณ๐Ÿ‡ฌ YO", "yo"),
50
+ ("Swahili", "sw"),
51
+ ("Hausa", "ha"),
52
+ ("Wolof", "wo"),
53
  ]
54
 
55
  # Translated from English by DeepMind's Gemini Pro
 
84
  "zh": "่ฟ™ๆ˜ฏๆˆ‘็š„ๅฃฐ้Ÿณใ€‚",
85
  }
86
 
87
+ # Component defaults
88
+ input_textbox_init = {
89
+ 'label': "Input Text",
90
+ 'value': "This is what my voice sounds like.",
91
+ 'info': "Also accepts ARPAbet symbols placed within {} brackets.",
92
+ 'lines': 1,
93
+ 'max_lines': 5,
94
+ 'autofocus': True,
95
+ }
96
+ pacing_slider_init = {
97
+ 'value': 1.0,
98
+ 'minimum': 0.5,
99
+ 'maximum': 2.0,
100
+ 'step': 0.1,
101
+ 'label': "Duration",
102
+ }
103
+ pitch_slider_init = {
104
+ 'minimum': 0,
105
+ 'maximum': 1.0,
106
+ 'value': 0.5,
107
+ 'step': 0.05,
108
+ 'label': "Pitch",
109
+ 'visible': False,
110
+ }
111
+ energy_slider_init = {
112
+ 'minimum': 0.1,
113
+ 'maximum': 1.0,
114
+ 'value': 1.0,
115
+ 'step': 0.05,
116
+ 'label': "Energy",
117
+ 'visible': False,
118
+ }
119
+ anger_slider_init = {
120
+ 'minimum': 0,
121
+ 'maximum': 1.0,
122
+ 'value': 0,
123
+ 'step': 0.05,
124
+ 'label': "๐Ÿ˜  Anger",
125
+ 'info': "Tread lightly beyond 0.9",
126
+ }
127
+ happy_slider_init = {
128
+ 'minimum': 0,
129
+ 'maximum': 1.0,
130
+ 'value': 0,
131
+ 'step': 0.05,
132
+ 'label': "๐Ÿ˜ƒ Happiness",
133
+ 'info': "Tread lightly beyond 0.7",
134
+ }
135
+ sad_slider_init = {
136
+ 'minimum': 0,
137
+ 'maximum': 1.0,
138
+ 'value': 0,
139
+ 'step': 0.05,
140
+ 'label': "๐Ÿ˜ญ Sadness",
141
+ 'info': "Duration increased when beyond 0.2",
142
+ }
143
+ surprise_slider_init = {
144
+ 'minimum': 0,
145
+ 'maximum': 1.0,
146
+ 'value': 0,
147
+ 'step': 0.05,
148
+ 'label': "๐Ÿ˜ฎ Surprise",
149
+ 'info': "Does not play well with Happiness with either being beyond 0.3",
150
+ }
151
+ voice_radio_init = {
152
+ 'choices': [*voice_models, (f'+{len(voice_models_more)}', 'more')],
153
+ 'value': "ccby_nvidia_hifi_6671_M",
154
+ 'label': "Voice",
155
+ 'info': "NVIDIA HIFI CC-BY-4.0 xVAPitch voice model"
156
+ }
157
+ deepmoji_checkbox_init = {
158
+ 'label': "Use DeepMoji",
159
+ 'info': "Auto adjust emotional values for English",
160
+ 'value': True,
161
+ 'interactive': True
162
+ }
163
+
164
def more_lang_options(lang):
    """Expand the language radio when its "+N" placeholder entry is chosen.

    Args:
        lang: Value currently selected in the language radio.

    Returns:
        ``lang`` unchanged for any real language code. When ``lang`` is the
        ``'more'`` placeholder, a ``gr.Radio`` built from
        ``language_radio_init`` whose choices are the short list followed by
        ``languages_more`` (the selected value comes from the init dict).
    """
    if lang == 'more':
        # Copy so the shared defaults keep their collapsed choice list.
        expanded_init = dict(language_radio_init)
        expanded_init['choices'] = [*languages, *languages_more]
        return gr.Radio(**expanded_init)

    return lang
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
 
173
def set_default_text(lang, deepmoji_checked):
    """Build the textbox update and DeepMoji state after a language change.

    Args:
        lang: Selected language code, or ``'more'`` when the "+N"
            placeholder entry of the language radio was clicked.
        deepmoji_checked: Current state of the "Use DeepMoji" checkbox.

    Returns:
        A ``(gr.Textbox, bool)`` pair: the textbox re-created from
        ``input_textbox_init`` with the sample sentence for ``lang``, and the
        DeepMoji flag, which is forced to ``False`` for non-English languages.
    """
    textbox_init = {**input_textbox_init}

    if lang == 'more':
        # The placeholder is not a real language: keep the English sample and
        # leave the DeepMoji state alone.
        textbox_init['value'] = default_text['en']
        return gr.Textbox(**textbox_init), deepmoji_checked

    # Fall back to the English sample for a language without a translation,
    # mirroring the 'more' branch instead of raising KeyError in the callback.
    textbox_init['value'] = default_text.get(lang, default_text['en'])

    # DeepMoji only works on English text.
    # FIXME: the checkbox is only unticked, not disabled or relabelled, because
    # returning a reconfigured gr.Checkbox conflicts with the toggle_deepmoji
    # event listener.
    if lang != 'en':
        deepmoji_checked = False

    return gr.Textbox(**textbox_init), deepmoji_checked
197
 
198
+ # examples component
199
  en_examples = [
200
  "This is what my voice sounds like.",
201
  "If there is anything else you need, feel free to ask.",
 
209
  # ARPAbet example
210
  "This { IH1 Z } { W AH1 T } { M AY1 } { V OY1 S } { S AW1 N D Z } like.",
211
  ]
212
+ en_examples_dropdown_init = {
213
+ 'choices': en_examples,
214
+ 'value': en_examples[0],
215
+ 'label': "Example dropdown",
216
+ 'show_label': False,
217
+ 'info': "English Examples",
218
+ 'visible': True
219
+ }
220
 
221
def set_example_as_input(example_text):
    """Pass the chosen example sentence through unchanged.

    Wired as the change handler of the examples dropdown, with the input
    textbox as its output.
    """
    chosen_example = example_text
    return chosen_example
224
 
225
def toggle_example_dropdown(lang):
    """Show the English examples dropdown only while English is selected.

    Args:
        lang: Value currently selected in the language radio.

    Returns:
        A ``gr.Dropdown`` built from ``en_examples_dropdown_init`` with
        ``visible`` set to whether ``lang`` is ``'en'``.
    """
    is_english = lang == 'en'
    # Merge into a fresh dict so the shared defaults are never mutated.
    return gr.Dropdown(**{**en_examples_dropdown_init, 'visible': is_english})
234
+
235
def more_voice_options(voice):
    """Expand the voice radio when its "+N" placeholder entry is chosen.

    Args:
        voice: Value currently selected in the voice radio.

    Returns:
        ``voice`` unchanged for any real voice model id. When ``voice`` is the
        ``'more'`` placeholder, a ``gr.Radio`` built from ``voice_radio_init``
        whose choices are ``voice_models`` followed by ``voice_models_more``
        (the selected value comes from the init dict).
    """
    if voice == 'more':
        # Copy so the shared defaults keep their collapsed choice list.
        expanded_init = dict(voice_radio_init)
        expanded_init['choices'] = [*voice_models, *voice_models_more]
        return gr.Radio(**expanded_init)

    return voice
243
 
244
  def reset_em_sliders(
245
  deepmoji_enabled,
 
248
  sad,
249
  surprise
250
  ):
251
+ # print('reset_em_sliders')
252
  if (deepmoji_enabled):
253
  return (0, 0, 0, 0)
254
  else:
 
266
  sad,
267
  surprise
268
  ):
269
+ # print('toggle_deepmoji')
270
  if checked:
271
  return (0, 0, 0, 0)
272
  else:
 
277
  surprise
278
  )
279
 
280
+ # languages component
281
+ language_radio_init = {
282
+ 'choices': [*languages, *[(f'+{len(languages_more)}', 'more')]],
283
+ 'value': "en",
284
+ 'label': "Language",
285
+ 'info': "Will be more monotone and have an English accent."
286
+ }
287
 
288
  _DESCRIPTION = '''
289
  <div>
290
  <a style="display:inline-block;" href="https://github.com/DanRuta/xVA-Synth"><img src='https://img.shields.io/github/stars/DanRuta/xVA-Synth?style=social'/></a>
291
+ <a style="display:inline-block;" href="https://www.nexusmods.com/skyrimspecialedition/mods/44184"><img src='https://img.shields.io/badge/Endorsements-3.4k-blue?logo=nexusmods'/></a>
292
  <a style="display:inline-block; margin-left: .5em" href="https://discord.gg/nv7c6E2TzV"><img src='https://img.shields.io/discord/794590496202293278.svg?label=&logo=discord&logoColor=ffffff&color=7389D8&labelColor=6A7EC2'/></a>
293
+ <span style="display: inline-block;margin-left: .5em;vertical-align: top;"><a href="https://huggingface.co/spaces/Pendrokar/xVASynth?duplicate=true" style="" target="_blank"><img style="margin-bottom: 0em;display: inline;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> for a personal CPU-run one</span>
294
  </div>
295
  '''
296
 
 
 
297
 
298
+ class BlocksDemo:
299
+ def __init__(self):
300
+ with gr.Blocks(css=".arpabet {background-color: gray; border-radius: 5px; font-size: 120%; padding: 0 0.1em; margin: 0 0.1em; text-align: center}") as demo:
301
+ gr.Markdown("# xVASynth TTS")
302
+
303
+ gr.HTML(label="description", value=_DESCRIPTION)
304
 
305
+ with gr.Row(): # Main row for inputs and language selection
306
+ with gr.Column(): # Input column
307
+ input_textbox = gr.Textbox(**input_textbox_init)
308
+ language_radio = gr.Radio(**language_radio_init)
309
+
310
+ # remove autofocus
311
+ input_textbox_init['autofocus'] = False
312
+
313
+ with gr.Row():
314
+ with gr.Column():
315
+ en_examples_dropdown = gr.Dropdown(**en_examples_dropdown_init)
316
+ with gr.Column():
317
+ pacing_slider = gr.Slider(**pacing_slider_init)
318
+ with gr.Column(): # Control column
319
+ voice_radio = gr.Radio(**voice_radio_init)
320
+ pitch_slider = gr.Slider(**pitch_slider_init)
321
+ energy_slider = gr.Slider(**energy_slider_init)
322
+ with gr.Row(): # Main row for inputs and language selection
323
+ with gr.Column(): # Input column
324
+ anger_slider = gr.Slider(**anger_slider_init)
325
+ sad_slider = gr.Slider(**sad_slider_init)
326
+ with gr.Column(): # Input column
327
+ happy_slider = gr.Slider(**happy_slider_init)
328
+ surprise_slider = gr.Slider(**surprise_slider_init)
329
+ deepmoji_checkbox = gr.Checkbox(**deepmoji_checkbox_init)
330
+
331
+ # Event handling using click
332
+ btn = gr.Button("Generate", variant="primary")
333
+
334
+ # with gr.Row(): # Main row for inputs and language selection
335
+ # with gr.Column(): # Input column
336
+ output_wav = gr.Audio(
337
+ label="22kHz audio output",
338
+ type="filepath",
339
+ editable=False,
340
+ autoplay=True
341
  )
342
+ # with gr.Column(): # Input column
343
+ output_arpabet = gr.HTML(label="ARPAbet")
344
+
345
+ btn.click(
346
+ fn=self.predict,
347
+ inputs=[
348
+ input_textbox,
349
+ voice_radio,
350
+ language_radio,
351
+ pacing_slider,
352
+ pitch_slider,
353
+ energy_slider,
354
+ anger_slider,
355
+ happy_slider,
356
+ sad_slider,
357
+ surprise_slider,
358
+ deepmoji_checkbox
359
+ ],
360
+ outputs=[
361
+ output_wav,
362
+ output_arpabet,
363
+ anger_slider,
364
+ happy_slider,
365
+ sad_slider,
366
+ surprise_slider
367
+ ]
368
  )
369
 
370
+ # more languages option
371
+ language_radio.change(
372
+ more_lang_options,
373
+ inputs=language_radio,
374
+ outputs=language_radio,
375
+ trigger_mode='once',
376
+ show_progress='hidden',
 
 
 
 
 
 
 
 
 
 
377
  )
378
+
379
+ # more voices option
380
+ voice_radio.change(
381
+ more_voice_options,
382
+ inputs=voice_radio,
383
+ outputs=voice_radio,
384
+ trigger_mode='once',
385
+ show_progress='hidden',
386
+ queue=False,
387
+ )
388
+
389
+ # set default text
390
+ language_radio.change(
391
+ set_default_text,
392
+ inputs=[language_radio, deepmoji_checkbox],
393
+ outputs=[input_textbox, deepmoji_checkbox],
394
+ show_progress='hidden',
395
+ queue=False,
396
+ )
397
+
398
+ # toggle en examples
399
+ language_radio.change(
400
+ toggle_example_dropdown,
401
+ inputs=language_radio,
402
+ outputs=en_examples_dropdown,
403
+ show_progress='hidden',
404
+ queue=False,
405
+ )
406
+
407
+ en_examples_dropdown.change(
408
+ set_example_as_input,
409
+ inputs=[en_examples_dropdown],
410
+ outputs=[input_textbox],
411
+ show_progress='hidden',
412
+ queue=False,
413
+ )
414
+
415
+ deepmoji_checkbox.change(
416
+ toggle_deepmoji,
417
+ inputs=[
418
+ deepmoji_checkbox,
419
+ anger_slider,
420
+ happy_slider,
421
+ sad_slider,
422
+ surprise_slider
423
+ ],
424
+ outputs=[
425
+ anger_slider,
426
+ happy_slider,
427
+ sad_slider,
428
+ surprise_slider
429
+ ],
430
+ show_progress='hidden',
431
+ queue=False,
432
+ )
433
+
434
+ input_textbox.change(
435
+ reset_em_sliders,
436
+ inputs=[
437
+ deepmoji_checkbox,
438
+ anger_slider,
439
+ happy_slider,
440
+ sad_slider,
441
+ surprise_slider
442
+ ],
443
+ outputs=[
444
+ anger_slider,
445
+ happy_slider,
446
+ sad_slider,
447
+ surprise_slider
448
+ ],
449
+ show_progress='hidden',
450
+ queue=False,
451
+ )
452
+
453
+ voice_radio.change(
454
+ reset_em_sliders,
455
+ inputs=[
456
+ deepmoji_checkbox,
457
+ anger_slider,
458
+ happy_slider,
459
+ sad_slider,
460
+ surprise_slider
461
+ ],
462
+ outputs=[
463
+ anger_slider,
464
+ happy_slider,
465
+ sad_slider,
466
+ surprise_slider
467
+ ],
468
+ show_progress='hidden',
469
+ queue=False,
470
+ )
471
+
472
+ self.block = demo
473
+
474
def predict(
    self,
    input_text,
    voice,
    lang,
    pacing,
    pitch,
    energy,
    anger,
    happy,
    sad,
    surprise,
    deepmoji_checked
):
    """Synthesize speech through the remote Space and format its response.

    Forwards every UI value to the ``/predict`` endpoint of the module-level
    ``client`` (assigned in the ``__main__`` block), then rebuilds the ARPAbet
    table locally from the raw response payload.

    Args:
        input_text: Text to synthesize.
        voice: Voice model id selected in the 'Voice' radio.
        lang: Language code selected in the 'Language' radio.
        pacing: 'Duration' slider value (0.5 to 2.0).
        pitch: 'Pitch' slider value (0 to 1.0).
        energy: 'Energy' slider value (0.1 to 1.0).
        anger: 'Anger' slider value (0 to 1.0).
        happy: 'Happiness' slider value (0 to 1.0).
        sad: 'Sadness' slider value (0 to 1.0).
        surprise: 'Surprise' slider value (0 to 1.0).
        deepmoji_checked: Whether DeepMoji should adjust the emotion values.

    Returns:
        A list ``[wav_path, arpabet_html, angry, happy, sad, surprise]`` where
        the four emotion values are read from the response payload and rounded
        to two decimals.
    """
    # The endpoint returns seven values; only the audio path and the raw
    # response string are used, the rest is recomputed from the payload below.
    wav_path, _html, _angry, _happy, _sad, _surprise, response = client.predict(
        input_text,
        voice,
        lang,
        pacing,
        pitch,
        energy,
        anger,
        happy,
        sad,
        surprise,
        deepmoji_checked,
        api_name="/predict"
    )

    try:
        # Prefer the payload as-is so apostrophes inside valid JSON survive.
        json_data = json.loads(response)
    except json.JSONDecodeError:
        # Single-quoted payload: normalise the quotes before parsing.
        json_data = json.loads(response.replace("'", '"'))

    # Pair every visible symbol with its duration; padding tokens are skipped
    # but still consume an index in the durations list.
    arpabet_symbols = json_data['arpabet'].split('|')
    timed_symbols = [
        (symbol, float(json_data['durations'][symb_i]))
        for symb_i, symbol in enumerate(arpabet_symbols)
        if symbol != '<PAD>'
    ]
    total_dur_length = sum(duration for _, duration in timed_symbols)

    cells = []
    for symbol, duration in timed_symbols:
        # Cell width is the symbol's share of the total duration; guard the
        # all-zero case so it cannot raise ZeroDivisionError.
        if total_dur_length:
            cell_width = round(duration / total_dur_length * 100, 2)
        else:
            cell_width = 0.0
        cells.append(
            '<td class="arpabet" style="width: '
            + str(cell_width)
            + '%">'
            + symbol
            + '</td> '
        )

    arpabet_html = (
        '<h6>ARPAbet & Durations</h6>'
        '<table style="margin: 0 var(--size-2)"><tbody><tr>'
        + ''.join(cells)
        + '</tr></tbody></table>'
    )

    return [
        wav_path,
        arpabet_html,
        round(json_data['em_angry'][0], 2),
        round(json_data['em_happy'][0], 2),
        round(json_data['em_sad'][0], 2),
        round(json_data['em_surprise'][0], 2)
    ]
 
538
 
539
  if __name__ == "__main__":
540
  print('running Gradio interface')
 
541
  client = Client("Pendrokar/xVASynth")
542
 
543
+ demo = BlocksDemo()
544
+ demo.block.launch()