Pendrokar committed
Commit 66182b2 · 1 Parent(s): eef0c06

Gradio Client TTS support

Files changed (3):
  1. app/config.py +2 -1
  2. app/models.py +333 -12
  3. app/synth.py +181 -42
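
This commit swaps the single hard-wired router call for direct `gradio_client` calls against public Hugging Face Spaces. Reduced to its core, the pattern the new code follows looks like this minimal sketch (the Space ID is taken from the diff below; passing a token is optional but avoids anonymous rate limits):

    import os
    from gradio_client import Client

    # Connect to a public Space listed in AVAILABLE_MODELS below
    client = Client("Pendrokar/xVASynth-TTS", hf_token=os.getenv('HF_TOKEN'))

    # Enumerate its endpoints and their parameters; app/synth.py uses this
    # same call to build the positional input list for predict()
    endpoints = client.view_api(all_endpoints=True, print_info=False, return_format='dict')
    print(list(endpoints['named_endpoints'].keys()))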
app/config.py CHANGED
@@ -15,7 +15,8 @@ MIN_SAMPLE_TXT_LENGTH = 10 # Minimum text length (characters)
 
 DB_PATH = f"/data/{DB_NAME}" if os.path.isdir("/data") else DB_NAME # If /data available => means local storage is enabled => let's use it!
 
-ROUTER_ID = "TTS-AGI/tts-router" # You should use a router space to route TTS models to avoid exposing your API keys!
+ROUTER_ID = "Pendrokar/xVASynth-TTS" # You should use a router space to route TTS models to avoid exposing your API keys!
+# ROUTER_ID = "TTS-AGI/tts-router" # You should use a router space to route TTS models to avoid exposing your API keys!
 
 SYNC_DB = True # Sync DB to HF dataset?
 DB_DATASET_ID = os.getenv('DATASET_ID') # HF dataset ID, can be None if not syncing
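
For reference, the Space named by ROUTER_ID is still reached through the same client machinery as a fallback path. A minimal sketch of that fallback call, assuming this repo's package layout; the text and model slug are illustrative, and /synthesize is the endpoint app/synth.py uses below:

    import os
    from gradio_client import Client

    from app.config import ROUTER_ID  # assumed import path

    # The router Space proxies to the TTS backends so API keys stay server-side
    router = Client(ROUTER_ID, hf_token=os.getenv('HF_TOKEN'))
    audio_path = router.predict("Hello world.", "xtts", api_name="/synthesize")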
app/models.py CHANGED
@@ -1,21 +1,342 @@
+from gradio_client import handle_file
+
 # Models to include in the leaderboard, only include models that users can vote on
 AVAILABLE_MODELS = {
-    'XTTSv2': 'xtts',
+    # 'XTTSv2': 'xtts',
     # 'WhisperSpeech': 'whisperspeech',
-    'ElevenLabs': 'eleven',
+    # 'ElevenLabs': 'eleven',
     # 'OpenVoice': 'openvoice',
-    'OpenVoice V2': 'openvoicev2',
-    'Play.HT 2.0': 'playht',
-    'Play.HT 3.0 Mini': 'playht3',
+    # 'OpenVoice V2': 'openvoicev2',
+    # 'Play.HT 2.0': 'playht',
+    # 'Play.HT 3.0 Mini': 'playht3',
     # 'MetaVoice': 'metavoice',
-    'MeloTTS': 'melo',
-    'StyleTTS 2': 'styletts2',
-    'GPT-SoVITS': 'sovits',
+    # 'MeloTTS': 'melo',
+    # 'StyleTTS 2': 'styletts2',
+    # 'GPT-SoVITS': 'sovits',
     # 'Vokan TTS': 'vokan',
-    'VoiceCraft 2.0': 'voicecraft',
-    'Parler TTS': 'parler',
-    'Parler TTS Large': 'parlerlarge',
-    'Fish Speech v1.4': 'fish',
+    # 'VoiceCraft 2.0': 'voicecraft',
+    # 'Parler TTS': 'parler',
+    # 'Parler TTS Large': 'parlerlarge',
+    # 'Fish Speech v1.4': 'fish',
+
+    # HF Gradio Spaces: # <works with gradio version #>
+    # gradio version that works with most spaces: 4.29
+    # 'coqui/xtts': 'coqui/xtts', # 4.29 4.32
+    # 'collabora/WhisperSpeech': 'collabora/WhisperSpeech', # 4.32 4.36.1
+    # 'myshell-ai/OpenVoice': 'myshell-ai/OpenVoice', # same devs as MeloTTS, which scores higher # 4.29
+    # 'myshell-ai/OpenVoiceV2': 'myshell-ai/OpenVoiceV2', # same devs as MeloTTS, which scores higher # 4.29
+    # 'mrfakename/MetaVoice-1B-v0.1': 'mrfakename/MetaVoice-1B-v0.1', # 4.29 4.32
+    'Pendrokar/xVASynth-TTS': 'Pendrokar/xVASynth-TTS', # 4.29 4.32 4.42.0
+    # 'coqui/CoquiTTS': 'coqui/CoquiTTS',
+    # 'mrfakename/MeloTTS': 'mrfakename/MeloTTS', # 4.29 4.32
+    # 'fishaudio/fish-speech-1': 'fishaudio/fish-speech-1', # 4.29 4.32 4.36.1
+
+    # E2 & F5 TTS
+    # F5 model
+    # 'mrfakename/E2-F5-TTS': 'mrfakename/E2-F5-TTS', # 5.0
+
+    # # Parler
+    # Parler Large model
+    # 'parler-tts/parler_tts': 'parler-tts/parler_tts', # 4.29 4.32 4.36.1 4.42.0
+    # Parler Mini model
+    # 'parler-tts/parler_tts': 'parler-tts/parler_tts', # 4.29 4.32 4.36.1 4.42.0
+    # 'parler-tts/parler_tts_mini': 'parler-tts/parler_tts_mini', # Mini is the default model of parler_tts
+    # 'parler-tts/parler-tts-expresso': 'parler-tts/parler-tts-expresso', # 4.29 4.32 4.36.1 4.42.0
+
+    # # Microsoft Edge TTS
+    # 'innoai/Edge-TTS-Text-to-Speech': 'innoai/Edge-TTS-Text-to-Speech', # 4.29
+
+    # IMS-Toucan
+    # 'Flux9665/MassivelyMultilingualTTS': 'Flux9665/MassivelyMultilingualTTS', # 5.1
+
+    # HF TTS with issues
+    'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS', # unresponsive to exclamation marks # 4.29
+    # 'PolyAI/pheme': '/predict#0', # sleepy HF Space
+    # 'amphion/Text-to-Speech': '/predict#0', # disabled also on original HF space due to poor ratings
+    # 'suno/bark': '3#0', # Hallucinates
+    # 'shivammehta25/Matcha-TTS': '5#0', # seems to require multiple requests for setup
+    # 'styletts2/styletts2': '0#0', # API disabled, awaiting approval of PR #15
+    # 'Manmay/tortoise-tts': '/predict#0', # Cannot retrieve streamed file; 403
+    # 'pytorch/Tacotron2': '0#0', # old gradio
+}
+
+HF_SPACES = {
+    # XTTS v2
+    'coqui/xtts': {
+        'name': 'XTTS v2',
+        'function': '1',
+        'text_param_index': 0,
+        'return_audio_index': 1,
+        'series': 'XTTS',
+    },
+    # WhisperSpeech
+    'collabora/WhisperSpeech': {
+        'name': 'WhisperSpeech',
+        'function': '/whisper_speech_demo',
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'series': 'WhisperSpeech',
+    },
+    # OpenVoice (MyShell.ai)
+    'myshell-ai/OpenVoice': {
+        'name': 'OpenVoice',
+        'function': '1',
+        'text_param_index': 0,
+        'return_audio_index': 1,
+        'series': 'OpenVoice',
+    },
+    # OpenVoice v2 (MyShell.ai)
+    'myshell-ai/OpenVoiceV2': {
+        'name': 'OpenVoice v2',
+        'function': '1',
+        'text_param_index': 0,
+        'return_audio_index': 1,
+        'series': 'OpenVoice',
+    },
+    # MetaVoice
+    'mrfakename/MetaVoice-1B-v0.1': {
+        'name': 'MetaVoice-1B',
+        'function': '/tts',
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'series': 'MetaVoice-1B',
+    },
+    # xVASynth (CPU)
+    'Pendrokar/xVASynth-TTS': {
+        'name': 'xVASynth v3',
+        'function': '/predict',
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'series': 'xVASynth',
+    },
+    # CoquiTTS (CPU)
+    'coqui/CoquiTTS': {
+        'name': 'CoquiTTS',
+        'function': '0',
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'series': 'CoquiTTS',
+    },
+    # HierSpeech_TTS
+    'LeeSangHoon/HierSpeech_TTS': {
+        'name': 'HierSpeech++',
+        'function': '/predict',
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'series': 'HierSpeech++',
+    },
+    # MeloTTS (MyShell.ai)
+    'mrfakename/MeloTTS': {
+        'name': 'MeloTTS',
+        'function': '/synthesize',
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'series': 'MeloTTS',
+    },
+
+    # Parler
+    'parler-tts/parler_tts': {
+        'name': 'Parler Mini',
+        'function': '/gen_tts',
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'is_zero_gpu_space': True,
+        'series': 'Parler',
+    },
+    # Parler Large
+    # 'parler-tts/parler_tts': {
+    #     'name': 'Parler Large',
+    #     'function': '/gen_tts',
+    #     'text_param_index': 0,
+    #     'return_audio_index': 0,
+    #     'is_zero_gpu_space': True,
+    #     'series': 'Parler',
+    # },
+    # Parler Mini using the Expresso dataset
+    'parler-tts/parler-tts-expresso': {
+        'name': 'Parler Mini Expresso',
+        'function': '/gen_tts',
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'is_zero_gpu_space': True,
+        'series': 'Parler',
+    },
+
+    # Microsoft Edge TTS
+    'innoai/Edge-TTS-Text-to-Speech': {
+        'name': 'Edge TTS',
+        'function': '/predict',
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'is_proprietary': True,
+        'series': 'Edge TTS',
+    },
+
+    # Fish Speech
+    'fishaudio/fish-speech-1': {
+        'name': 'Fish Speech',
+        'function': '/inference_wrapper',
+        'text_param_index': 0,
+        'return_audio_index': 1,
+        'series': 'Fish Speech',
+    },
+
+    # E2/F5 TTS
+    'mrfakename/E2-F5-TTS': {
+        'name': 'F5 of E2 TTS',
+        'function': '/infer',
+        'text_param_index': 2,
+        'return_audio_index': 0,
+        'is_zero_gpu_space': True,
+        'series': 'E2/F5 TTS',
+    },
+
+    # IMS-Toucan
+    'Flux9665/MassivelyMultilingualTTS': {
+        'name': 'IMS-Toucan',
+        'function': '/predict',
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'series': 'IMS-Toucan',
+    },
+
+    # IMS-Toucan English non-artificial
+    'Flux9665/EnglishToucan': {
+        'name': 'IMS-Toucan EN',
+        'function': '/predict',
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'series': 'IMS-Toucan',
+    },
+
+    # StyleTTS v2
+    'Pendrokar/style-tts-2': {
+        'name': 'StyleTTS v2',
+        'function': '/synthesize',
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'is_zero_gpu_space': True,
+        'series': 'StyleTTS',
+    },
+}
+
+# for zero-shot TTS - voice sample used by XTTS (11 seconds)
+DEFAULT_VOICE_SAMPLE_STR = 'https://cdn-uploads.huggingface.co/production/uploads/63d52e0c4e5642795617f668/V6-rMmI-P59DA4leWDIcK.wav'
+DEFAULT_VOICE_SAMPLE = handle_file(DEFAULT_VOICE_SAMPLE_STR)
+DEFAULT_VOICE_TRANSCRIPT = "The Hispaniola was rolling scuppers under in the ocean swell. The booms were tearing at the blocks, the rudder was banging to and fro, and the whole ship creaking, groaning, and jumping like a manufactory."
+
+OVERRIDE_INPUTS = {
+    'coqui/xtts': {
+        1: 'en',
+        2: DEFAULT_VOICE_SAMPLE_STR, # voice sample
+        3: None, # mic voice sample
+        4: False, # use_mic
+        5: False, # cleanup_reference
+        6: False, # auto_detect
+    },
+    'collabora/WhisperSpeech': {
+        1: DEFAULT_VOICE_SAMPLE, # voice sample
+        2: DEFAULT_VOICE_SAMPLE, # voice sample URL
+        3: 14.0, # Tempo - Gradio Slider issue: takes min. rather than value
+    },
+    'myshell-ai/OpenVoice': {
+        1: 'default', # style
+        2: 'https://huggingface.co/spaces/myshell-ai/OpenVoiceV2/resolve/main/examples/speaker0.mp3', # voice sample
+    },
+    'myshell-ai/OpenVoiceV2': {
+        1: 'en_us', # style
+        2: 'https://huggingface.co/spaces/myshell-ai/OpenVoiceV2/resolve/main/examples/speaker0.mp3', # voice sample
+    },
+    'PolyAI/pheme': {
+        1: 'YOU1000000044_S0000798', # voice
+        2: 210,
+        3: 0.7, # Tempo - Gradio Slider issue: takes min. rather than value
+    },
+    'Pendrokar/xVASynth-TTS': {
+        1: 'x_ex04', # fine-tuned voice model name
+        3: 1.0, # pacing/duration - Gradio Slider issue: takes min. rather than value
+    },
+    'suno/bark': {
+        1: 'Speaker 3 (en)', # voice
+    },
+    'amphion/Text-to-Speech': {
+        1: 'LikeManyWaters', # voice
+    },
+    'LeeSangHoon/HierSpeech_TTS': {
+        1: handle_file('https://huggingface.co/spaces/LeeSangHoon/HierSpeech_TTS/resolve/main/example/female.wav'), # voice sample
+        2: 0.333,
+        3: 0.333,
+        4: 1,
+        5: 1,
+        6: 0,
+        7: 1111,
+    },
+    'Manmay/tortoise-tts': {
+        1: None, # text-from-file
+        2: 'angie', # voice
+        3: 'disabled', # second voice for a dialogue
+        4: 'No', # split by newline
+    },
+    'mrfakename/MeloTTS': {
+        1: 'EN-Default', # speaker; DEFAULT_VOICE_SAMPLE=EN-Default
+        2: 1, # speed
+        3: 'EN', # language
+    },
+    'mrfakename/MetaVoice-1B-v0.1': {
+        1: 5, # float (numeric value between 0.0 and 10.0) in 'Speech Stability - improves text following for a challenging speaker' Slider component
+        2: 5, # float (numeric value between 1.0 and 5.0) in 'Speaker similarity - How closely to match speaker identity and speech style.' Slider component
+        3: "Preset voices", # Literal['Preset voices', 'Upload target voice'] in 'Choose voice' Radio component
+        4: "Bria", # Literal['Bria', 'Alex', 'Jacob'] in 'Preset voices' Dropdown component
+        5: None, # filepath in 'Upload a clean sample to clone. Sample should contain 1 speaker, be between 30-90 seconds and not contain background noise.' Audio component
+    },
+    'parler-tts/parler_tts': {
+        1: 'Laura; Laura\'s female voice; very clear audio', # description/prompt
+    },
+    'parler-tts/parler-tts-expresso': {
+        1: 'Elisabeth; Elisabeth\'s female voice; very clear audio', # description/prompt
+    },
+    'innoai/Edge-TTS-Text-to-Speech': {
+        1: 'en-US-EmmaMultilingualNeural - en-US (Female)', # voice
+        2: 0, # pace rate
+        3: 0, # pitch
+    },
+
+    'fishaudio/fish-speech-1': {
+        1: True, # enable_reference_audio
+        2: handle_file('https://huggingface.co/spaces/fishaudio/fish-speech-1/resolve/main/examples/English.wav'), # reference_audio
+        3: 'In the ancient land of Eldoria, where the skies were painted with shades of mystic hues and the forests whispered secrets of old, there existed a dragon named Zephyros. Unlike the fearsome tales of dragons that plagued human hearts with terror, Zephyros was a creature of wonder and wisdom, revered by all who knew of his existence.', # reference_text
+        4: 0, # max_new_tokens
+        5: 200, # chunk_length
+        6: 0.7, # top_p
+        7: 1.2, # repetition_penalty
+        8: 0.7, # temperature
+        9: 1, # batch_infer_num
+        10: False, # if_load_asr_model
+    },
+
+    'mrfakename/E2-F5-TTS': {
+        0: DEFAULT_VOICE_SAMPLE, # voice sample
+        1: DEFAULT_VOICE_TRANSCRIPT, # transcript of sample (< 15 seconds required)
+        3: "F5-TTS", # model
+        4: False, # cleanup silence
+    },
+
+    # IMS-Toucan
+    'Flux9665/MassivelyMultilingualTTS': {
+        1: "English (eng)", # language
+        2: 0.6, # prosody_creativity
+        3: 1, # duration_scaling_factor
+        4: 41, # voice_seed
+        5: -7.5, # emb1
+        6: None, # reference_audio
+    },
+
+    # StyleTTS 2
+    'Pendrokar/style-tts-2': {
+        1: "f-us-2", # voice
+        2: 'en-us', # lang
+        3: 8, # lngsteps
+    },
 }
 
 
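Taken together, an HF_SPACES entry plus its OVERRIDE_INPUTS record is enough to drive a Space without hard-coding its signature. A minimal sketch of how app/synth.py (below) consumes the xVASynth entry; the literal text is illustrative, and the per-type casting of example inputs is simplified here:

    from gradio_client import Client

    client = Client('Pendrokar/xVASynth-TTS')

    # Seed every parameter of the endpoint with its example input...
    endpoints = client.view_api(all_endpoints=True, print_info=False, return_format='dict')
    params = endpoints['named_endpoints']['/predict']['parameters']
    inputs = [p['example_input'] for p in params]

    # ...then apply the arena's overrides and force the text parameter
    inputs[1] = 'x_ex04'        # voice model, from OVERRIDE_INPUTS
    inputs[3] = 1.0             # pacing, from OVERRIDE_INPUTS
    inputs[0] = 'Hello world.'  # text_param_index 0

    # return_audio_index 0: the first return value is the audio filepath
    result = client.predict(*inputs, api_name='/predict')
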
app/synth.py CHANGED
@@ -1,3 +1,4 @@
+import time
 from .models import *
 from .utils import *
 from .config import *
@@ -9,6 +10,9 @@ import random, os, threading, tempfile
 from langdetect import detect
 from .vote import log_text
 
+top_five = []
+hf_token = os.getenv('HF_TOKEN')
+
 def random_m():
     return random.sample(list(set(AVAILABLE_MODELS.keys())), 2)
 
@@ -17,7 +21,7 @@ def check_toxicity(text):
         return False
     return toxicity.predict(text)['toxicity'] > 0.8
 
-def synthandreturn(text):
+def synthandreturn(text, request: gr.Request):
     text = text.strip()
     if len(text) > MAX_SAMPLE_TXT_LENGTH:
         raise gr.Error(f'You exceeded the limit of {MAX_SAMPLE_TXT_LENGTH} characters')
@@ -34,51 +38,171 @@ def synthandreturn(text):
         raise gr.Error(f'You did not enter any text')
     # Check language
     try:
-        if not detect(text) == "en":
+        if (
+            text not in sents
+            and not detect(text) == "en"
+        ):
             gr.Warning('Warning: The input text may not be in English')
     except:
         pass
    # Get two random models
-    mdl1, mdl2 = random.sample(list(AVAILABLE_MODELS.keys()), 2)
-    log_text(text)
+
+    # forced model: your TTS model versus The World!!!
+    # mdl1 = 'Pendrokar/xVASynth'
+
+    # scrutinize the top five by always picking one of them
+    if (len(top_five) >= 5):
+        mdl1 = random.sample(top_five, 1)[0]
+        vsModels = dict(AVAILABLE_MODELS)
+        del vsModels[mdl1]
+        # randomize position of the forced model
+        mdl2 = random.sample(list(vsModels.keys()), 1)
+        # forced random
+        mdl1, mdl2 = random.sample(list([mdl1, mdl2[0]]), 2)
+    else:
+        # actual random
+        mdl1, mdl2 = random.sample(list(AVAILABLE_MODELS.keys()), 2)
+
     print("[debug] Using", mdl1, mdl2)
-    def predict_and_update_result(text, model, result_storage):
+    def predict_and_update_result(text, model, result_storage, request: gr.Request):
+
+        hf_headers = {}
         try:
-            if model in AVAILABLE_MODELS:
-                result = router.predict(text, AVAILABLE_MODELS[model].lower(), api_name="/synthesize")
-            else:
-                result = router.predict(text, model.lower(), api_name="/synthesize")
+            if HF_SPACES[model]['is_zero_gpu_space']:
+                hf_headers = {"X-IP-Token": request.headers['x-ip-token']}
         except:
-            raise gr.Error('Unable to call API, please try again :)')
-        print('Done with', model)
-        # try:
-        #     doresample(result)
-        # except:
-        #     pass
+            pass
+
+        # re-attempt if necessary
+        attempt_count = 0
+        max_attempts = 1 # 3 may cause 429 Too Many Requests
+        while attempt_count < max_attempts:
+            try:
+                if model in AVAILABLE_MODELS:
+                    if '/' in model:
+                        # Use public HF Space
+                        # if (model not in hf_clients):
+                        #     hf_clients[model] = Client(model, hf_token=hf_token, headers=hf_headers)
+                        mdl_space = Client(model, hf_token=hf_token, headers=hf_headers)
+
+                        # print(f"{model}: Fetching endpoints of HF Space")
+                        # assume the index is one of the first 9 return params
+                        return_audio_index = int(HF_SPACES[model]['return_audio_index'])
+                        endpoints = mdl_space.view_api(all_endpoints=True, print_info=False, return_format='dict')
+
+                        api_name = None
+                        fn_index = None
+                        end_parameters = None
+                        # has named endpoint
+                        if '/' == HF_SPACES[model]['function'][0]:
+                            # audio sync function name
+                            api_name = HF_SPACES[model]['function']
+
+                            end_parameters = _get_param_examples(
+                                endpoints['named_endpoints'][api_name]['parameters']
+                            )
+                        # has unnamed endpoint
+                        else:
+                            # endpoint index is the first character
+                            fn_index = int(HF_SPACES[model]['function'])
+
+                            end_parameters = _get_param_examples(
+                                endpoints['unnamed_endpoints'][str(fn_index)]['parameters']
+                            )
+
+                        # override some or all default parameters
+                        space_inputs = _override_params(end_parameters, model)
+
+                        # force text
+                        space_inputs[HF_SPACES[model]['text_param_index']] = text
+
+                        print(f"{model}: Sending request to HF Space")
+                        results = mdl_space.predict(*space_inputs, api_name=api_name, fn_index=fn_index)
+
+                        # return path to audio
+                        result = results
+                        if (not isinstance(results, str)):
+                            # return_audio_index may be a filepath string
+                            result = results[return_audio_index]
+                            if (isinstance(result, dict)):
+                                # return_audio_index is a dictionary
+                                result = results[return_audio_index]['value']
+                    else:
+                        # Use the private HF Space
+                        result = router.predict(text, AVAILABLE_MODELS[model].lower(), api_name="/synthesize")
+                else:
+                    result = router.predict(text, model.lower(), api_name="/synthesize")
+                break
+            except Exception as e:
+                attempt_count += 1
+                raise gr.Error(f"{model}: " + repr(e))
+                # print(f"{model}: Unable to call API (attempt: {attempt_count})")
+                # sleep for three seconds to avoid spamming the server with requests
+                # time.sleep(3)
+
+                # Fetch and store client again
+                # hf_clients[model] = Client(model, hf_token=hf_token, headers=hf_headers)
+
+        if attempt_count >= max_attempts:
+            raise gr.Error(f"{model}: Failed to call model")
+        else:
+            print('Done with', model)
+
        try:
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
                audio = AudioSegment.from_file(result)
                current_sr = audio.frame_rate
                if current_sr > 24000:
+                    print(f"{model}: Resampling")
                    audio = audio.set_frame_rate(24000)
                try:
-                    print('Trying to normalize audio')
+                    print(f"{model}: Trying to normalize audio")
                    audio = match_target_amplitude(audio, -20)
                except:
-                    print('[WARN] Unable to normalize audio')
+                    print(f"{model}: [WARN] Unable to normalize audio")
                audio.export(f.name, format="wav")
                os.unlink(result)
                result = f.name
+            gr.Info('Audio from a TTS model received')
        except:
+            print(f"{model}: [WARN] Unable to resample audio")
            pass
        if model in AVAILABLE_MODELS.keys(): model = AVAILABLE_MODELS[model]
-        print(model)
-        print(f"Running model {model}")
        result_storage[model] = result
-        # try:
-        #     doloudnorm(result)
-        # except:
-        #     pass
+
+    def _get_param_examples(parameters):
+        example_inputs = []
+        for param_info in parameters:
+            if (
+                param_info['component'] == 'Radio'
+                or param_info['component'] == 'Dropdown'
+                or param_info['component'] == 'Audio'
+                or param_info['python_type']['type'] == 'str'
+            ):
+                example_inputs.append(str(param_info['example_input']))
+                continue
+            if param_info['python_type']['type'] == 'int':
+                example_inputs.append(int(param_info['example_input']))
+                continue
+            if param_info['python_type']['type'] == 'float':
+                example_inputs.append(float(param_info['example_input']))
+                continue
+            if param_info['python_type']['type'] == 'bool':
+                example_inputs.append(bool(param_info['example_input']))
+                continue
+
+        return example_inputs
+
+    def _override_params(inputs, modelname):
+        try:
+            for key, value in OVERRIDE_INPUTS[modelname].items():
+                inputs[key] = value
+            print(f"{modelname}: Default inputs overridden by Arena")
+        except:
+            pass
+
+        return inputs
+
    mdl1k = mdl1
    mdl2k = mdl2
    print(mdl1k, mdl2k)
@@ -86,24 +210,39 @@ def synthandreturn(text):
    if mdl2 in AVAILABLE_MODELS.keys(): mdl2k=AVAILABLE_MODELS[mdl2]
    results = {}
    print(f"Sending models {mdl1k} and {mdl2k} to API")
-    thread1 = threading.Thread(target=predict_and_update_result, args=(text, mdl1k, results))
-    thread2 = threading.Thread(target=predict_and_update_result, args=(text, mdl2k, results))
-
-    thread1.start()
-    thread2.start()
-    thread1.join()
-    thread2.join()
-    #debug
-    # print(results)
-    # print(list(results.keys())[0])
-    # y, sr = librosa.load(results[list(results.keys())[0]], sr=None)
-    # print(sr)
-    # print(list(results.keys())[1])
-    # y, sr = librosa.load(results[list(results.keys())[1]], sr=None)
-    # print(sr)
-    #debug
-    # outputs = [text, btn, r2, model1, model2, aud1, aud2, abetter, bbetter, prevmodel1, prevmodel2, nxtroundbtn]
-
+
+    # do not use multithreading when both spaces are ZeroGPU type
+    if (
+        # exists
+        'is_zero_gpu_space' in HF_SPACES[mdl1]
+        # is True
+        and HF_SPACES[mdl1]['is_zero_gpu_space']
+        and 'is_zero_gpu_space' in HF_SPACES[mdl2]
+        and HF_SPACES[mdl2]['is_zero_gpu_space']
+    ):
+        # run Zero-GPU spaces one at a time
+        predict_and_update_result(text, mdl1k, results, request)
+        # _cache_sample(text, mdl1k)
+
+        predict_and_update_result(text, mdl2k, results, request)
+        # _cache_sample(text, mdl2k)
+    else:
+        # use multithreading
+        thread1 = threading.Thread(target=predict_and_update_result, args=(text, mdl1k, results, request))
+        thread2 = threading.Thread(target=predict_and_update_result, args=(text, mdl2k, results, request))
+
+        thread1.start()
+        # wait 3 seconds to calm hf.space domain
+        time.sleep(3)
+        thread2.start()
+        # timeout in 2 minutes
+        thread1.join(120)
+        thread2.join(120)
+
+    # cache the result
+    # for model in [mdl1k, mdl2k]:
+    #     _cache_sample(text, model)
+
    print(f"Retrieving models {mdl1k} and {mdl2k} from API")
    return (
        text,
@@ -194,7 +333,7 @@ def synthandreturn_battle(text, mdl1, mdl2):
    print(f"Sending models {mdl1k} and {mdl2k} to API")
    thread1 = threading.Thread(target=predict_and_update_result, args=(text, mdl1k, results))
    thread2 = threading.Thread(target=predict_and_update_result, args=(text, mdl2k, results))
-
+
    thread1.start()
    thread2.start()
    thread1.join()
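
One detail worth noting in the diff above: for ZeroGPU Spaces the caller forwards the visitor's x-ip-token request header, so GPU quota is charged to the voter rather than to the arena, and two ZeroGPU Spaces are run sequentially instead of threaded. Condensed to a sketch (the helper name is illustrative; the HF_SPACES import path assumes this repo's layout):

    import os
    from gradio_client import Client

    from app.models import HF_SPACES  # assumed import path

    def client_for(model: str, request) -> Client:
        # Forward the visitor's IP token only for ZeroGPU Spaces,
        # mirroring predict_and_update_result above
        headers = {}
        if HF_SPACES.get(model, {}).get('is_zero_gpu_space'):
            headers = {"X-IP-Token": request.headers['x-ip-token']}
        return Client(model, hf_token=os.getenv('HF_TOKEN'), headers=headers)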