Gradio Client TTS support
- app/config.py +2 -1
- app/models.py +333 -12
- app/synth.py +181 -42
app/config.py
CHANGED
@@ -15,7 +15,8 @@ MIN_SAMPLE_TXT_LENGTH = 10 # Minimum text length (characters)
 
 DB_PATH = f"/data/{DB_NAME}" if os.path.isdir("/data") else DB_NAME # If /data available => means local storage is enabled => let's use it!
 
-ROUTER_ID = "TTS-AGI/tts-router" # You should use a router space to route TTS models to avoid exposing your API keys!
+ROUTER_ID = "Pendrokar/xVASynth-TTS" # You should use a router space to route TTS models to avoid exposing your API keys!
+# ROUTER_ID = "TTS-AGI/tts-router" # You should use a router space to route TTS models to avoid exposing your API keys!
 
 SYNC_DB = True # Sync DB to HF dataset?
 DB_DATASET_ID = os.getenv('DATASET_ID') # HF dataset ID, can be None if not syncing
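Note: the router pattern referenced above keeps vendor API keys on the router Space rather than in this app; app/synth.py only calls its /synthesize endpoint. A minimal sketch of that call, assuming just what this commit shows (the Client construction itself is not part of the commit and is illustrative):

    from gradio_client import Client

    router = Client(ROUTER_ID)  # assumed construction; happens elsewhere in the app
    audio_path = router.predict("Hello there!", "xtts", api_name="/synthesize")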
app/models.py
CHANGED
@@ -1,21 +1,342 @@
+from gradio_client import handle_file
+
 # Models to include in the leaderboard, only include models that users can vote on
 AVAILABLE_MODELS = {
-    'XTTSv2': 'xtts',
+    # 'XTTSv2': 'xtts',
     # 'WhisperSpeech': 'whisperspeech',
-    'ElevenLabs': 'eleven',
+    # 'ElevenLabs': 'eleven',
     # 'OpenVoice': 'openvoice',
-    'OpenVoice V2': 'openvoicev2',
-    'Play.HT 2.0': 'playht',
-    'Play.HT 3.0 Mini': 'playht3',
+    # 'OpenVoice V2': 'openvoicev2',
+    # 'Play.HT 2.0': 'playht',
+    # 'Play.HT 3.0 Mini': 'playht3',
     # 'MetaVoice': 'metavoice',
-    'MeloTTS': 'melo',
-    'StyleTTS 2': 'styletts2',
-    'GPT-SoVITS': 'sovits',
+    # 'MeloTTS': 'melo',
+    # 'StyleTTS 2': 'styletts2',
+    # 'GPT-SoVITS': 'sovits',
     # 'Vokan TTS': 'vokan',
-    'VoiceCraft 2.0': 'voicecraft',
-    'Parler TTS': 'parler',
-    'Parler TTS Large': 'parlerlarge',
-    'Fish Speech v1.4': 'fish',
+    # 'VoiceCraft 2.0': 'voicecraft',
+    # 'Parler TTS': 'parler',
+    # 'Parler TTS Large': 'parlerlarge',
+    # 'Fish Speech v1.4': 'fish',
+
+    # HF Gradio Spaces: # <works with gradio version #>
+    # gradio version that works with most spaces: 4.29
+    # 'coqui/xtts': 'coqui/xtts', # 4.29 4.32
+    # 'collabora/WhisperSpeech': 'collabora/WhisperSpeech', # 4.32 4.36.1
+    # 'myshell-ai/OpenVoice': 'myshell-ai/OpenVoice', # same devs as MeloTTS, which scores higher # 4.29
+    # 'myshell-ai/OpenVoiceV2': 'myshell-ai/OpenVoiceV2', # same devs as MeloTTS, which scores higher # 4.29
+    # 'mrfakename/MetaVoice-1B-v0.1': 'mrfakename/MetaVoice-1B-v0.1', # 4.29 4.32
+    'Pendrokar/xVASynth-TTS': 'Pendrokar/xVASynth-TTS', # 4.29 4.32 4.42.0
+    # 'coqui/CoquiTTS': 'coqui/CoquiTTS',
+    # 'mrfakename/MeloTTS': 'mrfakename/MeloTTS', # 4.29 4.32
+    # 'fishaudio/fish-speech-1': 'fishaudio/fish-speech-1', # 4.29 4.32 4.36.1
+
+    # E2 & F5 TTS
+    # F5 model
+    # 'mrfakename/E2-F5-TTS': 'mrfakename/E2-F5-TTS', # 5.0
+
+    # # Parler
+    # Parler Large model
+    # 'parler-tts/parler_tts': 'parler-tts/parler_tts', # 4.29 4.32 4.36.1 4.42.0
+    # Parler Mini model
+    # 'parler-tts/parler_tts': 'parler-tts/parler_tts', # 4.29 4.32 4.36.1 4.42.0
+    # 'parler-tts/parler_tts_mini': 'parler-tts/parler_tts_mini', # Mini is the default model of parler_tts
+    # 'parler-tts/parler-tts-expresso': 'parler-tts/parler-tts-expresso', # 4.29 4.32 4.36.1 4.42.0
+
+    # # Microsoft Edge TTS
+    # 'innoai/Edge-TTS-Text-to-Speech': 'innoai/Edge-TTS-Text-to-Speech', # 4.29
+
+    # IMS-Toucan
+    # 'Flux9665/MassivelyMultilingualTTS': 'Flux9665/MassivelyMultilingualTTS', # 5.1
+
+    # HF TTS w issues
+    'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS', # unresponsive to exclamation marks # 4.29
+    # 'PolyAI/pheme': '/predict#0', # sleepy HF Space
+    # 'amphion/Text-to-Speech': '/predict#0', # disabled also on original HF space due to poor ratings
+    # 'suno/bark': '3#0', # Hallucinates
+    # 'shivammehta25/Matcha-TTS': '5#0', # seems to require multiple requests for setup
+    # 'styletts2/styletts2': '0#0', # API disabled, awaiting approval of PR #15
+    # 'Manmay/tortoise-tts': '/predict#0', # Cannot retrieve streamed file; 403
+    # 'pytorch/Tacotron2': '0#0', # old gradio
+}
+
+HF_SPACES = {
+    # XTTS v2
+    'coqui/xtts': {
+        'name': 'XTTS v2',
+        'function': '1',
+        'text_param_index': 0,
+        'return_audio_index': 1,
+        'series': 'XTTS',
+    },
+    # WhisperSpeech
+    'collabora/WhisperSpeech': {
+        'name': 'WhisperSpeech',
+        'function': '/whisper_speech_demo',
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'series': 'WhisperSpeech',
+    },
+    # OpenVoice (MyShell.ai)
+    'myshell-ai/OpenVoice': {
+        'name': 'OpenVoice',
+        'function': '1',
+        'text_param_index': 0,
+        'return_audio_index': 1,
+        'series': 'OpenVoice',
+    },
+    # OpenVoice v2 (MyShell.ai)
+    'myshell-ai/OpenVoiceV2': {
+        'name': 'OpenVoice v2',
+        'function': '1',
+        'text_param_index': 0,
+        'return_audio_index': 1,
+        'series': 'OpenVoice',
+    },
+    # MetaVoice
+    'mrfakename/MetaVoice-1B-v0.1': {
+        'name': 'MetaVoice-1B',
+        'function': '/tts',
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'series': 'MetaVoice-1B',
+    },
+    # xVASynth (CPU)
+    'Pendrokar/xVASynth-TTS': {
+        'name': 'xVASynth v3',
+        'function': '/predict',
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'series': 'xVASynth',
+    },
+    # CoquiTTS (CPU)
+    'coqui/CoquiTTS': {
+        'name': 'CoquiTTS',
+        'function': '0',
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'series': 'CoquiTTS',
+    },
+    # HierSpeech_TTS
+    'LeeSangHoon/HierSpeech_TTS': {
+        'name': 'HierSpeech++',
+        'function': '/predict',
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'series': 'HierSpeech++',
+    },
+    # MeloTTS (MyShell.ai)
+    'mrfakename/MeloTTS': {
+        'name': 'MeloTTS',
+        'function': '/synthesize',
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'series': 'MeloTTS',
+    },
+
+    # Parler
+    'parler-tts/parler_tts': {
+        'name': 'Parler Mini',
+        'function': '/gen_tts',
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'is_zero_gpu_space': True,
+        'series': 'Parler',
+    },
+    # Parler Large
+    # 'parler-tts/parler_tts': {
+    #     'name': 'Parler Large',
+    #     'function': '/gen_tts',
+    #     'text_param_index': 0,
+    #     'return_audio_index': 0,
+    #     'is_zero_gpu_space': True,
+    #     'series': 'Parler',
+    # },
+    # Parler Mini using the Expresso dataset
+    'parler-tts/parler-tts-expresso': {
+        'name': 'Parler Mini Expresso',
+        'function': '/gen_tts',
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'is_zero_gpu_space': True,
+        'series': 'Parler',
+    },
+
+    # Microsoft Edge TTS
+    'innoai/Edge-TTS-Text-to-Speech': {
+        'name': 'Edge TTS',
+        'function': '/predict',
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'is_proprietary': True,
+        'series': 'Edge TTS',
+    },
+
+    # Fish Speech
+    'fishaudio/fish-speech-1': {
+        'name': 'Fish Speech',
+        'function': '/inference_wrapper',
+        'text_param_index': 0,
+        'return_audio_index': 1,
+        'series': 'Fish Speech',
+    },
+
+    # E2/F5 TTS
+    'mrfakename/E2-F5-TTS': {
+        'name': 'F5 of E2 TTS',
+        'function': '/infer',
+        'text_param_index': 2,
+        'return_audio_index': 0,
+        'is_zero_gpu_space': True,
+        'series': 'E2/F5 TTS',
+    },
+
+    # IMS-Toucan
+    'Flux9665/MassivelyMultilingualTTS': {
+        'name': 'IMS-Toucan',
+        'function': "/predict",
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'series': 'IMS-Toucan',
+    },
+
+    # IMS-Toucan English non-artificial
+    'Flux9665/EnglishToucan': {
+        'name': 'IMS-Toucan EN',
+        'function': "/predict",
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'series': 'IMS-Toucan',
+    },
+
+    # StyleTTS v2
+    'Pendrokar/style-tts-2': {
+        'name': 'StyleTTS v2',
+        'function': '/synthesize',
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'is_zero_gpu_space': True,
+        'series': 'StyleTTS',
+    },
+}
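Each HF_SPACES entry is the recipe for driving a Space generically: 'function' is either a named endpoint (leading '/') or a numeric fn_index given as a string, 'text_param_index' is the positional argument that receives the prompt, and 'return_audio_index' selects the audio item from the return value. A condensed sketch of the lookup that app/synth.py (below) performs with this metadata:

    from gradio_client import Client

    space = 'Pendrokar/xVASynth-TTS'
    meta = HF_SPACES[space]
    client = Client(space)
    endpoints = client.view_api(all_endpoints=True, print_info=False, return_format='dict')
    # named endpoint if 'function' starts with '/', else an unnamed fn_index
    if meta['function'].startswith('/'):
        params = endpoints['named_endpoints'][meta['function']]['parameters']
    else:
        params = endpoints['unnamed_endpoints'][meta['function']]['parameters']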
+
+# for zero-shot TTS - voice sample used by XTTS (11 seconds)
+DEFAULT_VOICE_SAMPLE_STR = 'https://cdn-uploads.huggingface.co/production/uploads/63d52e0c4e5642795617f668/V6-rMmI-P59DA4leWDIcK.wav'
+DEFAULT_VOICE_SAMPLE = handle_file(DEFAULT_VOICE_SAMPLE_STR)
+DEFAULT_VOICE_TRANSCRIPT = "The Hispaniola was rolling scuppers under in the ocean swell. The booms were tearing at the blocks, the rudder was banging to and fro, and the whole ship creaking, groaning, and jumping like a manufactory."
+
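handle_file (imported at the top of this file) wraps a URL or local path so the Gradio client transfers it as a file input rather than a plain string, which is what Audio components expect, e.g.:

    ref = handle_file('https://huggingface.co/spaces/LeeSangHoon/HierSpeech_TTS/resolve/main/example/female.wav')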
+OVERRIDE_INPUTS = {
+    'coqui/xtts': {
+        1: 'en',
+        2: DEFAULT_VOICE_SAMPLE_STR, # voice sample
+        3: None, # mic voice sample
+        4: False, #use_mic
+        5: False, #cleanup_reference
+        6: False, #auto_detect
+    },
+    'collabora/WhisperSpeech': {
+        1: DEFAULT_VOICE_SAMPLE, # voice sample
+        2: DEFAULT_VOICE_SAMPLE, # voice sample URL
+        3: 14.0, #Tempo - Gradio Slider issue: takes min. rather than value
+    },
+    'myshell-ai/OpenVoice': {
+        1: 'default', # style
+        2: 'https://huggingface.co/spaces/myshell-ai/OpenVoiceV2/resolve/main/examples/speaker0.mp3', # voice sample
+    },
+    'myshell-ai/OpenVoiceV2': {
+        1: 'en_us', # style
+        2: 'https://huggingface.co/spaces/myshell-ai/OpenVoiceV2/resolve/main/examples/speaker0.mp3', # voice sample
+    },
+    'PolyAI/pheme': {
+        1: 'YOU1000000044_S0000798', # voice
+        2: 210,
+        3: 0.7, #Tempo - Gradio Slider issue: takes min. rather than value
+    },
+    'Pendrokar/xVASynth-TTS': {
+        1: 'x_ex04', #fine-tuned voice model name
+        3: 1.0, #pacing/duration - Gradio Slider issue: takes min. rather than value
+    },
+    'suno/bark': {
+        1: 'Speaker 3 (en)', # voice
+    },
+    'amphion/Text-to-Speech': {
+        1: 'LikeManyWaters', # voice
+    },
+    'LeeSangHoon/HierSpeech_TTS': {
+        1: handle_file('https://huggingface.co/spaces/LeeSangHoon/HierSpeech_TTS/resolve/main/example/female.wav'), # voice sample
+        2: 0.333,
+        3: 0.333,
+        4: 1,
+        5: 1,
+        6: 0,
+        7: 1111,
+    },
+    'Manmay/tortoise-tts': {
+        1: None, # text-from-file
+        2: 'angie', # voice
+        3: 'disabled', # second voice for a dialogue
+        4: 'No', # split by newline
+    },
+    'mrfakename/MeloTTS': {
+        1: 'EN-Default', # speaker; DEFAULT_VOICE_SAMPLE=EN-Default
+        2: 1, # speed
+        3: 'EN', # language
+    },
+    'mrfakename/MetaVoice-1B-v0.1': {
+        1: 5, # float (numeric value between 0.0 and 10.0) in 'Speech Stability - improves text following for a challenging speaker' Slider component
+        2: 5, # float (numeric value between 1.0 and 5.0) in 'Speaker similarity - How closely to match speaker identity and speech style.' Slider component
+        3: "Preset voices", # Literal['Preset voices', 'Upload target voice'] in 'Choose voice' Radio component
+        4: "Bria", # Literal['Bria', 'Alex', 'Jacob'] in 'Preset voices' Dropdown component
+        5: None, # filepath in 'Upload a clean sample to clone. Sample should contain 1 speaker, be between 30-90 seconds and not contain background noise.' Audio component
+    },
+    'parler-tts/parler_tts': {
+        1: 'Laura; Laura\'s female voice; very clear audio', # description/prompt
+    },
+    'parler-tts/parler-tts-expresso': {
+        1: 'Elisabeth; Elisabeth\'s female voice; very clear audio', # description/prompt
+    },
+    'innoai/Edge-TTS-Text-to-Speech': {
+        1: 'en-US-EmmaMultilingualNeural - en-US (Female)', # voice
+        2: 0, # pace rate
+        3: 0, # pitch
+    },
+
+    'fishaudio/fish-speech-1': {
+        1: True, # enable_reference_audio
+        2: handle_file('https://huggingface.co/spaces/fishaudio/fish-speech-1/resolve/main/examples/English.wav'), # reference_audio
+        3: 'In the ancient land of Eldoria, where the skies were painted with shades of mystic hues and the forests whispered secrets of old, there existed a dragon named Zephyros. Unlike the fearsome tales of dragons that plagued human hearts with terror, Zephyros was a creature of wonder and wisdom, revered by all who knew of his existence.', # reference_text
+        4: 0, # max_new_tokens
+        5: 200, # chunk_length
+        6: 0.7, # top_p
+        7: 1.2, # repetition_penalty
+        8: 0.7, # temperature
+        9: 1, # batch_infer_num
+        10: False, # if_load_asr_model
+    },
+
+    'mrfakename/E2-F5-TTS': {
+        0: DEFAULT_VOICE_SAMPLE, # voice sample
+        1: DEFAULT_VOICE_TRANSCRIPT, # transcript of sample (< 15 seconds required)
+        3: "F5-TTS", # model
+        4: False, # cleanup silence
+    },
+
+    # IMS-Toucan
+    'Flux9665/MassivelyMultilingualTTS': {
+        1: "English (eng)", #language
+        2: 0.6, #prosody_creativity
+        3: 1, #duration_scaling_factor
+        4: 41, #voice_seed
+        5: -7.5, #emb1
+        6: None, #reference_audio
+    },
+
+    # StyleTTS 2
+    'Pendrokar/style-tts-2': {
+        1: "f-us-2", #voice
+        2: 'en-us', # lang
+        3: 8, # lngsteps
+    },
 }
 
 
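The integer keys in OVERRIDE_INPUTS are positional indexes into an endpoint's parameter list; _override_params in app/synth.py (below) writes these values over the example inputs reported by view_api(). A sketch of that merge with a hypothetical defaults list:

    defaults = ['example text', 'example style', None]  # example_input values from view_api()
    for index, value in OVERRIDE_INPUTS['myshell-ai/OpenVoice'].items():
        defaults[index] = value  # 1 -> 'default' style, 2 -> voice sample URL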
app/synth.py
CHANGED
@@ -1,3 +1,4 @@
+import time
 from .models import *
 from .utils import *
 from .config import *
@@ -9,6 +10,9 @@ import random, os, threading, tempfile
 from langdetect import detect
 from .vote import log_text
 
+top_five = []
+hf_token=os.getenv('HF_TOKEN')
+
 def random_m():
     return random.sample(list(set(AVAILABLE_MODELS.keys())), 2)
 
@@ -17,7 +21,7 @@ def check_toxicity(text):
         return False
     return toxicity.predict(text)['toxicity'] > 0.8
 
-def synthandreturn(text):
+def synthandreturn(text, request: gr.Request):
     text = text.strip()
     if len(text) > MAX_SAMPLE_TXT_LENGTH:
         raise gr.Error(f'You exceeded the limit of {MAX_SAMPLE_TXT_LENGTH} characters')
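Gradio injects the request: gr.Request argument automatically when a handler declares it, so only the text component needs to be wired as an input. A self-contained illustration of the mechanism (not this app's actual UI):

    import gradio as gr

    def handler(text, request: gr.Request):
        # headers carry the caller's x-ip-token, used for ZeroGPU further below
        return f"{text} / token present: {'x-ip-token' in request.headers}"

    with gr.Blocks() as demo:
        box = gr.Textbox()
        out = gr.Textbox()
        box.submit(handler, inputs=box, outputs=out)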
@@ -34,51 +38,171 @@ def synthandreturn(text):
         raise gr.Error(f'You did not enter any text')
     # Check language
     try:
-        if not detect(text) == "en":
+        if (
+            text not in sents
+            and not detect(text) == "en"
+        ):
             gr.Warning('Warning: The input text may not be in English')
     except:
         pass
     # Get two random models
-
-
+
+    # forced model: your TTS model versus The World!!!
+    # mdl1 = 'Pendrokar/xVASynth'
+
+    # scrutinize the top five by always picking one of them
+    if (len(top_five) >= 5):
+        mdl1 = random.sample(top_five, 1)[0]
+        vsModels = dict(AVAILABLE_MODELS)
+        del vsModels[mdl1]
+        # randomize position of the forced model
+        mdl2 = random.sample(list(vsModels.keys()), 1)
+        # forced random
+        mdl1, mdl2 = random.sample(list([mdl1, mdl2[0]]), 2)
+    else:
+        # actual random
+        mdl1, mdl2 = random.sample(list(AVAILABLE_MODELS.keys()), 2)
+
     print("[debug] Using", mdl1, mdl2)
-    def predict_and_update_result(text, model, result_storage):
+    def predict_and_update_result(text, model, result_storage, request: gr.Request):
+
+        hf_headers = {}
         try:
-            if model in AVAILABLE_MODELS:
-                result = router.predict(text, AVAILABLE_MODELS[model].lower(), api_name="/synthesize")
-            else:
-                result = router.predict(text, model.lower(), api_name="/synthesize")
+            if HF_SPACES[model]['is_zero_gpu_space']:
+                hf_headers = {"X-IP-Token": request.headers['x-ip-token']}
         except:
-
-
-            #
-
-            #
-
+            pass
+
+        # re-attempt if necessary
+        attempt_count = 0
+        max_attempts = 1 # 3 may cause 429 Too Many Requests
+        while attempt_count < max_attempts:
+            try:
+                if model in AVAILABLE_MODELS:
+                    if '/' in model:
+                        # Use public HF Space
+                        # if (model not in hf_clients):
+                        #     hf_clients[model] = Client(model, hf_token=hf_token, headers=hf_headers)
+                        mdl_space = Client(model, hf_token=hf_token, headers=hf_headers)
+
+                        # print(f"{model}: Fetching endpoints of HF Space")
+                        # assume the index is one of the first 9 return params
+                        return_audio_index = int(HF_SPACES[model]['return_audio_index'])
+                        endpoints = mdl_space.view_api(all_endpoints=True, print_info=False, return_format='dict')
+
+                        api_name = None
+                        fn_index = None
+                        end_parameters = None
+                        # has named endpoint
+                        if '/' == HF_SPACES[model]['function'][0]:
+                            # audio sync function name
+                            api_name = HF_SPACES[model]['function']
+
+                            end_parameters = _get_param_examples(
+                                endpoints['named_endpoints'][api_name]['parameters']
+                            )
+                        # has unnamed endpoint
+                        else:
+                            # endpoint index is the first character
+                            fn_index = int(HF_SPACES[model]['function'])
+
+                            end_parameters = _get_param_examples(
+                                endpoints['unnamed_endpoints'][str(fn_index)]['parameters']
+                            )
+
+                        # override some or all default parameters
+                        space_inputs = _override_params(end_parameters, model)
+
+                        # force text
+                        space_inputs[HF_SPACES[model]['text_param_index']] = text
+
+                        print(f"{model}: Sending request to HF Space")
+                        results = mdl_space.predict(*space_inputs, api_name=api_name, fn_index=fn_index)
+
+                        # return path to audio
+                        result = results
+                        if (not isinstance(results, str)):
+                            # return_audio_index may be a filepath string
+                            result = results[return_audio_index]
+                            if (isinstance(result, dict)):
+                                # return_audio_index is a dictionary
+                                result = results[return_audio_index]['value']
+                    else:
+                        # Use the private HF Space
+                        result = router.predict(text, AVAILABLE_MODELS[model].lower(), api_name="/synthesize")
+                else:
+                    result = router.predict(text, model.lower(), api_name="/synthesize")
+                break
+            except Exception as e:
+                attempt_count += 1
+                raise gr.Error(f"{model}: " + repr(e))
+                # print(f"{model}: Unable to call API (attempt: {attempt_count})")
+                # sleep for three seconds to avoid spamming the server with requests
+                # time.sleep(3)
+
+                # Fetch and store client again
+                # hf_clients[model] = Client(model, hf_token=hf_token, headers=hf_headers)
+
+        if attempt_count >= max_attempts:
+            raise gr.Error(f"{model}: Failed to call model")
+        else:
+            print('Done with', model)
+
         try:
             with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
                 audio = AudioSegment.from_file(result)
                 current_sr = audio.frame_rate
                 if current_sr > 24000:
+                    print(f"{model}: Resampling")
                     audio = audio.set_frame_rate(24000)
                 try:
-                    print('Trying to normalize audio')
+                    print(f"{model}: Trying to normalize audio")
                     audio = match_target_amplitude(audio, -20)
                 except:
-                    print('[WARN] Unable to normalize audio')
+                    print(f"{model}: [WARN] Unable to normalize audio")
                 audio.export(f.name, format="wav")
                 os.unlink(result)
                 result = f.name
+            gr.Info('Audio from a TTS model received')
         except:
+            print(f"{model}: [WARN] Unable to resample audio")
             pass
         if model in AVAILABLE_MODELS.keys(): model = AVAILABLE_MODELS[model]
-        print(model)
-        print(f"Running model {model}")
         result_storage[model] = result
-
-
-
-
+
+    def _get_param_examples(parameters):
+        example_inputs = []
+        for param_info in parameters:
+            if (
+                param_info['component'] == 'Radio'
+                or param_info['component'] == 'Dropdown'
+                or param_info['component'] == 'Audio'
+                or param_info['python_type']['type'] == 'str'
+            ):
+                example_inputs.append(str(param_info['example_input']))
+                continue
+            if param_info['python_type']['type'] == 'int':
+                example_inputs.append(int(param_info['example_input']))
+                continue
+            if param_info['python_type']['type'] == 'float':
+                example_inputs.append(float(param_info['example_input']))
+                continue
+            if param_info['python_type']['type'] == 'bool':
+                example_inputs.append(bool(param_info['example_input']))
+                continue
+
+        return example_inputs
+
+    def _override_params(inputs, modelname):
+        try:
+            for key,value in OVERRIDE_INPUTS[modelname].items():
+                inputs[key] = value
+            print(f"{modelname}: Default inputs overridden by Arena")
+        except:
+            pass
+
+        return inputs
+
     mdl1k = mdl1
     mdl2k = mdl2
     print(mdl1k, mdl2k)
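The hf_headers logic in this hunk forwards the visitor's x-ip-token header to the target Space, so that ZeroGPU quota is attributed to the visitor rather than to this Arena Space. The same idea in isolation (the .get() guard is an addition; the commit itself uses try/except):

    from gradio_client import Client

    def client_for(space, request):
        token = request.headers.get('x-ip-token')  # set when served via hf.space
        headers = {'X-IP-Token': token} if token else {}
        return Client(space, hf_token=hf_token, headers=headers)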
@@ -86,24 +210,39 @@ def synthandreturn(text):
     if mdl2 in AVAILABLE_MODELS.keys(): mdl2k=AVAILABLE_MODELS[mdl2]
     results = {}
     print(f"Sending models {mdl1k} and {mdl2k} to API")
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    # do not use multithreading when both spaces are ZeroGPU type
+    if (
+        # exists
+        'is_zero_gpu_space' in HF_SPACES[mdl1]
+        # is True
+        and HF_SPACES[mdl1]['is_zero_gpu_space']
+        and 'is_zero_gpu_space' in HF_SPACES[mdl2]
+        and HF_SPACES[mdl2]['is_zero_gpu_space']
+    ):
+        # run Zero-GPU spaces one at a time
+        predict_and_update_result(text, mdl1k, results, request)
+        # _cache_sample(text, mdl1k)
+
+        predict_and_update_result(text, mdl2k, results, request)
+        # _cache_sample(text, mdl2k)
+    else:
+        # use multithreading
+        thread1 = threading.Thread(target=predict_and_update_result, args=(text, mdl1k, results, request))
+        thread2 = threading.Thread(target=predict_and_update_result, args=(text, mdl2k, results, request))
+
+        thread1.start()
+        # wait 3 seconds to calm hf.space domain
+        time.sleep(3)
+        thread2.start()
+        # timeout in 2 minutes
+        thread1.join(120)
+        thread2.join(120)
+
+    # cache the result
+    # for model in [mdl1k, mdl2k]:
+    #     _cache_sample(text, model)
+
     print(f"Retrieving models {mdl1k} and {mdl2k} from API")
     return (
         text,
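The _get_param_examples helper added earlier coerces the example_input that view_api() reports for each parameter to its declared python_type, so the Space receives a fully populated argument list, which _override_params then patches. For instance, a parameters entry shaped like this (values hypothetical):

    param_info = {
        'component': 'Slider',
        'python_type': {'type': 'float'},
        'example_input': '0.7',
    }
    # -> float('0.7') is appended to example_inputs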
@@ -194,7 +333,7 @@ def synthandreturn_battle(text, mdl1, mdl2):
     print(f"Sending models {mdl1k} and {mdl2k} to API")
     thread1 = threading.Thread(target=predict_and_update_result, args=(text, mdl1k, results))
     thread2 = threading.Thread(target=predict_and_update_result, args=(text, mdl2k, results))
-
+
     thread1.start()
     thread2.start()
     thread1.join()
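Running the two requests sequentially when both Spaces are ZeroGPU (the branch added above) presumably avoids spending the visitor's quota token from two threads at once. The predicate, restated with dict.get for clarity (not in the commit):

    def _both_zero_gpu(a, b):
        return all(HF_SPACES.get(m, {}).get('is_zero_gpu_space', False) for m in (a, b))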