Gradio Client TTS support
- app/config.py +2 -1
- app/models.py +333 -12
- app/synth.py +181 -42
app/config.py
CHANGED
@@ -15,7 +15,8 @@ MIN_SAMPLE_TXT_LENGTH = 10 # Minimum text length (characters)
 
 DB_PATH = f"/data/{DB_NAME}" if os.path.isdir("/data") else DB_NAME # If /data available => means local storage is enabled => let's use it!
 
-ROUTER_ID = "TTS-AGI/tts-router" # You should use a router space to route TTS models to avoid exposing your API keys!
+ROUTER_ID = "Pendrokar/xVASynth-TTS" # You should use a router space to route TTS models to avoid exposing your API keys!
+# ROUTER_ID = "TTS-AGI/tts-router" # You should use a router space to route TTS models to avoid exposing your API keys!
 
 SYNC_DB = True # Sync DB to HF dataset?
 DB_DATASET_ID = os.getenv('DATASET_ID') # HF dataset ID, can be None if not syncing
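Note: the router pattern referenced above keeps vendor API keys on the router Space rather than in this app; app/synth.py only calls its /synthesize endpoint. A minimal sketch of that call, assuming just what this commit shows (the Client construction itself is not part of the commit and is illustrative):

    from gradio_client import Client

    router = Client(ROUTER_ID)  # assumed construction; happens elsewhere in the app
    audio_path = router.predict("Hello there!", "xtts", api_name="/synthesize")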
app/models.py
CHANGED
@@ -1,21 +1,342 @@
+from gradio_client import handle_file
+
 # Models to include in the leaderboard, only include models that users can vote on
 AVAILABLE_MODELS = {
-    'XTTSv2': 'xtts',
+    # 'XTTSv2': 'xtts',
     # 'WhisperSpeech': 'whisperspeech',
-    'ElevenLabs': 'eleven',
+    # 'ElevenLabs': 'eleven',
     # 'OpenVoice': 'openvoice',
-    'OpenVoice V2': 'openvoicev2',
-    'Play.HT 2.0': 'playht',
-    'Play.HT 3.0 Mini': 'playht3',
+    # 'OpenVoice V2': 'openvoicev2',
+    # 'Play.HT 2.0': 'playht',
+    # 'Play.HT 3.0 Mini': 'playht3',
     # 'MetaVoice': 'metavoice',
-    'MeloTTS': 'melo',
-    'StyleTTS 2': 'styletts2',
-    'GPT-SoVITS': 'sovits',
+    # 'MeloTTS': 'melo',
+    # 'StyleTTS 2': 'styletts2',
+    # 'GPT-SoVITS': 'sovits',
     # 'Vokan TTS': 'vokan',
-    'VoiceCraft 2.0': 'voicecraft',
-    'Parler TTS': 'parler',
-    'Parler TTS Large': 'parlerlarge',
-    'Fish Speech v1.4': 'fish',
+    # 'VoiceCraft 2.0': 'voicecraft',
+    # 'Parler TTS': 'parler',
+    # 'Parler TTS Large': 'parlerlarge',
+    # 'Fish Speech v1.4': 'fish',
+
+    # HF Gradio Spaces: # <works with gradio version #>
+    # gradio version that works with most spaces: 4.29
+    # 'coqui/xtts': 'coqui/xtts', # 4.29 4.32
+    # 'collabora/WhisperSpeech': 'collabora/WhisperSpeech', # 4.32 4.36.1
+    # 'myshell-ai/OpenVoice': 'myshell-ai/OpenVoice', # same devs as MeloTTS, which scores higher # 4.29
+    # 'myshell-ai/OpenVoiceV2': 'myshell-ai/OpenVoiceV2', # same devs as MeloTTS, which scores higher # 4.29
+    # 'mrfakename/MetaVoice-1B-v0.1': 'mrfakename/MetaVoice-1B-v0.1', # 4.29 4.32
+    'Pendrokar/xVASynth-TTS': 'Pendrokar/xVASynth-TTS', # 4.29 4.32 4.42.0
+    # 'coqui/CoquiTTS': 'coqui/CoquiTTS',
+    # 'mrfakename/MeloTTS': 'mrfakename/MeloTTS', # 4.29 4.32
+    # 'fishaudio/fish-speech-1': 'fishaudio/fish-speech-1', # 4.29 4.32 4.36.1
+
+    # E2 & F5 TTS
+    # F5 model
+    # 'mrfakename/E2-F5-TTS': 'mrfakename/E2-F5-TTS', # 5.0
+
+    # # Parler
+    # Parler Large model
+    # 'parler-tts/parler_tts': 'parler-tts/parler_tts', # 4.29 4.32 4.36.1 4.42.0
+    # Parler Mini model
+    # 'parler-tts/parler_tts': 'parler-tts/parler_tts', # 4.29 4.32 4.36.1 4.42.0
+    # 'parler-tts/parler_tts_mini': 'parler-tts/parler_tts_mini', # Mini is the default model of parler_tts
+    # 'parler-tts/parler-tts-expresso': 'parler-tts/parler-tts-expresso', # 4.29 4.32 4.36.1 4.42.0
+
+    # # Microsoft Edge TTS
+    # 'innoai/Edge-TTS-Text-to-Speech': 'innoai/Edge-TTS-Text-to-Speech', # 4.29
+
+    # IMS-Toucan
+    # 'Flux9665/MassivelyMultilingualTTS': 'Flux9665/MassivelyMultilingualTTS', # 5.1
+
+    # HF TTS w issues
+    'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS', # unresponsive to exclamation marks # 4.29
+    # 'PolyAI/pheme': '/predict#0', # sleepy HF Space
+    # 'amphion/Text-to-Speech': '/predict#0', # disabled also on original HF space due to poor ratings
+    # 'suno/bark': '3#0', # Hallucinates
+    # 'shivammehta25/Matcha-TTS': '5#0', # seems to require multiple requests for setup
+    # 'styletts2/styletts2': '0#0', # API disabled, awaiting approval of PR #15
+    # 'Manmay/tortoise-tts': '/predict#0', # Cannot retrieve streamed file; 403
+    # 'pytorch/Tacotron2': '0#0', # old gradio
+}
+
+HF_SPACES = {
+    # XTTS v2
+    'coqui/xtts': {
+        'name': 'XTTS v2',
+        'function': '1',
+        'text_param_index': 0,
+        'return_audio_index': 1,
+        'series': 'XTTS',
+    },
+    # WhisperSpeech
+    'collabora/WhisperSpeech': {
+        'name': 'WhisperSpeech',
+        'function': '/whisper_speech_demo',
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'series': 'WhisperSpeech',
+    },
+    # OpenVoice (MyShell.ai)
+    'myshell-ai/OpenVoice': {
+        'name': 'OpenVoice',
+        'function': '1',
+        'text_param_index': 0,
+        'return_audio_index': 1,
+        'series': 'OpenVoice',
+    },
+    # OpenVoice v2 (MyShell.ai)
+    'myshell-ai/OpenVoiceV2': {
+        'name': 'OpenVoice v2',
+        'function': '1',
+        'text_param_index': 0,
+        'return_audio_index': 1,
+        'series': 'OpenVoice',
+    },
+    # MetaVoice
+    'mrfakename/MetaVoice-1B-v0.1': {
+        'name': 'MetaVoice-1B',
+        'function': '/tts',
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'series': 'MetaVoice-1B',
+    },
+    # xVASynth (CPU)
+    'Pendrokar/xVASynth-TTS': {
+        'name': 'xVASynth v3',
+        'function': '/predict',
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'series': 'xVASynth',
+    },
+    # CoquiTTS (CPU)
+    'coqui/CoquiTTS': {
+        'name': 'CoquiTTS',
+        'function': '0',
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'series': 'CoquiTTS',
+    },
+    # HierSpeech_TTS
+    'LeeSangHoon/HierSpeech_TTS': {
+        'name': 'HierSpeech++',
+        'function': '/predict',
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'series': 'HierSpeech++',
+    },
+    # MeloTTS (MyShell.ai)
+    'mrfakename/MeloTTS': {
+        'name': 'MeloTTS',
+        'function': '/synthesize',
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'series': 'MeloTTS',
+    },
+
+    # Parler
+    'parler-tts/parler_tts': {
+        'name': 'Parler Mini',
+        'function': '/gen_tts',
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'is_zero_gpu_space': True,
+        'series': 'Parler',
+    },
+    # Parler Large
+    # 'parler-tts/parler_tts': {
+    #     'name': 'Parler Large',
+    #     'function': '/gen_tts',
+    #     'text_param_index': 0,
+    #     'return_audio_index': 0,
+    #     'is_zero_gpu_space': True,
+    #     'series': 'Parler',
+    # },
+    # Parler Mini using the Expresso dataset
+    'parler-tts/parler-tts-expresso': {
+        'name': 'Parler Mini Expresso',
+        'function': '/gen_tts',
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'is_zero_gpu_space': True,
+        'series': 'Parler',
+    },
+
+    # Microsoft Edge TTS
+    'innoai/Edge-TTS-Text-to-Speech': {
+        'name': 'Edge TTS',
+        'function': '/predict',
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'is_proprietary': True,
+        'series': 'Edge TTS',
+    },
+
+    # Fish Speech
+    'fishaudio/fish-speech-1': {
+        'name': 'Fish Speech',
+        'function': '/inference_wrapper',
+        'text_param_index': 0,
+        'return_audio_index': 1,
+        'series': 'Fish Speech',
+    },
+
+    # E2/F5 TTS
+    'mrfakename/E2-F5-TTS': {
+        'name': 'F5 of E2 TTS',
+        'function': '/infer',
+        'text_param_index': 2,
+        'return_audio_index': 0,
+        'is_zero_gpu_space': True,
+        'series': 'E2/F5 TTS',
+    },
+
+    # IMS-Toucan
+    'Flux9665/MassivelyMultilingualTTS': {
+        'name': 'IMS-Toucan',
+        'function': "/predict",
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'series': 'IMS-Toucan',
+    },
+
+    # IMS-Toucan English non-artificial
+    'Flux9665/EnglishToucan': {
+        'name': 'IMS-Toucan EN',
+        'function': "/predict",
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'series': 'IMS-Toucan',
+    },
+
+    # StyleTTS v2
+    'Pendrokar/style-tts-2': {
+        'name': 'StyleTTS v2',
+        'function': '/synthesize',
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'is_zero_gpu_space': True,
+        'series': 'StyleTTS',
+    },
+}
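Each HF_SPACES entry is the recipe for driving a Space generically: 'function' is either a named endpoint (leading '/') or a numeric fn_index given as a string, 'text_param_index' is the positional argument that receives the prompt, and 'return_audio_index' selects the audio item from the return value. A condensed sketch of the lookup that app/synth.py (below) performs with this metadata:

    from gradio_client import Client

    space = 'Pendrokar/xVASynth-TTS'
    meta = HF_SPACES[space]
    client = Client(space)
    endpoints = client.view_api(all_endpoints=True, print_info=False, return_format='dict')
    # named endpoint if 'function' starts with '/', else an unnamed fn_index
    if meta['function'].startswith('/'):
        params = endpoints['named_endpoints'][meta['function']]['parameters']
    else:
        params = endpoints['unnamed_endpoints'][meta['function']]['parameters']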
+
+# for zero-shot TTS - voice sample used by XTTS (11 seconds)
+DEFAULT_VOICE_SAMPLE_STR = 'https://cdn-uploads.huggingface.co/production/uploads/63d52e0c4e5642795617f668/V6-rMmI-P59DA4leWDIcK.wav'
+DEFAULT_VOICE_SAMPLE = handle_file(DEFAULT_VOICE_SAMPLE_STR)
+DEFAULT_VOICE_TRANSCRIPT = "The Hispaniola was rolling scuppers under in the ocean swell. The booms were tearing at the blocks, the rudder was banging to and fro, and the whole ship creaking, groaning, and jumping like a manufactory."
+
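handle_file (imported at the top of this file) wraps a URL or local path so the Gradio client transfers it as a file input rather than a plain string, which is what Audio components expect, e.g.:

    ref = handle_file('https://huggingface.co/spaces/LeeSangHoon/HierSpeech_TTS/resolve/main/example/female.wav')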
+OVERRIDE_INPUTS = {
+    'coqui/xtts': {
+        1: 'en',
+        2: DEFAULT_VOICE_SAMPLE_STR, # voice sample
+        3: None, # mic voice sample
+        4: False, #use_mic
+        5: False, #cleanup_reference
+        6: False, #auto_detect
+    },
+    'collabora/WhisperSpeech': {
+        1: DEFAULT_VOICE_SAMPLE, # voice sample
+        2: DEFAULT_VOICE_SAMPLE, # voice sample URL
+        3: 14.0, #Tempo - Gradio Slider issue: takes min. rather than value
+    },
+    'myshell-ai/OpenVoice': {
+        1: 'default', # style
+        2: 'https://huggingface.co/spaces/myshell-ai/OpenVoiceV2/resolve/main/examples/speaker0.mp3', # voice sample
+    },
+    'myshell-ai/OpenVoiceV2': {
+        1: 'en_us', # style
+        2: 'https://huggingface.co/spaces/myshell-ai/OpenVoiceV2/resolve/main/examples/speaker0.mp3', # voice sample
+    },
+    'PolyAI/pheme': {
+        1: 'YOU1000000044_S0000798', # voice
+        2: 210,
+        3: 0.7, #Tempo - Gradio Slider issue: takes min. rather than value
+    },
+    'Pendrokar/xVASynth-TTS': {
+        1: 'x_ex04', #fine-tuned voice model name
+        3: 1.0, #pacing/duration - Gradio Slider issue: takes min. rather than value
+    },
+    'suno/bark': {
+        1: 'Speaker 3 (en)', # voice
+    },
+    'amphion/Text-to-Speech': {
+        1: 'LikeManyWaters', # voice
+    },
+    'LeeSangHoon/HierSpeech_TTS': {
+        1: handle_file('https://huggingface.co/spaces/LeeSangHoon/HierSpeech_TTS/resolve/main/example/female.wav'), # voice sample
+        2: 0.333,
+        3: 0.333,
+        4: 1,
+        5: 1,
+        6: 0,
+        7: 1111,
+    },
+    'Manmay/tortoise-tts': {
+        1: None, # text-from-file
+        2: 'angie', # voice
+        3: 'disabled', # second voice for a dialogue
+        4: 'No', # split by newline
+    },
+    'mrfakename/MeloTTS': {
+        1: 'EN-Default', # speaker; DEFAULT_VOICE_SAMPLE=EN-Default
+        2: 1, # speed
+        3: 'EN', # language
+    },
+    'mrfakename/MetaVoice-1B-v0.1': {
+        1: 5, # float (numeric value between 0.0 and 10.0) in 'Speech Stability - improves text following for a challenging speaker' Slider component
+        2: 5, # float (numeric value between 1.0 and 5.0) in 'Speaker similarity - How closely to match speaker identity and speech style.' Slider component
+        3: "Preset voices", # Literal['Preset voices', 'Upload target voice'] in 'Choose voice' Radio component
+        4: "Bria", # Literal['Bria', 'Alex', 'Jacob'] in 'Preset voices' Dropdown component
+        5: None, # filepath in 'Upload a clean sample to clone. Sample should contain 1 speaker, be between 30-90 seconds and not contain background noise.' Audio component
+    },
+    'parler-tts/parler_tts': {
+        1: 'Laura; Laura\'s female voice; very clear audio', # description/prompt
+    },
+    'parler-tts/parler-tts-expresso': {
+        1: 'Elisabeth; Elisabeth\'s female voice; very clear audio', # description/prompt
+    },
+    'innoai/Edge-TTS-Text-to-Speech': {
+        1: 'en-US-EmmaMultilingualNeural - en-US (Female)', # voice
+        2: 0, # pace rate
+        3: 0, # pitch
+    },
+
+    'fishaudio/fish-speech-1': {
+        1: True, # enable_reference_audio
+        2: handle_file('https://huggingface.co/spaces/fishaudio/fish-speech-1/resolve/main/examples/English.wav'), # reference_audio
+        3: 'In the ancient land of Eldoria, where the skies were painted with shades of mystic hues and the forests whispered secrets of old, there existed a dragon named Zephyros. Unlike the fearsome tales of dragons that plagued human hearts with terror, Zephyros was a creature of wonder and wisdom, revered by all who knew of his existence.', # reference_text
+        4: 0, # max_new_tokens
+        5: 200, # chunk_length
+        6: 0.7, # top_p
+        7: 1.2, # repetition_penalty
+        8: 0.7, # temperature
+        9: 1, # batch_infer_num
+        10: False, # if_load_asr_model
+    },
+
+    'mrfakename/E2-F5-TTS': {
+        0: DEFAULT_VOICE_SAMPLE, # voice sample
+        1: DEFAULT_VOICE_TRANSCRIPT, # transcript of sample (< 15 seconds required)
+        3: "F5-TTS", # model
+        4: False, # cleanup silence
+    },
+
+    # IMS-Toucan
+    'Flux9665/MassivelyMultilingualTTS': {
+        1: "English (eng)", #language
+        2: 0.6, #prosody_creativity
+        3: 1, #duration_scaling_factor
+        4: 41, #voice_seed
+        5: -7.5, #emb1
+        6: None, #reference_audio
+    },
+
+    # StyleTTS 2
+    'Pendrokar/style-tts-2': {
+        1: "f-us-2", #voice
+        2: 'en-us', # lang
+        3: 8, # lngsteps
+    },
 }
 
 
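The integer keys in OVERRIDE_INPUTS are positional indexes into an endpoint's parameter list; _override_params in app/synth.py (below) writes these values over the example inputs reported by view_api(). A sketch of that merge with a hypothetical defaults list:

    defaults = ['example text', 'example style', None]  # example_input values from view_api()
    for index, value in OVERRIDE_INPUTS['myshell-ai/OpenVoice'].items():
        defaults[index] = value  # 1 -> 'default' style, 2 -> voice sample URL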
app/synth.py
CHANGED
@@ -1,3 +1,4 @@
+import time
 from .models import *
 from .utils import *
 from .config import *
@@ -9,6 +10,9 @@ import random, os, threading, tempfile
 from langdetect import detect
 from .vote import log_text
 
+top_five = []
+hf_token=os.getenv('HF_TOKEN')
+
 def random_m():
     return random.sample(list(set(AVAILABLE_MODELS.keys())), 2)
 
@@ -17,7 +21,7 @@ def check_toxicity(text):
         return False
     return toxicity.predict(text)['toxicity'] > 0.8
 
-def synthandreturn(text):
+def synthandreturn(text, request: gr.Request):
     text = text.strip()
     if len(text) > MAX_SAMPLE_TXT_LENGTH:
         raise gr.Error(f'You exceeded the limit of {MAX_SAMPLE_TXT_LENGTH} characters')
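Gradio injects the request: gr.Request argument automatically when a handler declares it, so only the text component needs to be wired as an input. A self-contained illustration of the mechanism (not this app's actual UI):

    import gradio as gr

    def handler(text, request: gr.Request):
        # headers carry the caller's x-ip-token, used for ZeroGPU further below
        return f"{text} / token present: {'x-ip-token' in request.headers}"

    with gr.Blocks() as demo:
        box = gr.Textbox()
        out = gr.Textbox()
        box.submit(handler, inputs=box, outputs=out)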
@@ -34,51 +38,171 @@ def synthandreturn(text):
         raise gr.Error(f'You did not enter any text')
     # Check language
     try:
-        if not detect(text) == "en":
+        if (
+            text not in sents
+            and not detect(text) == "en"
+        ):
             gr.Warning('Warning: The input text may not be in English')
     except:
         pass
     # Get two random models
-
-
+
+    # forced model: your TTS model versus The World!!!
+    # mdl1 = 'Pendrokar/xVASynth'
+
+    # scrutinize the top five by always picking one of them
+    if (len(top_five) >= 5):
+        mdl1 = random.sample(top_five, 1)[0]
+        vsModels = dict(AVAILABLE_MODELS)
+        del vsModels[mdl1]
+        # randomize position of the forced model
+        mdl2 = random.sample(list(vsModels.keys()), 1)
+        # forced random
+        mdl1, mdl2 = random.sample(list([mdl1, mdl2[0]]), 2)
+    else:
+        # actual random
+        mdl1, mdl2 = random.sample(list(AVAILABLE_MODELS.keys()), 2)
+
     print("[debug] Using", mdl1, mdl2)
-    def predict_and_update_result(text, model, result_storage):
+    def predict_and_update_result(text, model, result_storage, request: gr.Request):
+
+        hf_headers = {}
         try:
-            if model in AVAILABLE_MODELS:
-                result = router.predict(text, AVAILABLE_MODELS[model].lower(), api_name="/synthesize")
-            else:
-                result = router.predict(text, model.lower(), api_name="/synthesize")
+            if HF_SPACES[model]['is_zero_gpu_space']:
+                hf_headers = {"X-IP-Token": request.headers['x-ip-token']}
         except:
-
-
-            #
-
-            #
-
+            pass
+
+        # re-attempt if necessary
+        attempt_count = 0
+        max_attempts = 1 # 3 may cause 429 Too Many Requests
+        while attempt_count < max_attempts:
+            try:
+                if model in AVAILABLE_MODELS:
+                    if '/' in model:
+                        # Use public HF Space
+                        # if (model not in hf_clients):
+                        #     hf_clients[model] = Client(model, hf_token=hf_token, headers=hf_headers)
+                        mdl_space = Client(model, hf_token=hf_token, headers=hf_headers)
+
+                        # print(f"{model}: Fetching endpoints of HF Space")
+                        # assume the index is one of the first 9 return params
+                        return_audio_index = int(HF_SPACES[model]['return_audio_index'])
+                        endpoints = mdl_space.view_api(all_endpoints=True, print_info=False, return_format='dict')
+
+                        api_name = None
+                        fn_index = None
+                        end_parameters = None
+                        # has named endpoint
+                        if '/' == HF_SPACES[model]['function'][0]:
+                            # audio sync function name
+                            api_name = HF_SPACES[model]['function']
+
+                            end_parameters = _get_param_examples(
+                                endpoints['named_endpoints'][api_name]['parameters']
+                            )
+                        # has unnamed endpoint
+                        else:
+                            # endpoint index is the first character
+                            fn_index = int(HF_SPACES[model]['function'])
+
+                            end_parameters = _get_param_examples(
+                                endpoints['unnamed_endpoints'][str(fn_index)]['parameters']
+                            )
+
+                        # override some or all default parameters
+                        space_inputs = _override_params(end_parameters, model)
+
+                        # force text
+                        space_inputs[HF_SPACES[model]['text_param_index']] = text
+
+                        print(f"{model}: Sending request to HF Space")
+                        results = mdl_space.predict(*space_inputs, api_name=api_name, fn_index=fn_index)
+
+                        # return path to audio
+                        result = results
+                        if (not isinstance(results, str)):
+                            # return_audio_index may be a filepath string
+                            result = results[return_audio_index]
+                            if (isinstance(result, dict)):
+                                # return_audio_index is a dictionary
+                                result = results[return_audio_index]['value']
+                    else:
+                        # Use the private HF Space
+                        result = router.predict(text, AVAILABLE_MODELS[model].lower(), api_name="/synthesize")
+                else:
+                    result = router.predict(text, model.lower(), api_name="/synthesize")
+                break
+            except Exception as e:
+                attempt_count += 1
+                raise gr.Error(f"{model}: " + repr(e))
+                # print(f"{model}: Unable to call API (attempt: {attempt_count})")
+                # sleep for three seconds to avoid spamming the server with requests
+                # time.sleep(3)
+
+                # Fetch and store client again
+                # hf_clients[model] = Client(model, hf_token=hf_token, headers=hf_headers)
+
+        if attempt_count >= max_attempts:
+            raise gr.Error(f"{model}: Failed to call model")
+        else:
+            print('Done with', model)
+
         try:
             with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
                 audio = AudioSegment.from_file(result)
                 current_sr = audio.frame_rate
                 if current_sr > 24000:
+                    print(f"{model}: Resampling")
                     audio = audio.set_frame_rate(24000)
                 try:
-                    print('Trying to normalize audio')
+                    print(f"{model}: Trying to normalize audio")
                     audio = match_target_amplitude(audio, -20)
                 except:
-                    print('[WARN] Unable to normalize audio')
+                    print(f"{model}: [WARN] Unable to normalize audio")
                 audio.export(f.name, format="wav")
                 os.unlink(result)
                 result = f.name
+            gr.Info('Audio from a TTS model received')
         except:
+            print(f"{model}: [WARN] Unable to resample audio")
             pass
         if model in AVAILABLE_MODELS.keys(): model = AVAILABLE_MODELS[model]
-        print(model)
-        print(f"Running model {model}")
         result_storage[model] = result
-
-
-
-
+
+    def _get_param_examples(parameters):
+        example_inputs = []
+        for param_info in parameters:
+            if (
+                param_info['component'] == 'Radio'
+                or param_info['component'] == 'Dropdown'
+                or param_info['component'] == 'Audio'
+                or param_info['python_type']['type'] == 'str'
+            ):
+                example_inputs.append(str(param_info['example_input']))
+                continue
+            if param_info['python_type']['type'] == 'int':
+                example_inputs.append(int(param_info['example_input']))
+                continue
+            if param_info['python_type']['type'] == 'float':
+                example_inputs.append(float(param_info['example_input']))
+                continue
+            if param_info['python_type']['type'] == 'bool':
+                example_inputs.append(bool(param_info['example_input']))
+                continue
+
+        return example_inputs
+
+    def _override_params(inputs, modelname):
+        try:
+            for key,value in OVERRIDE_INPUTS[modelname].items():
+                inputs[key] = value
+            print(f"{modelname}: Default inputs overridden by Arena")
+        except:
+            pass
+
+        return inputs
+
     mdl1k = mdl1
     mdl2k = mdl2
     print(mdl1k, mdl2k)
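The hf_headers logic in this hunk forwards the visitor's x-ip-token header to the target Space, so that ZeroGPU quota is attributed to the visitor rather than to this Arena Space. The same idea in isolation (the .get() guard is an addition; the commit itself uses try/except):

    from gradio_client import Client

    def client_for(space, request):
        token = request.headers.get('x-ip-token')  # set when served via hf.space
        headers = {'X-IP-Token': token} if token else {}
        return Client(space, hf_token=hf_token, headers=headers)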
@@ -86,24 +210,39 @@ def synthandreturn(text):
     if mdl2 in AVAILABLE_MODELS.keys(): mdl2k=AVAILABLE_MODELS[mdl2]
     results = {}
     print(f"Sending models {mdl1k} and {mdl2k} to API")
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    # do not use multithreading when both spaces are ZeroGPU type
+    if (
+        # exists
+        'is_zero_gpu_space' in HF_SPACES[mdl1]
+        # is True
+        and HF_SPACES[mdl1]['is_zero_gpu_space']
+        and 'is_zero_gpu_space' in HF_SPACES[mdl2]
+        and HF_SPACES[mdl2]['is_zero_gpu_space']
+    ):
+        # run Zero-GPU spaces one at a time
+        predict_and_update_result(text, mdl1k, results, request)
+        # _cache_sample(text, mdl1k)
+
+        predict_and_update_result(text, mdl2k, results, request)
+        # _cache_sample(text, mdl2k)
+    else:
+        # use multithreading
+        thread1 = threading.Thread(target=predict_and_update_result, args=(text, mdl1k, results, request))
+        thread2 = threading.Thread(target=predict_and_update_result, args=(text, mdl2k, results, request))
+
+        thread1.start()
+        # wait 3 seconds to calm hf.space domain
+        time.sleep(3)
+        thread2.start()
+        # timeout in 2 minutes
+        thread1.join(120)
+        thread2.join(120)
+
+    # cache the result
+    # for model in [mdl1k, mdl2k]:
+    #     _cache_sample(text, model)
+
     print(f"Retrieving models {mdl1k} and {mdl2k} from API")
     return (
         text,
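The _get_param_examples helper added earlier coerces the example_input that view_api() reports for each parameter to its declared python_type, so the Space receives a fully populated argument list, which _override_params then patches. For instance, a parameters entry shaped like this (values hypothetical):

    param_info = {
        'component': 'Slider',
        'python_type': {'type': 'float'},
        'example_input': '0.7',
    }
    # -> float('0.7') is appended to example_inputs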
@@ -194,7 +333,7 @@ def synthandreturn_battle(text, mdl1, mdl2):
     print(f"Sending models {mdl1k} and {mdl2k} to API")
     thread1 = threading.Thread(target=predict_and_update_result, args=(text, mdl1k, results))
     thread2 = threading.Thread(target=predict_and_update_result, args=(text, mdl2k, results))
-
+
     thread1.start()
     thread2.start()
     thread1.join()
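Running the two requests sequentially when both Spaces are ZeroGPU (the branch added above) presumably avoids spending the visitor's quota token from two threads at once. The predicate, restated with dict.get for clarity (not in the commit):

    def _both_zero_gpu(a, b):
        return all(HF_SPACES.get(m, {}).get('is_zero_gpu_space', False) for m in (a, b))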