Spaces:
Running
Running
mrfakename
committed on
Sync from GitHub repo
Browse files
This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the Space there.
app.py
CHANGED
@@ -120,6 +120,14 @@ def infer(
|
|
120 |
speed=1,
|
121 |
show_info=gr.Info,
|
122 |
):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
123 |
ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
|
124 |
|
125 |
if model == "F5-TTS":
|
@@ -240,7 +248,7 @@ with gr.Blocks() as app_tts:
|
|
240 |
nfe_step=nfe_slider,
|
241 |
speed=speed_slider,
|
242 |
)
|
243 |
-
return audio_out, spectrogram_path,
|
244 |
|
245 |
generate_btn.click(
|
246 |
basic_tts,
|
@@ -320,7 +328,7 @@ with gr.Blocks() as app_multistyle:
|
|
320 |
)
|
321 |
|
322 |
# Regular speech type (mandatory)
|
323 |
-
with gr.Row():
|
324 |
with gr.Column():
|
325 |
regular_name = gr.Textbox(value="Regular", label="Speech Type Name")
|
326 |
regular_insert = gr.Button("Insert Label", variant="secondary")
|
@@ -329,12 +337,12 @@ with gr.Blocks() as app_multistyle:
|
|
329 |
|
330 |
# Regular speech type (max 100)
|
331 |
max_speech_types = 100
|
332 |
-
speech_type_rows = []
|
333 |
-
speech_type_names = [regular_name]
|
334 |
-
speech_type_audios = [regular_audio]
|
335 |
-
speech_type_ref_texts = [regular_ref_text]
|
336 |
-
speech_type_delete_btns = []
|
337 |
-
speech_type_insert_btns = [regular_insert]
|
338 |
|
339 |
# Additional speech types (99 more)
|
340 |
for i in range(max_speech_types - 1):
|
@@ -355,51 +363,32 @@ with gr.Blocks() as app_multistyle:
|
|
355 |
# Button to add speech type
|
356 |
add_speech_type_btn = gr.Button("Add Speech Type")
|
357 |
|
358 |
-
# Keep track of
|
359 |
-
speech_type_count =
|
360 |
|
361 |
# Function to add a speech type
|
362 |
-
def add_speech_type_fn(
|
|
|
|
|
363 |
if speech_type_count < max_speech_types:
|
|
|
364 |
speech_type_count += 1
|
365 |
-
# Prepare updates for the rows
|
366 |
-
row_updates = []
|
367 |
-
for i in range(1, max_speech_types):
|
368 |
-
if i < speech_type_count:
|
369 |
-
row_updates.append(gr.update(visible=True))
|
370 |
-
else:
|
371 |
-
row_updates.append(gr.update())
|
372 |
else:
|
373 |
-
|
374 |
-
|
375 |
-
return [speech_type_count] + row_updates
|
376 |
|
377 |
-
add_speech_type_btn.click(
|
378 |
-
add_speech_type_fn, inputs=speech_type_count, outputs=[speech_type_count] + speech_type_rows
|
379 |
-
)
|
380 |
|
381 |
# Function to delete a speech type
|
382 |
-
def
|
383 |
-
|
384 |
-
# Prepare updates
|
385 |
-
row_updates = []
|
386 |
-
|
387 |
-
for i in range(1, max_speech_types):
|
388 |
-
if i == index:
|
389 |
-
row_updates.append(gr.update(visible=False))
|
390 |
-
else:
|
391 |
-
row_updates.append(gr.update())
|
392 |
-
|
393 |
-
speech_type_count = max(1, speech_type_count)
|
394 |
-
|
395 |
-
return [speech_type_count] + row_updates
|
396 |
-
|
397 |
-
return delete_speech_type_fn
|
398 |
|
399 |
# Update delete button clicks
|
400 |
-
for i
|
401 |
-
|
402 |
-
|
|
|
|
|
403 |
|
404 |
# Text input for the prompt
|
405 |
gen_text_input_multistyle = gr.Textbox(
|
@@ -413,7 +402,7 @@ with gr.Blocks() as app_multistyle:
|
|
413 |
current_text = current_text or ""
|
414 |
speech_type_name = speech_type_name or "None"
|
415 |
updated_text = current_text + f"{{{speech_type_name}}} "
|
416 |
-
return
|
417 |
|
418 |
return insert_speech_type_fn
|
419 |
|
@@ -473,10 +462,14 @@ with gr.Blocks() as app_multistyle:
|
|
473 |
if style in speech_types:
|
474 |
current_style = style
|
475 |
else:
|
476 |
-
|
477 |
current_style = "Regular"
|
478 |
|
479 |
-
|
|
|
|
|
|
|
|
|
480 |
ref_text = speech_types[current_style].get("ref_text", "")
|
481 |
|
482 |
# Generate speech for this segment
|
@@ -491,12 +484,10 @@ with gr.Blocks() as app_multistyle:
|
|
491 |
# Concatenate all audio segments
|
492 |
if generated_audio_segments:
|
493 |
final_audio_data = np.concatenate(generated_audio_segments)
|
494 |
-
return [(sr, final_audio_data)] + [
|
495 |
-
gr.update(value=speech_types[style]["ref_text"]) for style in speech_types
|
496 |
-
]
|
497 |
else:
|
498 |
gr.Warning("No audio generated.")
|
499 |
-
return [None] + [
|
500 |
|
501 |
generate_multistyle_btn.click(
|
502 |
generate_multistyle_speech,
|
@@ -514,7 +505,7 @@ with gr.Blocks() as app_multistyle:
|
|
514 |
|
515 |
# Validation function to disable Generate button if speech types are missing
|
516 |
def validate_speech_types(gen_text, regular_name, *args):
|
517 |
-
speech_type_names_list = args
|
518 |
|
519 |
# Collect the speech types names
|
520 |
speech_types_available = set()
|
@@ -678,7 +669,7 @@ Have a conversation with an AI using your reference voice!
|
|
678 |
speed=1.0,
|
679 |
show_info=print, # show_info=print no pull to top when generating
|
680 |
)
|
681 |
-
return audio_result,
|
682 |
|
683 |
def clear_conversation():
|
684 |
"""Reset the conversation"""
|
@@ -828,7 +819,10 @@ If you're having issues, try converting your reference audio to WAV or MP3, clip
|
|
828 |
visible=False,
|
829 |
)
|
830 |
custom_model_cfg = gr.Dropdown(
|
831 |
-
choices=[
|
|
|
|
|
|
|
832 |
value=load_last_used_custom()[2],
|
833 |
allow_custom_value=True,
|
834 |
label="Config: in a dictionary form",
|
|
|
120 |
speed=1,
|
121 |
show_info=gr.Info,
|
122 |
):
|
123 |
+
if not ref_audio_orig:
|
124 |
+
gr.Warning("Please provide reference audio.")
|
125 |
+
return gr.update(), gr.update(), ref_text
|
126 |
+
|
127 |
+
if not gen_text.strip():
|
128 |
+
gr.Warning("Please enter text to generate.")
|
129 |
+
return gr.update(), gr.update(), ref_text
|
130 |
+
|
131 |
ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
|
132 |
|
133 |
if model == "F5-TTS":
|
|
|
248 |
nfe_step=nfe_slider,
|
249 |
speed=speed_slider,
|
250 |
)
|
251 |
+
return audio_out, spectrogram_path, ref_text_out
|
252 |
|
253 |
generate_btn.click(
|
254 |
basic_tts,
|
|
|
328 |
)
|
329 |
|
330 |
# Regular speech type (mandatory)
|
331 |
+
with gr.Row() as regular_row:
|
332 |
with gr.Column():
|
333 |
regular_name = gr.Textbox(value="Regular", label="Speech Type Name")
|
334 |
regular_insert = gr.Button("Insert Label", variant="secondary")
|
|
|
337 |
|
338 |
# Regular speech type (max 100)
|
339 |
max_speech_types = 100
|
340 |
+
speech_type_rows = [regular_row]
|
341 |
+
speech_type_names = [regular_name]
|
342 |
+
speech_type_audios = [regular_audio]
|
343 |
+
speech_type_ref_texts = [regular_ref_text]
|
344 |
+
speech_type_delete_btns = [None]
|
345 |
+
speech_type_insert_btns = [regular_insert]
|
346 |
|
347 |
# Additional speech types (99 more)
|
348 |
for i in range(max_speech_types - 1):
|
|
|
363 |
# Button to add speech type
|
364 |
add_speech_type_btn = gr.Button("Add Speech Type")
|
365 |
|
366 |
+
# Keep track of autoincrement of speech types, no roll back
|
367 |
+
speech_type_count = 1
|
368 |
|
369 |
# Function to add a speech type
|
370 |
+
def add_speech_type_fn():
|
371 |
+
row_updates = [gr.update() for _ in range(max_speech_types)]
|
372 |
+
global speech_type_count
|
373 |
if speech_type_count < max_speech_types:
|
374 |
+
row_updates[speech_type_count] = gr.update(visible=True)
|
375 |
speech_type_count += 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
376 |
else:
|
377 |
+
gr.Warning("Exhausted maximum number of speech types. Consider restart the app.")
|
378 |
+
return row_updates
|
|
|
379 |
|
380 |
+
add_speech_type_btn.click(add_speech_type_fn, outputs=speech_type_rows)
|
|
|
|
|
381 |
|
382 |
# Function to delete a speech type
|
383 |
+
def delete_speech_type_fn():
|
384 |
+
return gr.update(visible=False), None, None, None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
385 |
|
386 |
# Update delete button clicks
|
387 |
+
for i in range(1, len(speech_type_delete_btns)):
|
388 |
+
speech_type_delete_btns[i].click(
|
389 |
+
delete_speech_type_fn,
|
390 |
+
outputs=[speech_type_rows[i], speech_type_names[i], speech_type_audios[i], speech_type_ref_texts[i]],
|
391 |
+
)
|
392 |
|
393 |
# Text input for the prompt
|
394 |
gen_text_input_multistyle = gr.Textbox(
|
|
|
402 |
current_text = current_text or ""
|
403 |
speech_type_name = speech_type_name or "None"
|
404 |
updated_text = current_text + f"{{{speech_type_name}}} "
|
405 |
+
return updated_text
|
406 |
|
407 |
return insert_speech_type_fn
|
408 |
|
|
|
462 |
if style in speech_types:
|
463 |
current_style = style
|
464 |
else:
|
465 |
+
gr.Warning(f"Type {style} is not available, will use Regular as default.")
|
466 |
current_style = "Regular"
|
467 |
|
468 |
+
try:
|
469 |
+
ref_audio = speech_types[current_style]["audio"]
|
470 |
+
except KeyError:
|
471 |
+
gr.Warning(f"Please provide reference audio for type {current_style}.")
|
472 |
+
return [None] + [speech_types[style]["ref_text"] for style in speech_types]
|
473 |
ref_text = speech_types[current_style].get("ref_text", "")
|
474 |
|
475 |
# Generate speech for this segment
|
|
|
484 |
# Concatenate all audio segments
|
485 |
if generated_audio_segments:
|
486 |
final_audio_data = np.concatenate(generated_audio_segments)
|
487 |
+
return [(sr, final_audio_data)] + [speech_types[style]["ref_text"] for style in speech_types]
|
|
|
|
|
488 |
else:
|
489 |
gr.Warning("No audio generated.")
|
490 |
+
return [None] + [speech_types[style]["ref_text"] for style in speech_types]
|
491 |
|
492 |
generate_multistyle_btn.click(
|
493 |
generate_multistyle_speech,
|
|
|
505 |
|
506 |
# Validation function to disable Generate button if speech types are missing
|
507 |
def validate_speech_types(gen_text, regular_name, *args):
|
508 |
+
speech_type_names_list = args
|
509 |
|
510 |
# Collect the speech types names
|
511 |
speech_types_available = set()
|
|
|
669 |
speed=1.0,
|
670 |
show_info=print, # show_info=print no pull to top when generating
|
671 |
)
|
672 |
+
return audio_result, ref_text_out
|
673 |
|
674 |
def clear_conversation():
|
675 |
"""Reset the conversation"""
|
|
|
819 |
visible=False,
|
820 |
)
|
821 |
custom_model_cfg = gr.Dropdown(
|
822 |
+
choices=[
|
823 |
+
DEFAULT_TTS_MODEL_CFG[2],
|
824 |
+
json.dumps(dict(dim=768, depth=18, heads=12, ff_mult=2, text_dim=512, conv_layers=4)),
|
825 |
+
],
|
826 |
value=load_last_used_custom()[2],
|
827 |
allow_custom_value=True,
|
828 |
label="Config: in a dictionary form",
|