edwko's picture
Create app.py
bf642c7 verified
raw
history blame
5.01 kB
import os
import gradio as gr
import outetts
from outetts.version.v1.interface import _DEFAULT_SPEAKERS
# Model configuration: points at the "OuteAI/OuteTTS-0.2-500M" checkpoint and
# starts with English as the active language.
# NOTE(review): presumably the path is resolved as a Hugging Face Hub repo id
# by outetts -- confirm against the outetts documentation.
model_config = outetts.HFModelConfig_v1(
model_path="OuteAI/OuteTTS-0.2-500M",
language="en",
)
# Single module-level interface instance; the callbacks defined in this file
# read and mutate it (language switching, speaker loading, generation).
interface = outetts.InterfaceHF(model_version="0.2", cfg=model_config)
def get_available_speakers(language):
    """Return the speaker choices offered for ``language``.

    The result starts with the literal ``"None"`` entry (meaning "no preset
    speaker"), followed by the default speaker names registered for that
    language. A language the interface does not list yields an empty list.
    """
    if language in interface.languages:
        # "None" goes first, followed by the preset names.
        return ["None", *_DEFAULT_SPEAKERS[language].keys()]
    return []
def change_interface_language(language):
    """Switch the TTS interface to ``language`` and refresh the speaker choices.

    Args:
        language: Language code selected in the UI.

    Returns:
        A pair of Gradio updates. The first carries the new speaker choices
        and selected value; the second toggles visibility (shown on success,
        hidden when the language switch raises ``ValueError``).
    """
    try:
        interface.change_language(language)
        # Never hand the dropdown an empty choice list: fall back to the
        # "None" entry so the selected value below is always valid.
        speakers = get_available_speakers(language) or ["None"]
    except ValueError:
        return gr.update(choices=["None"], value="None"), gr.update(visible=False)

    # Prefer the historical default, but only if this language actually has
    # it; otherwise pick the first real speaker, or "None" as a last resort.
    if "male_1" in speakers:
        selected = "male_1"
    else:
        selected = next((name for name in speakers if name != "None"), "None")
    return gr.update(choices=speakers, value=selected), gr.update(visible=True)
def generate_tts(
    text, temperature, repetition_penalty, language,
    speaker_selection, reference_audio, reference_text
):
    """Synthesize speech for ``text`` and report the result to the UI.

    Speaker resolution, in priority order:

    1. A custom speaker built from ``reference_audio`` + ``reference_text``
       when both are provided.
    2. The default speaker named by ``speaker_selection``, unless it is empty
       or the literal ``"None"``.
    3. No speaker at all (``None`` is passed to the model).

    Args:
        text: Text to synthesize; must contain non-whitespace characters.
        temperature: Sampling temperature forwarded to the model.
        repetition_penalty: Repetition penalty forwarded to the model.
        language: Selected UI language. Accepted for the event wiring but not
            read here.
        speaker_selection: Name of a default speaker, or ``"None"``.
        reference_audio: File path of the voice-cloning reference clip, or a
            falsy value when none was supplied.
        reference_text: Transcription of the reference clip.

    Returns:
        ``(output_path, None)`` on success, or ``(None, error_message)`` when
        anything fails. ``error_message`` is never empty on failure.
    """
    try:
        # Reject empty input up front instead of handing it to the model.
        if not text or not text.strip():
            raise ValueError("Text to synthesize is required")

        if reference_audio and reference_text:
            # Custom speaker: both the clip and its transcription are needed.
            if not os.path.exists(reference_audio):
                raise ValueError("Reference audio file not found")
            if not reference_text.strip():
                raise ValueError("Reference transcription text is required")
            speaker = interface.create_speaker(reference_audio, reference_text)
        elif speaker_selection and speaker_selection != "None":
            # Preset speaker chosen in the dropdown.
            speaker = interface.load_default_speaker(speaker_selection)
        else:
            # No speaker - random characteristics
            speaker = None

        output = interface.generate(
            text=text,
            speaker=speaker,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            max_length=4096
        )

        if output.audio is None:
            raise ValueError("Model failed to generate audio. This may be due to input length constraints or early EOS token.")

        # NOTE(review): this fixed relative path is reused by every call, so
        # overlapping requests would overwrite each other's file -- confirm
        # this is acceptable for the deployment.
        output_path = "output.wav"
        output.save(output_path)
        return output_path, None
    except Exception as e:
        # UI boundary: surface the failure as text rather than crashing the
        # callback. Fall back to the exception type name because the error
        # box is only shown for a non-empty message.
        return None, str(e) or type(e).__name__
# Build the Gradio UI: inputs in the left column, generated audio on the right.
with gr.Blocks() as demo:
    gr.Markdown("# OuteTTS-0.2-500M Text-to-Speech Demo")

    # Starts hidden; revealed after a generation attempt returns a message.
    error_box = gr.Textbox(label="Error Messages", visible=False)

    with gr.Row():
        with gr.Column():
            # Language selection
            language_dropdown = gr.Dropdown(
                label="Interface Language",
                choices=list(interface.languages),
                value="en",
            )

            # Speaker selection
            speaker_dropdown = gr.Dropdown(
                label="Speaker Selection",
                choices=get_available_speakers("en"),
                value="male_1",
            )

            text_input = gr.Textbox(
                label="Text to Synthesize",
                placeholder="Enter text here...",
            )

            temperature = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.1,
                label="Temperature (lower = more stable tone, higher = more expressive)",
            )

            repetition_penalty = gr.Slider(
                minimum=0.5,
                maximum=2.0,
                value=1.1,
                label="Repetition Penalty",
            )

            gr.Markdown("""
### Voice Cloning Guidelines:
- Use 10-15 seconds of clear, noise-free audio
- Provide accurate transcription
- Longer audio clips will reduce maximum output length
- Custom speaker overrides speaker selection
""")

            # Voice-cloning inputs; the clip is handed to callbacks as a path.
            reference_audio = gr.Audio(
                label="Reference Audio (for voice cloning)",
                type="filepath",
            )
            reference_text = gr.Textbox(
                label="Reference Transcription Text",
                placeholder="Enter exact transcription of reference audio",
            )

            submit_button = gr.Button("Generate Speech")

        with gr.Column():
            audio_output = gr.Audio(
                label="Generated Audio",
                type="filepath",
            )

    # Changing the language returns two updates, both aimed at the speaker
    # dropdown (new choices/value, then visibility).
    language_dropdown.change(
        fn=change_interface_language,
        inputs=[language_dropdown],
        outputs=[speaker_dropdown, speaker_dropdown],
    )

    # Generate on click, then show the error box only if a message came back.
    generation_event = submit_button.click(
        fn=generate_tts,
        inputs=[
            text_input,
            temperature,
            repetition_penalty,
            language_dropdown,
            speaker_dropdown,
            reference_audio,
            reference_text,
        ],
        outputs=[audio_output, error_box],
    )
    generation_event.then(
        fn=lambda message: gr.update(visible=bool(message)),
        inputs=[error_box],
        outputs=[error_box],
    )

demo.launch()