|
import os |
|
import gradio as gr |
|
import outetts |
|
from outetts.version.v1.interface import _DEFAULT_SPEAKERS |
|
|
|
model_config = outetts.HFModelConfig_v1( |
|
model_path="OuteAI/OuteTTS-0.2-500M", |
|
language="en", |
|
) |
|
interface = outetts.InterfaceHF(model_version="0.2", cfg=model_config) |
|
|
|
def get_available_speakers(language): |
|
"""Get available speakers for the selected language.""" |
|
if language not in interface.languages: |
|
return [] |
|
speakers = list(_DEFAULT_SPEAKERS[language].keys()) |
|
speakers.insert(0, "None") |
|
return speakers |
|
|
|
def change_interface_language(language): |
|
"""Change interface language and update available speakers.""" |
|
try: |
|
interface.change_language(language) |
|
speakers = get_available_speakers(language) |
|
return gr.update(choices=speakers, value="male_1"), gr.update(visible=True) |
|
except ValueError as e: |
|
return gr.update(choices=["None"], value="None"), gr.update(visible=False) |
|
|
|
def generate_tts( |
|
text, temperature, repetition_penalty, language, |
|
speaker_selection, reference_audio, reference_text |
|
): |
|
"""Generate TTS with error handling and new features.""" |
|
try: |
|
|
|
if reference_audio and reference_text: |
|
if not os.path.exists(reference_audio): |
|
raise ValueError("Reference audio file not found") |
|
if not reference_text.strip(): |
|
raise ValueError("Reference transcription text is required") |
|
speaker = interface.create_speaker(reference_audio, reference_text) |
|
|
|
|
|
elif speaker_selection and speaker_selection != "None": |
|
speaker = interface.load_default_speaker(speaker_selection) |
|
|
|
|
|
else: |
|
speaker = None |
|
|
|
|
|
output = interface.generate( |
|
text=text, |
|
speaker=speaker, |
|
temperature=temperature, |
|
repetition_penalty=repetition_penalty, |
|
max_length=4096 |
|
) |
|
|
|
|
|
if output.audio is None: |
|
raise ValueError("Model failed to generate audio. This may be due to input length constraints or early EOS token.") |
|
|
|
|
|
output_path = "output.wav" |
|
output.save(output_path) |
|
return output_path, None |
|
|
|
except Exception as e: |
|
return None, str(e) |
|
|
|
with gr.Blocks() as demo: |
|
gr.Markdown("# OuteTTS-0.2-500M Text-to-Speech Demo") |
|
|
|
error_box = gr.Textbox(label="Error Messages", visible=False) |
|
|
|
with gr.Row(): |
|
with gr.Column(): |
|
|
|
language_dropdown = gr.Dropdown( |
|
choices=list(interface.languages), |
|
value="en", |
|
label="Interface Language" |
|
) |
|
|
|
|
|
speaker_dropdown = gr.Dropdown( |
|
choices=get_available_speakers("en"), |
|
value="male_1", |
|
label="Speaker Selection" |
|
) |
|
|
|
text_input = gr.Textbox( |
|
label="Text to Synthesize", |
|
placeholder="Enter text here..." |
|
) |
|
|
|
temperature = gr.Slider( |
|
0.1, 1.0, |
|
value=0.1, |
|
label="Temperature (lower = more stable tone, higher = more expressive)" |
|
) |
|
|
|
repetition_penalty = gr.Slider( |
|
0.5, 2.0, |
|
value=1.1, |
|
label="Repetition Penalty" |
|
) |
|
|
|
gr.Markdown(""" |
|
### Voice Cloning Guidelines: |
|
- Use 10-15 seconds of clear, noise-free audio |
|
- Provide accurate transcription |
|
- Longer audio clips will reduce maximum output length |
|
- Custom speaker overrides speaker selection |
|
""") |
|
|
|
reference_audio = gr.Audio( |
|
label="Reference Audio (for voice cloning)", |
|
type="filepath" |
|
) |
|
|
|
reference_text = gr.Textbox( |
|
label="Reference Transcription Text", |
|
placeholder="Enter exact transcription of reference audio" |
|
) |
|
|
|
submit_button = gr.Button("Generate Speech") |
|
|
|
with gr.Column(): |
|
audio_output = gr.Audio( |
|
label="Generated Audio", |
|
type="filepath" |
|
) |
|
|
|
language_dropdown.change( |
|
fn=change_interface_language, |
|
inputs=[language_dropdown], |
|
outputs=[speaker_dropdown, speaker_dropdown] |
|
) |
|
|
|
submit_button.click( |
|
fn=generate_tts, |
|
inputs=[ |
|
text_input, |
|
temperature, |
|
repetition_penalty, |
|
language_dropdown, |
|
speaker_dropdown, |
|
reference_audio, |
|
reference_text |
|
], |
|
outputs=[audio_output, error_box] |
|
).then( |
|
fn=lambda x: gr.update(visible=bool(x)), |
|
inputs=[error_box], |
|
outputs=[error_box] |
|
) |
|
|
|
demo.launch() |