File size: 1,942 Bytes
41ad754
5cb3ee9
d8238c0
aa3f3ad
 
0431ea7
d8238c0
aa3f3ad
41ad754
d8238c0
aa3f3ad
 
d8238c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import os
import sys
from fastapi import Request
import gradio as gr
from TTS.api import TTS

# Record agreement to the Coqui TTS terms of service in the environment
# before the model is constructed below.
os.environ.update({"COQUI_TOS_AGREED": "1"})

# Identifier of the multilingual XTTS v2 model used for synthesis.
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"

# Build the synthesiser with GPU support switched off, then pin it to the
# CPU explicitly so inference never targets another device.
tts = TTS(model_name=model_name, gpu=False)
tts.to("cpu")

def predict(prompt, language, audio_file_path, use_mic, agree):
    """Synthesise ``prompt`` using a reference recording as the speaker voice.

    The returned pair is ordered to match the interface outputs declared in
    this file: the audio component first, the text component second.

    Args:
        prompt: Text to speak; must be between 2 and 50000 characters long.
        language: Language code forwarded unchanged to the TTS model.
        audio_file_path: Filesystem path of the reference speaker recording,
            or ``None``/empty when the user supplied none.
        use_mic: Whether the user indicated the reference comes from the
            microphone; only affects which "missing audio" message is shown.
        agree: Whether the user accepted the terms and conditions.

    Returns:
        A ``(audio_path, message)`` tuple. On success ``audio_path`` is the
        path of the generated WAV file and ``message`` is a status line. On
        any validation or synthesis failure ``audio_path`` is ``None`` and
        ``message`` explains the problem.
    """
    if not agree:
        return None, "You must agree to the Terms & Condition!"

    if use_mic and not audio_file_path:
        return None, "Please provide a microphone recording or disable the 'Use Microphone' option."

    # Treat a missing prompt like an empty one so it is rejected by the
    # length check instead of crashing on len(None).
    prompt = prompt or ""
    if not 2 <= len(prompt) <= 50000:
        return None, "Prompt text length must be between 2 and 50000 characters."

    # The synthesis call below supplies no other speaker information, so a
    # reference recording is required; report that clearly up front rather
    # than through the generic failure message.
    if not audio_file_path:
        return None, "Please provide a reference audio recording."

    output_path = "output.wav"

    try:
        tts.tts_to_file(
            text=prompt,
            file_path=output_path,
            speaker_wav=audio_file_path,
            language=language,
        )
    except Exception as e:
        # UI boundary: log the details for the operator and show the user a
        # generic message instead of a traceback.
        print(f"Error during TTS generation: {e}", file=sys.stderr)
        return None, "An error occurred during TTS generation."

    # Hand the file path to the audio output and a status line to the text
    # output.
    return output_path, "Speech generated successfully."

# Candidate example rows, in input order:
# [prompt, language, reference audio path, use microphone, agree].
_EXAMPLE_ROWS = [
    ["Hello, World !", "en", "path/to/example_audio.wav", False, True],
]

# Only offer examples whose reference audio actually exists on disk: an
# example pointing at a missing file can make Gradio fail when it loads or
# runs that example.
_available_examples = [row for row in _EXAMPLE_ROWS if os.path.isfile(row[2])]

# Web UI: five inputs matching predict's parameters in order, and two outputs
# (synthesised audio, then a text message).
iface = gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(label="Text Prompt"),
        gr.Dropdown(label="Language", choices=["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn"], value="en"),
        gr.Audio(label="Reference Audio", type="filepath"),
        gr.Checkbox(label="Use Microphone as Reference", value=False),
        gr.Checkbox(label="Agree to Terms & Conditions", value=True),
    ],
    outputs=[gr.Audio(label="Synthesised Audio"), "text"],
    title="XTTS Text-to-Speech",
    description="A web interface for Coqui's TTS model to generate speech from text.",
    # Pass None (no examples panel) when no usable example remains.
    examples=_available_examples or None,
)

# Start the web server with debug mode enabled.
iface.launch(debug=True)