import os
import sys

import gradio as gr
from fastapi import Request
from TTS.api import TTS

# Agree to the Coqui TTS terms of service up front so the model download
# does not block on an interactive prompt.
os.environ["COQUI_TOS_AGREED"] = "1"

# Multilingual XTTS v2 voice-cloning model, initialized for CPU inference.
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
tts = TTS(model_name, gpu=False)
tts.to("cpu")  # Use CPU for inference


def predict(prompt, language, audio_file_path, use_mic, agree):
    """Synthesise speech for *prompt* in *language*, cloning the voice from
    the reference recording at *audio_file_path*.

    Returns a ``(audio, text)`` tuple matching the interface's output order:
    the path of the generated WAV file (or ``None`` on failure) and a text
    message (the output path on success, an error message otherwise).
    """
    # Validation guards. NOTE: errors go in the SECOND slot so they land in
    # the text output; the Audio component receives None. (The original code
    # had the tuple reversed, handing the error string to gr.Audio.)
    if not agree:
        return None, "You must agree to the Terms & Condition!"
    if use_mic and not audio_file_path:
        return None, "Please provide a microphone recording or disable the 'Use Microphone' option."
    if not (2 <= len(prompt) <= 50000):
        return None, "Prompt text length must be between 2 and 50000 characters."

    speaker_wav = audio_file_path
    output_path = "output.wav"

    try:
        tts.tts_to_file(
            text=prompt,
            file_path=output_path,
            speaker_wav=speaker_wav,
            language=language,
        )
    except Exception as e:
        # Log the real cause server-side; show the user a generic message.
        print(f"Error during TTS generation: {e}", file=sys.stderr)
        return None, "An error occurred during TTS generation."

    # gr.Audio accepts a filepath directly as an output value; the previous
    # gr.Audio(file_path=...) call used a keyword that does not exist.
    return output_path, output_path


iface = gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(label="Text Prompt"),
        gr.Dropdown(
            label="Language",
            choices=["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn"],
            value="en",
        ),
        gr.Audio(label="Reference Audio", type="filepath"),
        gr.Checkbox(label="Use Microphone as Reference", value=False),
        gr.Checkbox(label="Agree to Terms & Conditions", value=True),
    ],
    outputs=[gr.Audio(label="Synthesised Audio"), "text"],
    title="XTTS Text-to-Speech",
    description="A web interface for Coqui's TTS model to generate speech from text.",
    examples=[
        # Example inputs
        ["Hello, World !", "en", "path/to/example_audio.wav", False, True],
    ],
)

iface.launch(debug=True)