"""Gradio front end for the Kokoro-82M text-to-speech model.

Loads the model once at import time, exposes ``text_to_speech`` for
synthesizing a WAV file from text, and builds a small Blocks UI around it.
"""

import os
import re

import gradio as gr
import numpy as np
import shortuuid
import torch
from pydub import AudioSegment
from scipy.io.wavfile import write

from kokoro import generate
from models import build_model

# Load the model once at import time (voicepacks are loaded per request).
MODEL_PATH = 'kokoro-v0_19.pth'

if not os.path.exists(MODEL_PATH):
    raise FileNotFoundError(f"Error: Model file '{MODEL_PATH}' does not exist.")

MODEL = build_model(MODEL_PATH, 'cpu')
print("\n-------------\nModel loaded.")

VOICE_NAMES = [
    'af',  # Default voice is a 50-50 mix of Bella & Sarah
    'af_bella',
    'af_sarah',
    'am_adam',
    'am_michael',
    'bf_emma',
    'bf_isabella',
    'bm_george',
    'bm_lewis',
    'af_nicole',
    'af_sky',
]

# Sample rate passed to scipy's WAV writer.
SAMPLE_RATE = 24000

# Upper bound on the text-derived part of the output filename.
_MAX_PREFIX_LENGTH = 40


def _safe_filename_prefix(text):
    """Return a filesystem-safe filename prefix derived from the first word of *text*."""
    words = text.split()
    first_word = words[0] if words else ""
    # Replace anything that is not a word character or hyphen so that path
    # separators, "..", and other special characters never reach the path.
    cleaned = re.sub(r"[^\w\-]+", "_", first_word).strip("_")
    return cleaned[:_MAX_PREFIX_LENGTH] or "audio"


def _to_int16_pcm(audio_data):
    """Peak-normalize *audio_data* and scale it to int16; return None if it is empty."""
    samples = np.asarray(audio_data)
    if samples.size == 0:
        return None
    peak = np.max(np.abs(samples))
    # Skip normalization for all-zero (silent) audio to avoid dividing by zero.
    if peak > 0:
        samples = samples / peak
    return np.int16(samples * 32767)


def text_to_speech(text, voice_name, output_folder):
    """Synthesize *text* with the given voice and save it as a WAV file.

    Args:
        text: The text to convert to speech.
        voice_name: One of ``VOICE_NAMES``; its first character is passed to
            ``generate`` as the ``lang`` argument.
        output_folder: Directory to write the WAV file into. It is created
            if it does not exist.

    Returns:
        A ``(wav_path, status_message)`` tuple. ``wav_path`` is ``None`` when
        nothing was written, in which case ``status_message`` explains why.
    """
    if voice_name not in VOICE_NAMES:
        return None, "Invalid voice name."
    if not text or not text.strip():
        return None, "Please enter some text to convert."
    if not output_folder or not output_folder.strip():
        return None, "Please provide an output folder."

    # Load the selected voicepack
    voicepack_path = f'voices/{voice_name}.pt'
    if not os.path.exists(voicepack_path):
        return None, f"Voicepack '{voice_name}' not found."
    # map_location keeps the load on CPU even if the file was saved from a
    # CUDA device, so this also works on machines without a GPU.
    voicepack = torch.load(voicepack_path, map_location='cpu', weights_only=True)
    print(f'Loaded voice: {voice_name}')

    # Generate audio
    # NOTE(review): guarding against generate() returning None (e.g. when the
    # text yields nothing to synthesize) - confirm against kokoro.generate.
    result = generate(MODEL, text, voicepack, lang=voice_name[0])
    if result is None:
        return None, "No audio was generated for the given text."
    audio_data, _phonemes = result

    # Normalize and scale audio data
    scaled_audio = _to_int16_pcm(audio_data)
    if scaled_audio is None:
        return None, "No audio was generated for the given text."

    # Save files
    filename = f'{_safe_filename_prefix(text)}-{shortuuid.uuid()}.wav'
    wav_path = os.path.join(output_folder, filename)
    try:
        os.makedirs(output_folder, exist_ok=True)
        write(wav_path, SAMPLE_RATE, scaled_audio)
    except OSError as err:
        return None, f"Could not save audio: {err}"

    return wav_path, f"Audio saved at: {wav_path}"


# Gradio Blocks implementation
with gr.Blocks(theme='gradio/soft') as app:
    gr.Markdown(
        """

Kokoro-82M TTS Engine

A TTS engine with only 82M parameters. Enter the Text, voice and output folder and click generate to generate audio

"""
    )

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(label="Text to Convert")
            # Preselect the default voice so the first click has a valid choice.
            voice_selector = gr.Dropdown(
                choices=VOICE_NAMES,
                value=VOICE_NAMES[0],
                label="Select Voice",
            )
            output_folder_input = gr.Textbox(label="Output Folder", value="./outputs")
            submit_button = gr.Button("Generate")

        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio", type="filepath")
            status_output = gr.Textbox(label="Status", interactive=False)

    def process_text_to_speech(text, voice_name, output_folder):
        """Click handler: forward the UI inputs to ``text_to_speech``."""
        return text_to_speech(text, voice_name, output_folder)

    submit_button.click(
        fn=process_text_to_speech,
        inputs=[text_input, voice_selector, output_folder_input],
        outputs=[audio_output, status_output],
    )

if __name__ == "__main__":
    app.launch()