"""Kokoro-82M TTS Engine."""

import gradio as gr
from kokoro import generate
from models import build_model
from scipy.io.wavfile import write
from pydub import AudioSegment
import torch
import numpy as np
import os
import shortuuid

# Load the model once at import time. Voicepacks are NOT loaded here; they are
# loaded per request inside text_to_speech().
MODEL_PATH = 'kokoro-v0_19.pth'

# Fail fast with a clear message instead of a less obvious loader error.
if not os.path.exists(MODEL_PATH):
    raise FileNotFoundError(f"Error: Model file '{MODEL_PATH}' does not exist.")

MODEL = build_model(MODEL_PATH, 'cpu')
print("\n-------------\nModel loaded.")

# Voice identifiers offered in the UI dropdown and accepted by
# text_to_speech(). The first character of a name is what text_to_speech()
# passes to generate() as the language code.
VOICE_NAMES = [
    # Default voice is a 50-50 mix of Bella & Sarah
    "af",
    "af_bella",
    "af_sarah",
    "am_adam",
    "am_michael",
    "bf_emma",
    "bf_isabella",
    "bm_george",
    "bm_lewis",
    "af_nicole",
    "af_sky",
]

# Sample rate (Hz) passed to scipy's write() for the WAV header.
_SAMPLE_RATE = 24000


def _filename_stem(text, max_length=32):
    """Build a filesystem-safe file name stem from the first word of *text*."""
    words = text.split()
    first_word = words[0] if words else ""
    # Drop path separators, dots and other punctuation so user text can never
    # escape the output folder or produce an invalid file name.
    safe = "".join(ch for ch in first_word if ch.isalnum() or ch in "-_")
    return safe[:max_length] or "audio"


def _to_int16_pcm(samples):
    """Peak-normalize a non-empty sample array and convert it to int16."""
    peak = np.max(np.abs(samples))
    # A silent (all-zero) signal has a peak of 0; dividing by it would fill
    # the output with NaNs, so leave such a signal untouched.
    if peak > 0:
        samples = samples / peak
    return np.int16(samples * 32767)


def text_to_speech(text, voice_name, output_folder):
    """Synthesize *text* with the selected voice and save it as a WAV file.

    Args:
        text: The text to convert to speech.
        voice_name: One of ``VOICE_NAMES``. Its first character is passed to
            ``generate`` as the language code.
        output_folder: Directory the WAV file is written to; it is created
            if it does not exist.

    Returns:
        A ``(wav_path, status_message)`` tuple. ``wav_path`` is ``None`` when
        no file was written, in which case ``status_message`` says why.
    """
    if voice_name not in VOICE_NAMES:
        return None, "Invalid voice name."

    if not text or not text.strip():
        return None, "No text provided."

    if not output_folder:
        return None, "No output folder provided."

    # Load the selected voicepack
    voicepack_path = f'voices/{voice_name}.pt'
    if not os.path.exists(voicepack_path):
        return None, f"Voicepack '{voice_name}' not found."

    voicepack = torch.load(voicepack_path, weights_only=True).to('cpu')
    print(f'Loaded voice: {voice_name}')

    # Generate audio
    result = generate(MODEL, text, voicepack, lang=voice_name[0])
    # NOTE(review): guards against generate() returning None instead of a
    # tuple when it has nothing to synthesize -- confirm against
    # kokoro.generate.
    if result is None:
        return None, "No audio was generated."
    audio_data, _ = result  # the second element is not used here

    samples = np.asarray(audio_data)
    if samples.size == 0:
        # np.max() raises on an empty array, so bail out before normalizing.
        return None, "No audio was generated."
    scaled_audio = _to_int16_pcm(samples)

    # Save files
    os.makedirs(output_folder, exist_ok=True)
    wav_path = os.path.join(
        output_folder, f'{_filename_stem(text)}-{shortuuid.uuid()}.wav'
    )
    write(wav_path, _SAMPLE_RATE, scaled_audio)

    return wav_path, f"Audio saved at: {wav_path}"

# Gradio Blocks implementation
with gr.Blocks(theme='gradio/soft') as app:
    gr.Markdown(
        """
            <h1 align="center">Kokoro-82M TTS Engine</h1>
            <h4 align="left">A TTS engine with only 82M parameters. Enter the Text, voice and output folder and click generate to generate audio</h4>
        """
    )

    with gr.Row():
        # Left column: the inputs and the button that triggers generation.
        with gr.Column():
            text_input = gr.Textbox(label="Text to Convert")
            # Preselect the first voice explicitly so a click without touching
            # the dropdown does not submit an empty voice name.
            voice_selector = gr.Dropdown(
                choices=VOICE_NAMES,
                value=VOICE_NAMES[0],
                label="Select Voice",
            )
            output_folder_input = gr.Textbox(label="Output Folder", value="./outputs")
            submit_button = gr.Button("Generate")
        # Right column: the generated audio and a read-only status line.
        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio", type="filepath")
            status_output = gr.Textbox(label="Status", interactive=False)

    def process_text_to_speech(text, voice_name, output_folder):
        """Click handler: forward the form values to text_to_speech().

        Returns the ``(wav_path, status_message)`` pair from
        ``text_to_speech``, which is wired to the audio and status outputs.
        """
        return text_to_speech(text, voice_name, output_folder)

    submit_button.click(
        fn=process_text_to_speech,
        inputs=[text_input, voice_selector, output_folder_input],
        outputs=[audio_output, status_output],
    )


# Launch the Gradio app only when this file is run as a script, not on import.
if __name__ == "__main__":
    app.launch()