|
import gradio as gr |
|
from kokoro import generate |
|
from models import build_model |
|
from scipy.io.wavfile import write |
|
from pydub import AudioSegment |
|
import torch |
|
import numpy as np |
|
import os |
|
import shortuuid |
|
|
|
|
|
# Checkpoint file for the Kokoro model, resolved against the working directory.
MODEL_PATH = 'kokoro-v0_19.pth'
MODEL = None

# Fail fast at import time: nothing below can work without the checkpoint.
if not os.path.exists(MODEL_PATH):
    raise FileNotFoundError(f"Error: Model file '{MODEL_PATH}' does not exist.")

# MODEL is still None at this point, so the model is built exactly once, on CPU.
MODEL = build_model(MODEL_PATH, 'cpu')
print("\n-------------\nModel loaded.")
|
|
|
# Voices offered in the UI dropdown. ``text_to_speech`` looks each name up as
# ``voices/<name>.pt`` and passes the name's first character to ``generate``
# as the ``lang`` argument.
VOICE_NAMES = [
    'af',
    'af_bella',
    'af_sarah',
    'am_adam',
    'am_michael',
    'bf_emma',
    'bf_isabella',
    'bm_george',
    'bm_lewis',
    'af_nicole',
    'af_sky',
]
|
|
|
def _filename_stem(text, max_length=32):
    """Return a filesystem-safe stem built from the first word of ``text``."""
    words = text.split()
    first_word = words[0] if words else ""
    # Path separators and other special characters must not reach the
    # filename, otherwise the output could land outside ``output_folder``
    # or fail to be created at all.
    cleaned = "".join(
        ch if ch.isalnum() or ch in "-_" else "_" for ch in first_word
    )
    return cleaned[:max_length] or "audio"


def _to_int16_pcm(samples):
    """Peak-normalise ``samples`` and convert them to 16-bit PCM."""
    peak = np.max(np.abs(samples))
    # An all-zero (silent) signal has no peak to normalise against; dividing
    # by zero would fill the output with NaNs.
    if peak > 0:
        samples = samples / peak
    return np.int16(samples * 32767)


def text_to_speech(text, voice_name, output_folder):
    """Synthesize ``text`` with the chosen voice and save it as a WAV file.

    Args:
        text: The text to convert to speech.
        voice_name: One of ``VOICE_NAMES``. Its first character is passed to
            ``generate`` as the ``lang`` argument.
        output_folder: Directory the WAV file is written to; it is created
            if it does not exist.

    Returns:
        A ``(wav_path, status_message)`` tuple. On a validation failure
        (blank text, blank folder, unknown voice, missing voicepack, empty
        audio) ``wav_path`` is ``None`` and the message explains why.
    """
    # Validate user input first so obviously bad requests never reach the model.
    if not text or not text.strip():
        return None, "No text provided."
    if not output_folder or not output_folder.strip():
        return None, "No output folder provided."
    if voice_name not in VOICE_NAMES:
        return None, "Invalid voice name."

    voicepack_path = f'voices/{voice_name}.pt'
    if not os.path.exists(voicepack_path):
        return None, f"Voicepack '{voice_name}' not found."

    # map_location keeps deserialisation on the CPU even if the voicepack was
    # saved from another device; weights_only restricts what is unpickled.
    voicepack = torch.load(voicepack_path, map_location='cpu', weights_only=True)
    print(f'Loaded voice: {voice_name}')

    # generate() returns a pair; only the audio samples are needed here.
    audio_data, _ = generate(MODEL, text, voicepack, lang=voice_name[0])

    samples = np.asarray(audio_data)
    if samples.size == 0:
        return None, "No audio was generated."
    scaled_audio = _to_int16_pcm(samples)

    os.makedirs(output_folder, exist_ok=True)

    # The random suffix keeps repeated requests from overwriting each other.
    wav_name = f'{_filename_stem(text)}-{shortuuid.uuid()}.wav'
    wav_path = os.path.join(output_folder, wav_name)
    write(wav_path, 24000, scaled_audio)

    return wav_path, f"Audio saved at: {wav_path}"
|
|
|
|
|
# Assemble the Gradio UI: one row holding an input column and a result column.
with gr.Blocks(theme='gradio/soft') as app:
    gr.Markdown(
        """
        <h1 align="center">Kokoro-82M TTS Engine</h1>
        <h4 align="left">A TTS engine with only 82M parameters. Enter the Text, voice and output folder and click generate to generate audio</h4>
        """
    )

    with gr.Row():
        # Input column: text, voice and destination folder.
        with gr.Column():
            text_input = gr.Textbox(label="Text to Convert")
            voice_selector = gr.Dropdown(choices=VOICE_NAMES, label="Select Voice")
            output_folder_input = gr.Textbox(label="Output Folder", value="./outputs")
            submit_button = gr.Button("Generate")

        # Result column: audio player plus a read-only status line.
        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio", type="filepath")
            status_output = gr.Textbox(label="Status", interactive=False)

    def process_text_to_speech(text, voice_name, output_folder):
        """Click handler: forward the form values to ``text_to_speech``.

        Returns the ``(wav_path, status_message)`` pair, which is mapped onto
        the audio player and the status box respectively.
        """
        return text_to_speech(text, voice_name, output_folder)

    submit_button.click(
        fn=process_text_to_speech,
        inputs=[text_input, voice_selector, output_folder_input],
        outputs=[audio_output, status_output],
    )


# Start the web server only when the file is run as a script.
if __name__ == "__main__":
    app.launch()