import numpy as np
import onnxruntime as ort
import scipy.io.wavfile as wav
import gradio as gr
from huggingface_hub import hf_hub_download
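
# Download the ONNX model weights from the Hugging Face Hub; cache_dir="."
# keeps the files under the current directory instead of the default HF cache.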
model_path = hf_hub_download(
    repo_id="onnx-community/Kokoro-82M-ONNX",
    filename="model.onnx",
    cache_dir=".",
)
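
# Build an ONNX Runtime inference session over the downloaded graph
# (defaults to the CPU execution provider).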
ort_session = ort.InferenceSession(model_path)
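
# NOTE: placeholder voices. The original mapping used plain strings, which
# cannot be cast to float32. The zero vectors below (the 256-dim size is an
# assumption) only keep the pipeline runnable end to end; replace them with
# real speaker embeddings for the model.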
speaker_options = {
    "Speaker 1": np.zeros(256, dtype=np.float32),
    "Speaker 2": np.zeros(256, dtype=np.float32),
    "Speaker 3": np.zeros(256, dtype=np.float32),
}


def generate_speech(text, speaker):
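    # ONNX Runtime accepts string tensors as NumPy str_/object arrays. Be
    # aware that the published Kokoro ONNX export may expect phoneme token
    # IDs rather than raw text; this follows the original script's assumption.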
    input_text = np.array([text], dtype=np.str_)
    speaker_embedding = np.array([speaker_options[speaker]], dtype=np.float32)
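
    # The input names "text" and "speaker_embedding" are carried over from
    # the original script as assumptions; confirm the graph's real input
    # names with [i.name for i in ort_session.get_inputs()].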
    ort_inputs = {
        "text": input_text,
        "speaker_embedding": speaker_embedding,
    }
    ort_outputs = ort_session.run(None, ort_inputs)
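
    # The first output is assumed to hold the synthesized mono waveform;
    # squeeze() drops the batch dimension.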
    waveform = ort_outputs[0].squeeze()
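
    # Kokoro synthesizes audio at 24 kHz, not the 22050 Hz originally given.
    # Convert the float waveform to 16-bit PCM so the WAV plays anywhere.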
output_file = "output.wav" |
|
wav.write(output_file, 22050, waveform) |
|
|
|
return output_file |


def tts_app(text, speaker):
    audio_file = generate_speech(text, speaker)
    return audio_file
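

# Wire everything into a Gradio UI: a textbox and a speaker dropdown in,
# a playable WAV file out.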
iface = gr.Interface(
    fn=tts_app,
    inputs=[
        gr.Textbox(label="Input Text"),
        gr.Dropdown(choices=list(speaker_options.keys()), label="Speaker"),
    ],
    outputs=gr.Audio(label="Generated Speech", type="filepath"),
    title="Text-to-Speech with Kokoro-82M-ONNX",
    description="Generate speech from text using the Kokoro-82M-ONNX model with multiple speaker options.",
)

iface.launch()