# kkr2 / app.py — Gradio text-to-speech demo for the Kokoro-82M-ONNX model
import numpy as np
import onnxruntime as ort
import scipy.io.wavfile as wav
import gradio as gr

# Load the ONNX model
model_path = "Kokoro-82M-ONNX/model.onnx"
ort_session = ort.InferenceSession(model_path)
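
# The exported model's input names and shapes may differ from the names assumed
# below ("text" and "speaker_embedding"); printing them at startup makes it easy
# to adjust the inference call to whatever the ONNX graph actually expects.
for model_input in ort_session.get_inputs():
    print(f"Model input: {model_input.name}, shape: {model_input.shape}")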
# Define speaker options. These zero vectors are placeholders standing in for real
# voice embeddings; replace them with the model's actual speaker/style embeddings
# (the 256-dimension size used here is an assumption).
EMBEDDING_DIM = 256  # assumed embedding size; adjust to match the model
speaker_options = {
    "Speaker 1": np.zeros(EMBEDDING_DIM, dtype=np.float32),
    "Speaker 2": np.zeros(EMBEDDING_DIM, dtype=np.float32),
    "Speaker 3": np.zeros(EMBEDDING_DIM, dtype=np.float32),
}

# Function to generate speech
def generate_speech(text, speaker):
    # Preprocess the input text and speaker embedding
    input_text = np.array([text], dtype=np.str_)
    speaker_embedding = np.array([speaker_options[speaker]], dtype=np.float32)

    # Run the ONNX model. The input names "text" and "speaker_embedding" are
    # assumptions; check ort_session.get_inputs() for the names the exported
    # model actually expects.
    ort_inputs = {
        "text": input_text,
        "speaker_embedding": speaker_embedding,
    }
    ort_outputs = ort_session.run(None, ort_inputs)

    # Postprocess the output (assuming the first output is the waveform)
    waveform = ort_outputs[0].squeeze()

    # Save the waveform as a WAV file
    output_file = "output.wav"
    wav.write(output_file, 22050, waveform)  # Adjust sample rate to match the model
    return output_file
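
# Note: writing to a fixed "output.wav" means concurrent requests overwrite each
# other's file. An alternative (assuming a recent Gradio version) is to return the
# tuple (sample_rate, waveform) and set type="numpy" on the gr.Audio output instead
# of type="filepath", which avoids the temporary file entirely.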

# Gradio interface callback
def tts_app(text, speaker):
    audio_file = generate_speech(text, speaker)
    return audio_file

# Create the Gradio app
iface = gr.Interface(
    fn=tts_app,
    inputs=[
        gr.Textbox(label="Input Text"),
        gr.Dropdown(choices=list(speaker_options.keys()), label="Speaker"),
    ],
    outputs=gr.Audio(label="Generated Speech", type="filepath"),
    title="Text-to-Speech with Kokoro-82M-ONNX",
    description="Generate speech from text using the Kokoro-82M-ONNX model with multiple speaker options.",
)

# Launch the app
iface.launch()