Spaces:

hivecorp
/

kkr2

Runtime error

File size: 1,895 Bytes

454da09
 
 
 
fc1fe4c
897a296
 
 
 
 
 
 
 
fc1fe4c
454da09
 
 
 
 
 
 
 
8a61d62
 
454da09
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fc1fe4c
454da09
fc1fe4c
454da09
8a61d62
454da09
 
8a61d62
454da09
 
 
fc1fe4c
 
454da09

import numpy as np
import onnxruntime as ort
import torch
import scipy.io.wavfile as wav
import gradio as gr
from huggingface_hub import hf_hub_download  # Add this import

# Download the ONNX model from Hugging Face Hub.
# The model card for this repo loads the weights from the "onnx/" sub-folder
# (onnx/model.onnx, onnx/model_fp16.onnx, ...); asking for a root-level
# "model.onnx" makes hf_hub_download fail before the app can start.
# NOTE(review): confirm the path against the repo's file listing if the
# repository layout changes.
model_path = hf_hub_download(
    repo_id="onnx-community/Kokoro-82M-ONNX",
    filename="onnx/model.onnx",
    cache_dir=".",  # keep the downloaded snapshot next to the app
)

# Load the ONNX model once at import time; the session is reused per request
ort_session = ort.InferenceSession(model_path)

# Speaker choices offered in the UI, mapped to their embedding entry.
# The values are placeholder strings: swap in real speaker IDs or embedding
# data before relying on them.
speaker_options = {
    f"Speaker {index}": f"spk_{index}_embedding"
    for index in (1, 2, 3)
}

# Function to generate speech
def generate_speech(text, speaker):
    """Synthesize ``text`` with the chosen speaker and return a WAV file path.

    Args:
        text: Text to synthesize; must contain non-whitespace characters.
        speaker: A key of the module-level ``speaker_options`` mapping.

    Returns:
        Path of a newly created temporary ``.wav`` file containing the first
        model output, squeezed of singleton dimensions.

    Raises:
        ValueError: If ``text`` is empty or blank, or if the selected
            speaker's entry in ``speaker_options`` cannot be converted to a
            float32 array (e.g. it is still a placeholder string).
        KeyError: If ``speaker`` is not a key of ``speaker_options``.
    """
    import tempfile

    # Reject blank input up front instead of sending it to the model.
    if not text or not text.strip():
        raise ValueError("Input text must not be empty.")

    # Kokoro's model card writes its output at 24 kHz; writing it at 22.05 kHz
    # would play the audio back slowed down and pitch-shifted.
    sample_rate = 24000

    # Preprocess the input text and speaker embedding
    input_text = np.array([text], dtype=np.str_)
    embedding = speaker_options[speaker]
    try:
        speaker_embedding = np.array([embedding], dtype=np.float32)
    except (TypeError, ValueError) as err:
        # Surface a configuration problem clearly rather than NumPy's
        # "could not convert string to float" message.
        raise ValueError(
            f"Speaker {speaker!r} has no numeric embedding configured; "
            "replace the placeholder in speaker_options with real "
            "embedding values."
        ) from err

    # Run the ONNX model
    # NOTE(review): the Kokoro-82M-ONNX model card feeds inputs named
    # "input_ids", "style" and "speed". The names below are unchanged from
    # the original code; verify them against ort_session.get_inputs().
    ort_inputs = {
        "text": input_text,
        "speaker_embedding": speaker_embedding,
    }
    ort_outputs = ort_session.run(None, ort_inputs)

    # Postprocess the output (assuming the output is a waveform)
    waveform = ort_outputs[0].squeeze()

    # Write to a unique temporary file so concurrent requests do not
    # overwrite each other's audio (a fixed "output.wav" is shared state).
    # The file is kept on disk (delete=False) so the caller can serve it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_file = tmp.name
    wav.write(output_file, sample_rate, waveform)

    return output_file

# Gradio interface
def tts_app(text, speaker):
    """Gradio callback: pass both inputs to ``generate_speech`` and return its result."""
    return generate_speech(text, speaker)

# Create the Gradio app: build the widgets first (inputs, then output),
# then wire them to the callback.
_text_input = gr.Textbox(label="Input Text")
_speaker_input = gr.Dropdown(choices=list(speaker_options), label="Speaker")
_audio_output = gr.Audio(label="Generated Speech", type="filepath")

iface = gr.Interface(
    fn=tts_app,
    inputs=[_text_input, _speaker_input],
    outputs=_audio_output,
    title="Text-to-Speech with Kokoro-82M-ONNX",
    description="Generate speech from text using the Kokoro-82M-ONNX model with multiple speaker options.",
)

# Launch the app
iface.launch()