File size: 1,895 Bytes
454da09
 
 
 
fc1fe4c
897a296
 
 
 
 
 
 
 
fc1fe4c
454da09
 
 
 
 
 
 
 
8a61d62
 
454da09
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fc1fe4c
454da09
fc1fe4c
454da09
8a61d62
454da09
 
8a61d62
454da09
 
 
fc1fe4c
 
454da09
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import tempfile

import gradio as gr
import numpy as np
import onnxruntime as ort
import scipy.io.wavfile as wav
import torch
from huggingface_hub import hf_hub_download  # Add this import

# Download the ONNX model from Hugging Face Hub.
# hf_hub_download expects the path of the file relative to the repo root; per
# the model card, this repo keeps its exported weights in the "onnx/" folder,
# so a bare "model.onnx" does not resolve.
model_path = hf_hub_download(
    repo_id="onnx-community/Kokoro-82M-ONNX",
    filename="onnx/model.onnx",
    cache_dir=".",
)

# Load the ONNX model once at import time; the session is reused per request.
ort_session = ort.InferenceSession(model_path)

# Speaker options shown in the UI dropdown (display name -> speaker data).
# NOTE: the values below are placeholder strings, not real embeddings; they
# must be replaced with actual numeric speaker/style vectors before inference
# can succeed.
speaker_options = {
    "Speaker 1": "spk_1_embedding",
    "Speaker 2": "spk_2_embedding",
    "Speaker 3": "spk_3_embedding",
}

# Function to generate speech
def generate_speech(text: str, speaker: str, sample_rate: int = 24000) -> str:
    """Synthesize ``text`` with the selected speaker and return a WAV file path.

    Args:
        text: The text to synthesize. Must contain non-whitespace characters.
        speaker: A key of the module-level ``speaker_options`` mapping.
        sample_rate: Sample rate in Hz written to the WAV header. Defaults to
            24000, the output rate given on the Kokoro-82M model card.

    Returns:
        Path of a newly created ``.wav`` file containing the first model
        output. Each call gets its own file; the caller owns its cleanup.

    Raises:
        ValueError: If ``text`` is empty, or if the configured entry for
            ``speaker`` cannot be converted to a float32 array (which is the
            case for placeholder strings).
        KeyError: If ``speaker`` is not a key of ``speaker_options``.
    """
    if not text or not text.strip():
        raise ValueError("Input text must not be empty.")

    try:
        raw_embedding = speaker_options[speaker]
    except KeyError:
        raise KeyError(
            f"Unknown speaker {speaker!r}; expected one of {list(speaker_options)}"
        ) from None

    # Preprocess the input text and speaker embedding
    input_text = np.array([text], dtype=np.str_)
    try:
        speaker_embedding = np.array([raw_embedding], dtype=np.float32)
    except ValueError as err:
        raise ValueError(
            f"speaker_options[{speaker!r}] is not a numeric embedding; "
            "replace the placeholder with real speaker data."
        ) from err

    # Run the ONNX model.
    # NOTE(review): the input names below look like placeholders; the published
    # Kokoro ONNX export appears to expect token ids, a style vector and a
    # speed value instead. Confirm against ort_session.get_inputs().
    ort_inputs = {
        "text": input_text,
        "speaker_embedding": speaker_embedding,
    }
    ort_outputs = ort_session.run(None, ort_inputs)

    # Postprocess the output (assuming the first output is a waveform)
    waveform = ort_outputs[0].squeeze()

    # Write to a unique file per call: a single shared "output.wav" would be
    # overwritten by a later request while an earlier result is still in use.
    # The handle is closed before writing so the path can be reopened on every
    # platform; delete=False keeps the file for the caller.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_file = tmp.name
    wav.write(output_file, sample_rate, waveform)

    return output_file

# Gradio interface
def tts_app(text, speaker):
    """Gradio callback: pass the UI inputs to ``generate_speech`` and return its result."""
    return generate_speech(text, speaker)

# Create the Gradio app
# Input widgets: free-form text plus a speaker picker fed from speaker_options.
_text_input = gr.Textbox(label="Input Text")
_speaker_input = gr.Dropdown(choices=list(speaker_options.keys()), label="Speaker")

# Output widget: plays the audio file whose path the callback returns.
_audio_output = gr.Audio(label="Generated Speech", type="filepath")

iface = gr.Interface(
    fn=tts_app,
    inputs=[_text_input, _speaker_input],
    outputs=_audio_output,
    title="Text-to-Speech with Kokoro-82M-ONNX",
    description="Generate speech from text using the Kokoro-82M-ONNX model with multiple speaker options.",
)

# Start serving the UI.
iface.launch()