"""Gradio text-to-speech demo backed by the Kokoro-82M ONNX model.

On import the script downloads the ONNX model from the Hugging Face Hub and
opens an ONNX Runtime session. When executed as a script it also launches a
Gradio UI that turns text plus a speaker choice into a WAV file.
"""

import tempfile

import numpy as np
import onnxruntime as ort
import torch  # NOTE(review): unused in this file; kept in case other code relies on it.
import scipy.io.wavfile as wav
import gradio as gr
from huggingface_hub import hf_hub_download

# Output sample rate in Hz. The Kokoro-82M model card writes its audio at
# 24 kHz; writing at 22.05 kHz would play the speech back slowed and pitched
# down.
SAMPLE_RATE = 24000

# Download the ONNX model from the Hugging Face Hub (cached in the current
# working directory).
# NOTE(review): the hub repo appears to keep its weights under an "onnx/"
# sub-folder -- confirm that "model.onnx" resolves, otherwise use
# "onnx/model.onnx".
model_path = hf_hub_download(
    repo_id="onnx-community/Kokoro-82M-ONNX",
    filename="model.onnx",
    cache_dir=".",
)

# Load the ONNX model once; the session is reused for every request.
ort_session = ort.InferenceSession(model_path)

# Speaker name -> speaker embedding.
# The values below are placeholders and MUST be replaced with real numeric
# embeddings (sequences of floats); generate_speech() rejects anything that
# cannot be converted to float32.
speaker_options = {
    "Speaker 1": "spk_1_embedding",
    "Speaker 2": "spk_2_embedding",
    "Speaker 3": "spk_3_embedding",
}


def generate_speech(text: str, speaker: str) -> str:
    """Synthesize *text* with the given *speaker* and return a WAV file path.

    Args:
        text: The text to speak. Must contain at least one non-blank character.
        speaker: A key of ``speaker_options``.

    Returns:
        Path of a newly created temporary ``.wav`` file holding the waveform.
        The file is not deleted by this function.

    Raises:
        ValueError: If *text* is empty, *speaker* is unknown, or the configured
            embedding for *speaker* is not numeric.
    """
    if not text or not text.strip():
        raise ValueError("Input text must not be empty.")
    if speaker not in speaker_options:
        raise ValueError(f"Unknown speaker: {speaker!r}. Please select a speaker.")

    # Convert the embedding first so a misconfigured speaker fails fast with a
    # clear message instead of numpy's bare "could not convert string to float".
    try:
        speaker_embedding = np.array([speaker_options[speaker]], dtype=np.float32)
    except (TypeError, ValueError) as err:
        raise ValueError(
            f"Speaker embedding for {speaker!r} is not numeric; "
            "replace the placeholder in speaker_options with a real embedding."
        ) from err

    input_text = np.array([text], dtype=np.str_)

    # NOTE(review): these feed names must match the model's declared inputs
    # (see ort_session.get_inputs()). The published Kokoro ONNX export appears
    # to expect tokenized "input_ids", "style" and "speed" instead -- confirm.
    ort_inputs = {
        "text": input_text,
        "speaker_embedding": speaker_embedding,
    }
    ort_outputs = ort_session.run(None, ort_inputs)

    # The first output is assumed to be the waveform; drop singleton axes.
    waveform = ort_outputs[0].squeeze()

    # Use a unique file per call so concurrent requests cannot overwrite each
    # other's audio (a fixed "output.wav" is shared by every request).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_file = tmp.name
    wav.write(output_file, SAMPLE_RATE, waveform)

    return output_file


def tts_app(text: str, speaker: str) -> str:
    """Gradio callback: synthesize speech and return the audio file path.

    Validation problems reported by ``generate_speech`` are re-raised as
    ``gr.Error`` so the message is shown in the UI rather than a generic
    failure.
    """
    try:
        return generate_speech(text, speaker)
    except ValueError as err:
        raise gr.Error(str(err)) from err


# Create the Gradio app.
iface = gr.Interface(
    fn=tts_app,
    inputs=[
        gr.Textbox(label="Input Text"),
        gr.Dropdown(choices=list(speaker_options.keys()), label="Speaker"),
    ],
    outputs=gr.Audio(label="Generated Speech", type="filepath"),
    title="Text-to-Speech with Kokoro-82M-ONNX",
    description="Generate speech from text using the Kokoro-82M-ONNX model with multiple speaker options.",
)

# Launch only when run as a script so importing this module does not start a
# web server.
if __name__ == "__main__":
    iface.launch()