File size: 2,980 Bytes
83a55ae |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
import gradio as gr
from kokoro import generate
from models import build_model
from scipy.io.wavfile import write
from pydub import AudioSegment
import torch
import numpy as np
import os
import shortuuid
# --- One-time model bootstrap (runs at import) -----------------------------
# The model is loaded exactly once per process because module-level code
# executes a single time on first import; no sentinel/None check is needed.
MODEL_PATH = 'kokoro-v0_19.pth'
if not os.path.exists(MODEL_PATH):
    raise FileNotFoundError(f"Error: Model file '{MODEL_PATH}' does not exist.")
MODEL = build_model(MODEL_PATH, 'cpu')
print("\n-------------\nModel loaded.")

# Valid voicepack identifiers; the leading letter encodes the language
# (used as `lang=voice_name[0]` when generating).
VOICE_NAMES = [
    'af',  # Default voice is a 50-50 mix of Bella & Sarah
    'af_bella', 'af_sarah', 'am_adam', 'am_michael',
    'bf_emma', 'bf_isabella', 'bm_george', 'bm_lewis',
    'af_nicole', 'af_sky',
]
def text_to_speech(text, voice_name, output_folder):
    """Synthesize `text` with the given voicepack and save it as a WAV file.

    Parameters:
        text: the input text to convert to speech.
        voice_name: one of VOICE_NAMES; its first letter selects the language.
        output_folder: directory to write the WAV into (created if missing).

    Returns:
        (wav_path, status_message) on success, or (None, error_message) when
        the voice name is unknown or the voicepack file is missing.
    """
    if voice_name not in VOICE_NAMES:
        return None, "Invalid voice name."

    # Load the selected voicepack (CPU inference only).
    voicepack_path = f'voices/{voice_name}.pt'
    if not os.path.exists(voicepack_path):
        return None, f"Voicepack '{voice_name}' not found."
    voicepack = torch.load(voicepack_path, weights_only=True).to('cpu')
    print(f'Loaded voice: {voice_name}')

    # Generate audio; the voice prefix letter doubles as the language code.
    audio_data, out_ps = generate(MODEL, text, voicepack, lang=voice_name[0])

    # Peak-normalize to [-1, 1] and scale to 16-bit PCM. Guard against a
    # silent or empty result: the original `audio / max(abs(audio))` divides
    # by zero for all-zero output.
    audio_data = np.asarray(audio_data, dtype=np.float32)
    peak = float(np.max(np.abs(audio_data))) if audio_data.size else 0.0
    if peak > 0.0:
        audio_data = audio_data / peak
    scaled_audio = np.int16(audio_data * 32767)

    # exist_ok avoids the check-then-create race of `if not exists: makedirs`.
    os.makedirs(output_folder, exist_ok=True)

    # Name the file after the first word of the text plus a short unique id;
    # fall back to a generic prefix so empty input doesn't break the name.
    first_word = text.split(" ")[0] if text else "audio"
    wav_path = os.path.join(output_folder, f'{first_word}-{shortuuid.uuid()}.wav')
    write(wav_path, 24000, scaled_audio)  # model output sample rate is 24 kHz
    return wav_path, f"Audio saved at: {wav_path}"
# --- Gradio UI -------------------------------------------------------------
# Declarative builder: component creation order inside each container defines
# the on-screen layout, so the statement order below is load-bearing.
with gr.Blocks(theme='gradio/soft') as app:
    # Page header rendered as raw HTML inside Markdown.
    gr.Markdown(
        """
<h1 align="center">Kokoro-82M TTS Engine</h1>
<h4 align="left">A TTS engine with only 82M parameters. Enter the Text, voice and output folder and click generate to generate audio</h4>
"""
    )
    with gr.Row():
        with gr.Column():
            # Left column: input controls.
            text_input = gr.Textbox(label="Text to Convert")
            voice_selector = gr.Dropdown(choices=VOICE_NAMES, label="Select Voice")
            output_folder_input = gr.Textbox(label="Output Folder", value="./outputs")
            submit_button = gr.Button("Generate")
        with gr.Column():
            # Right column: playback widget (fed a file path) and status line.
            audio_output = gr.Audio(label="Generated Audio", type="filepath")
            status_output = gr.Textbox(label="Status", interactive=False)

    def process_text_to_speech(text, voice_name, output_folder):
        # Thin click-handler wrapper around text_to_speech; returns the
        # (audio file path, status message) pair the two outputs expect.
        wav_path, status_message = text_to_speech(text, voice_name, output_folder)
        return wav_path, status_message

    # Wire the button: inputs map positionally to the handler's parameters,
    # outputs to its returned tuple.
    submit_button.click(
        fn=process_text_to_speech,
        inputs=[text_input, voice_selector, output_folder_input],
        outputs=[audio_output, status_output]
    )

if __name__ == "__main__":
    app.launch()