File size: 5,308 Bytes
ecc3537
2efd326
ecc3537
 
2efd326
 
 
ecc3537
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2efd326
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ecc3537
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2efd326
ecc3537
 
 
 
 
 
 
 
 
2efd326
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ecc3537
 
 
 
2efd326
 
ecc3537
 
 
 
 
 
 
 
 
 
2efd326
ecc3537
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2efd326
ecc3537
2efd326
ecc3537
 
 
2efd326
 
 
 
 
 
 
 
 
 
 
ecc3537
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import os
import io
import spaces
import tempfile
#import soundfile as sf
import numpy as np
from pydub import AudioSegment
import requests
from markdown import Markdown
from io import StringIO

import gradio as gr
from kokoro_onnx import Kokoro
from markitdown import MarkItDown

# Document-to-markdown converter and the Kokoro ONNX TTS engine, both
# instantiated once at startup and shared by all requests.
md = MarkItDown()
kokoro = Kokoro("kokoro-v0_19.onnx", "voices.json")
# Available voice ids grouped by language code; the dict keys populate the
# language dropdown and the lists populate the voice dropdown (see UI below).
voices = {
    "en-us": ['af', 'af_bella', 'af_nicole', 'af_sarah', 'af_sky', 'am_adam', 'am_michael'],
    "en-gb": ['bf_emma', 'bf_isabella', 'bm_george', 'bm_lewis']
}

def numpy_to_mp3(audio_array, sampling_rate):
    """Encode a mono numpy audio buffer as MP3 bytes.

    Parameters
    ----------
    audio_array : np.ndarray
        1-D mono sample buffer; floating-point buffers are peak-normalized
        and converted to int16, integer buffers are used as-is.
    sampling_rate : int
        Sample rate of the buffer in Hz.

    Returns
    -------
    bytes
        The MP3-encoded audio (320 kbps for maximum quality).
    """
    # Normalize audio_array if it's floating-point
    if np.issubdtype(audio_array.dtype, np.floating):
        max_val = np.max(np.abs(audio_array)) if audio_array.size else 0.0
        if max_val > 0:
            # Normalize to the full 16-bit range.
            audio_array = (audio_array / max_val) * 32767
        # else: buffer is silent (all zeros) — dividing would produce
        # NaN/ZeroDivision; just cast the zeros straight through.
        audio_array = audio_array.astype(np.int16)

    # Create an audio segment from the numpy array (mono).
    audio_segment = AudioSegment(
        audio_array.tobytes(),
        frame_rate=sampling_rate,
        sample_width=audio_array.dtype.itemsize,
        channels=1
    )

    # Export the audio segment to MP3 bytes - use a high bitrate to maximise quality
    mp3_io = io.BytesIO()
    audio_segment.export(mp3_io, format="mp3", bitrate="320k")

    # Get the MP3 bytes
    mp3_bytes = mp3_io.getvalue()
    mp3_io.close()

    return mp3_bytes

def unmark_element(element, stream=None):
    """Recursively collect the plain text of an ElementTree element.

    Depth-first walk over *element*: writes its leading text, every
    child's rendered text, and each trailing tail into *stream*
    (created on first call when omitted). Returns the accumulated
    string.
    """
    buffer = StringIO() if stream is None else stream
    if element.text:
        buffer.write(element.text)
    for child in element:
        unmark_element(child, buffer)
    if element.tail:
        buffer.write(element.tail)
    return buffer.getvalue()


# Patching Markdown: register a "plain" output format whose serializer is
# unmark_element, i.e. the parsed ElementTree is rendered back to bare text
# with all markdown formatting stripped.
Markdown.output_formats["plain"] = unmark_element
__md = Markdown(output_format="plain")
# Don't strip/re-wrap top-level tags — unmark_element handles the whole tree.
__md.stripTopLevelTags = False


def markdown2text(text):
    # Convert markdown source to plain text via the "plain"-format renderer.
    return __md.convert(text)


def create_temp_html_from_url(url: str) -> str:
    """Download *url* and save its HTML body to a temporary file.

    Returns the path of the saved ``output.html`` inside a fresh temp
    directory (the directory is not cleaned up here; the caller only
    needs the path for a one-shot conversion).

    Raises
    ------
    requests.HTTPError
        If the download or the write fails for any reason; the original
        exception is chained as the cause.
    """
    try:
        # Bound the request so an unresponsive server cannot hang the UI.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        html = response.text
        temp_dir = tempfile.mkdtemp()
        temp_path = os.path.join(temp_dir, "output.html")
        # Write explicitly as UTF-8 — the platform default encoding can
        # fail on non-ASCII pages (e.g. cp1252 on Windows).
        with open(temp_path, "w", encoding="utf-8") as f:
            f.write(html)
    except Exception as e:
        raise requests.HTTPError(f"Error fetching URL: {str(e)}") from e
    return temp_path


def parse(input_type, url_input, file_input, text_input):
    """Resolve the selected input source into markdown text.

    URL inputs are downloaded to a temp file first, file inputs are
    converted directly, and custom text passes through unchanged.
    """
    if input_type not in ("URL", "File"):
        return text_input
    if input_type == "URL":
        source_path = create_temp_html_from_url(url_input)
    else:
        source_path = file_input
    print(source_path)  # trace of the file handed to MarkItDown
    return md.convert(source_path).text_content


def clean(output_markdown):
    """Strip markdown formatting, returning plain text suitable for TTS."""
    plain_text = markdown2text(output_markdown)
    return plain_text


@spaces.GPU
async def text_to_speech(output_text, voice, speed, lang):
    """Stream MP3 chunks synthesized from *output_text* with Kokoro.

    Yields each synthesized segment as MP3 bytes as soon as the model
    produces it, so playback can start before the full text is rendered.
    """
    playback_speed = float(speed)
    audio_stream = kokoro.create_stream(
        output_text,
        voice=voice,
        speed=playback_speed,
        lang=lang
    )
    async for chunk, chunk_rate in audio_stream:
        yield numpy_to_mp3(chunk, sampling_rate=chunk_rate)


with gr.Blocks() as demo:
    # Page header linking the two libraries driving the pipeline.
    gr.Markdown(
        "# Stream Local TTS with Kokoro-82M 🗣️\n"
        "Provide a URL or upload a file to convert its content into speech using [Markitdown](https://github.com/microsoft/markitdown) and [Kokoro-ONNX](https://github.com/thewh1teagle/kokoro-onnx)."
    )

    with gr.Row():
        with gr.Column():
            # Source selector plus synthesis controls.
            input_type = gr.Radio(["URL", "File", "Custom Text"], label="Input Type")
            with gr.Row():
                speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed")
                # NOTE(review): `choices` is passed a dict_keys view; gradio
                # appears to accept it, but list(voices) is the usual form.
                lang = gr.Dropdown(choices=voices.keys(), label="Language", value="en-us")
                voice = gr.Dropdown(choices=voices[lang.value], label="Voice", value=voices[lang.value][0])
        with gr.Column():
            # Exactly one of these three widgets is visible at a time,
            # toggled by toggle_file_input below.
            url_input = gr.Textbox(label="Enter URL", lines=1)
            file_input = gr.File(label="Upload File", visible=False)
            text_input = gr.Textbox(label="Text", visible=False, lines=5, placeholder="Enter text here", show_label=False, interactive=True)

    def toggle_file_input(input_type):
        # Show only the widget matching the chosen input type
        # (order of returned updates: file_input, url_input, text_input).
        return gr.update(visible=(input_type == "File")), gr.update(
            visible=(input_type == "URL"),
        ), gr.update(visible=(input_type == "Custom Text"))

    def update_lang(lang):
        # Repopulate the voice dropdown when the language changes,
        # defaulting to the first voice of the newly selected language.
        return gr.Dropdown(choices=voices[lang], label="Voice", value=voices[lang][0])

    input_type.change(toggle_file_input, input_type, [file_input, url_input, text_input])
    lang.change(update_lang, lang, [voice])

    with gr.Accordion("Markdown output", open=False):
        # Hidden textbox carries the cleaned plain text between pipeline steps.
        output_text = gr.Textbox(visible=False)
        output_markdown = gr.Markdown("Parsed markdown will appear here", label="Parsed Text", show_copy_button=True)
    output_audio = gr.Audio(label="Generated Audio", streaming=True, autoplay=True, loop=False)
    submit_button = gr.Button("Convert")

    # Pipeline: parse source -> markdown -> plain text -> streamed MP3 audio.
    # Each .success() step only runs if the previous one did not raise.
    submit_button.click(
        parse,
        inputs=[input_type, url_input, file_input, text_input],
        outputs=[output_markdown],
    ).success(
        clean,
        inputs=[output_markdown],
        outputs=[output_text],
    ).success(
        text_to_speech,
        inputs=[output_text, voice, speed, lang],
        outputs=[output_audio],
    )

demo.launch()