File size: 4,464 Bytes
ecc3537
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import os
import spaces
import tempfile
import soundfile as sf
import requests
from markdown import Markdown
from io import StringIO

import gradio as gr
from kokoro_onnx import Kokoro
from markitdown import MarkItDown

# Shared MarkItDown converter; process_input uses it to turn a file path
# into markdown text.
md = MarkItDown()
# Kokoro ONNX text-to-speech engine. Both paths are relative, so the model
# and voices files are presumably expected in the working directory.
kokoro = Kokoro("kokoro-v0_19.onnx", "voices.json")
# Voice identifiers offered in the UI, keyed by the language code that is
# passed to Kokoro alongside the chosen voice.
voices = {
    "en-us": ['af', 'af_bella', 'af_nicole', 'af_sarah', 'af_sky', 'am_adam', 'am_michael'],
    "en-gb": ['bf_emma', 'bf_isabella', 'bm_george', 'bm_lewis']
}

def unmark_element(element, stream=None):
    """Flatten an element tree node into plain text.

    Writes the node's own text, then every child (recursively, in document
    order), then the node's tail text into ``stream``.

    Args:
        element: An ElementTree-style node exposing ``text``, ``tail`` and
            iteration over its children.
        stream: Text buffer to append to. A new ``StringIO`` is created when
            omitted; recursive calls share the caller's buffer.

    Returns:
        The full contents of the buffer after this node has been written.
    """
    buffer = StringIO() if stream is None else stream

    # ``text``/``tail`` may be None; writing "" leaves the buffer unchanged.
    buffer.write(element.text or "")
    for child in element:
        unmark_element(child, buffer)
    buffer.write(element.tail or "")

    return buffer.getvalue()


# Patch Python-Markdown: register a "plain" output format whose serializer is
# unmark_element, so converting markdown yields text without HTML tags.
# This must happen before the Markdown instance below is created.
Markdown.output_formats["plain"] = unmark_element
# Module-level converter that uses the "plain" serializer registered above.
__md = Markdown(output_format="plain")
# NOTE(review): presumably disabled because the plain-text output has no
# wrapping top-level tags for Markdown to strip — confirm.
__md.stripTopLevelTags = False


def markdown2text(text):
    """Convert markdown source to plain text.

    Uses the module-level ``__md`` converter, which is configured with the
    custom "plain" output format.

    Args:
        text: Markdown source string.

    Returns:
        The converted plain-text string.
    """
    # The Markdown instance is shared across requests. Reset it first so
    # state collected from a previous document (e.g. reference-style link
    # definitions) cannot leak into this conversion.
    __md.reset()
    return __md.convert(text)


@spaces.GPU
def text_to_speech(text, voice, speed, lang):
    """Synthesize ``text`` with Kokoro and save the audio as a WAV file.

    Args:
        text: Text to speak.
        voice: Voice identifier passed to Kokoro.
        speed: Speech speed; converted with ``float()`` before use.
        lang: Language code passed to Kokoro.

    Returns:
        Path to ``output.wav`` inside a newly created temporary directory.
        The directory is intentionally not removed here because the caller
        still needs the file after this function returns.

    Raises:
        gr.Error: If synthesis or writing the file fails. Previously the
            error text was returned as a string, which the caller then
            passed to the audio component as if it were a file path.
    """
    try:
        # Generate audio
        samples, sample_rate = kokoro.create(
            text,
            voice=voice,
            speed=float(speed),
            lang=lang,
        )

        # Create temporary file location
        temp_dir = tempfile.mkdtemp()
        temp_path = os.path.join(temp_dir, "output.wav")

        # Save to temporary file
        sf.write(temp_path, samples, sample_rate)
    except Exception as e:
        # Surface the failure in the UI instead of returning a fake path.
        raise gr.Error(f"Error: {str(e)}") from e
    return temp_path


def create_temp_html_from_url(url: str, timeout: float = 30.0) -> str:
    """Download ``url`` and store the response body in a temporary HTML file.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        Path to ``output.html`` inside a newly created temporary directory.

    Raises:
        requests.HTTPError: If the request fails, the server returns an
            error status, or the file cannot be written. The original
            exception is chained as ``__cause__``.
    """
    # NOTE(review): the URL comes straight from user input; consider
    # restricting schemes/hosts if this runs where internal services are
    # reachable.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        html = response.text
        temp_dir = tempfile.mkdtemp()
        temp_path = os.path.join(temp_dir, "output.html")

        # Write as UTF-8 explicitly: the platform default encoding may be
        # unable to represent characters found in the page.
        with open(temp_path, "w", encoding="utf-8") as f:
            f.write(html)
    except Exception as e:
        raise requests.HTTPError(f"Error fetching URL: {str(e)}") from e
    return temp_path


def process_input(input_type, url_input, file_input, text_input, voice, speed, lang):
    """Turn the selected input into markdown and synthesized speech.

    For "URL" the page is downloaded to a temporary HTML file; for "File"
    the uploaded file is used directly. Either is converted to markdown with
    MarkItDown and then flattened to plain text for the TTS engine. Any other
    input type uses ``text_input`` verbatim as both markdown and speech text.

    Args:
        input_type: Selected source: "URL", "File" or "Custom Text".
        url_input: URL to fetch when ``input_type`` is "URL".
        file_input: Uploaded file when ``input_type`` is "File".
        text_input: Raw text used for any other input type.
        voice: Voice identifier forwarded to ``text_to_speech``.
        speed: Speech speed forwarded to ``text_to_speech``.
        lang: Language code forwarded to ``text_to_speech``.

    Returns:
        A ``(markdown, audio_path)`` tuple.

    Raises:
        gr.Error: If no input type is selected or the input for the selected
            type is missing or blank.
    """
    # Nothing selected: fail clearly instead of silently falling through to
    # the text branch with whatever the text box happens to contain.
    if input_type is None:
        raise gr.Error("Please select an input type.")

    if input_type in ["URL", "File"]:
        if input_type == "URL":
            if not url_input or not url_input.strip():
                raise gr.Error("Please enter a URL.")
            filepath = create_temp_html_from_url(url_input.strip())
        else:
            if not file_input:
                raise gr.Error("Please upload a file.")
            filepath = file_input
        print(filepath)
        markdown = md.convert(filepath).text_content
        text = markdown2text(markdown)
    else:
        if not text_input or not text_input.strip():
            raise gr.Error("Please enter some text.")
        markdown = text_input
        text = text_input
    audio_path = text_to_speech(text, voice, speed, lang)
    return markdown, audio_path


# Build the Gradio interface: input selection and voice settings on the left,
# the input widgets on the right, outputs and the Convert button below.
with gr.Blocks() as demo:
    gr.Markdown(
        "# Local TTS demo 🗣️ \nProvide a URL or upload a file to convert its content into speech using [Markitdown](https://github.com/microsoft/markitdown) and [Kokoro-ONNX](https://github.com/thewh1teagle/kokoro-onnx)."
    )

    with gr.Row():
        with gr.Column():
            # Default to "URL" so the selection matches the URL textbox, which
            # is the only input widget visible on load. Without a default the
            # handler receives input_type=None on the first click.
            input_type = gr.Radio(["URL", "File", "Custom Text"], label="Input Type", value="URL")
            with gr.Row():
                speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed")
                # Pass a concrete list rather than a dict view.
                lang = gr.Dropdown(choices=list(voices), label="Language", value="en-us")
                voice = gr.Dropdown(choices=voices[lang.value], label="Voice", value=voices[lang.value][0])
        with gr.Column():
            url_input = gr.Textbox(label="Enter URL")
            file_input = gr.File(label="Upload File", visible=False)
            text_input = gr.Textbox(label="Text", visible=False, lines=5, placeholder="Enter text here", show_label=False, interactive=True)

    def toggle_file_input(input_type):
        """Show only the input widget matching the selected input type.

        Returns updates for (file_input, url_input, text_input), in that order.
        """
        return gr.update(visible=(input_type == "File")), gr.update(
            visible=(input_type == "URL"),
        ), gr.update(visible=(input_type == "Custom Text"))

    def update_lang(lang):
        """Rebuild the voice dropdown with the voices of the chosen language."""
        return gr.Dropdown(choices=voices[lang], label="Voice", value=voices[lang][0])

    input_type.change(toggle_file_input, input_type, [file_input, url_input, text_input])
    lang.change(update_lang, lang, [voice])

    with gr.Accordion("Markdown output", open=False):
        output_markdown = gr.Markdown("Parsed markdown will appear here", label="Parsed Text", show_copy_button=True)
    output_audio = gr.Audio(label="Generated Audio")
    submit_button = gr.Button("Convert")

    submit_button.click(
        process_input,
        inputs=[input_type, url_input, file_input, text_input, voice, speed, lang],
        outputs=[output_markdown, output_audio],
    )

demo.launch()