Spaces:

gsarti
/

local-tts

Running

File size: 4,464 Bytes

ecc3537

import os
import spaces
import tempfile
import soundfile as sf
import requests
from markdown import Markdown
from io import StringIO

import gradio as gr
from kokoro_onnx import Kokoro
from markitdown import MarkItDown

# Shared MarkItDown converter used to turn fetched pages / uploaded files into Markdown.
md = MarkItDown()

# Kokoro ONNX engine, created once at import time; the model and voice
# files are given as relative paths.
kokoro = Kokoro("kokoro-v0_19.onnx", "voices.json")

# Voice identifiers offered in the UI, keyed by language code. The first
# entry of each list is the one the UI preselects for that language.
voices = {
    "en-us": [
        "af",
        "af_bella",
        "af_nicole",
        "af_sarah",
        "af_sky",
        "am_adam",
        "am_michael",
    ],
    "en-gb": [
        "bf_emma",
        "bf_isabella",
        "bm_george",
        "bm_lewis",
    ],
}

def _write_element_text(element, stream):
    """Write the text of *element* and its descendants to *stream* in document order."""
    if element.text:
        stream.write(element.text)
    for sub in element:
        _write_element_text(sub, stream)
    if element.tail:
        stream.write(element.tail)


def unmark_element(element, stream=None):
    """Serialize an element tree to plain text, dropping all markup.

    The element's own text, the text of every descendant and each element's
    tail text (including the root's) are concatenated in document order.

    Args:
        element: An ElementTree-style element (has ``text``, ``tail`` and is
            iterable over its children).
        stream: Optional text buffer to append to. A new ``StringIO`` is
            created when omitted.

    Returns:
        The full contents of the buffer after writing, i.e. anything already
        present in a caller-supplied ``stream`` followed by the element text.
    """
    if stream is None:
        stream = StringIO()
    # The recursion lives in the helper so the buffer is read back only once,
    # instead of being copied out (and discarded) at every nested element.
    _write_element_text(element, stream)
    return stream.getvalue()


# patching Markdown
# Register `unmark_element` as an extra output format named "plain", then
# build one shared converter that uses it. The registration must come first:
# the constructor below looks the format up by that name.
Markdown.output_formats["plain"] = unmark_element
__md = Markdown(output_format="plain")
# NOTE(review): presumably this stops the converter from trying to strip a
# wrapping top-level tag that plain-text output never contains — confirm
# against the Python-Markdown version in use.
__md.stripTopLevelTags = False


def markdown2text(text):
    """Convert Markdown source to plain text using the shared converter.

    Args:
        text: Markdown source string.

    Returns:
        The document text with Markdown markup removed.
    """
    # The converter instance is shared across calls and keeps per-document
    # state (e.g. link reference definitions); reset it so nothing from a
    # previously converted document leaks into this one.
    __md.reset()
    return __md.convert(text)


@spaces.GPU
def text_to_speech(text, voice, speed, lang):
    """Synthesize *text* with Kokoro and save it as a WAV file.

    Args:
        text: Plain text to speak.
        voice: Kokoro voice identifier.
        speed: Speech speed; converted to ``float`` before use.
        lang: Language code passed to Kokoro.

    Returns:
        Path to ``output.wav`` inside a freshly created temporary directory.
        The file is intentionally left on disk so the caller can serve it.

    Raises:
        gr.Error: If synthesis or writing the file fails. Previously the
            error text was returned in place of a path, which the audio
            output component would then try to load as a file.
    """
    try:
        # Generate audio
        samples, sample_rate = kokoro.create(
            text,
            voice=voice,
            speed=float(speed),
            lang=lang,
        )

        # Each call gets its own directory, so files never collide.
        temp_dir = tempfile.mkdtemp()
        temp_path = os.path.join(temp_dir, "output.wav")

        # Save to temporary file
        sf.write(temp_path, samples, sample_rate)
    except Exception as e:
        # Surface the failure to the UI instead of handing a bogus path on.
        raise gr.Error(f"Error: {str(e)}") from e
    return temp_path


def create_temp_html_from_url(url: str, timeout: float = 30.0) -> str:
    """Download *url* and store the response body in a temporary HTML file.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the request indefinitely.

    Returns:
        Path to ``output.html`` inside a freshly created temporary directory.
        The caller is responsible for removing it.

    Raises:
        requests.HTTPError: On any failure while fetching or saving the page
            (the original exception is chained as the cause).
    """
    # NOTE(review): `url` comes straight from the user; this fetches whatever
    # address is given, including internal ones reachable from the host.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        html = response.text
        temp_dir = tempfile.mkdtemp()
        temp_path = os.path.join(temp_dir, "output.html")

        # Explicit UTF-8 so non-ASCII pages can be written regardless of the
        # platform's default encoding.
        with open(temp_path, "w", encoding="utf-8") as f:
            f.write(html)
    except Exception as e:
        # Kept broad on purpose: callers rely on a single exception type.
        raise requests.HTTPError(f"Error fetching URL: {str(e)}") from e
    return temp_path


def _remove_downloaded_page(path):
    """Best-effort removal of a downloaded page file and its temporary directory."""
    try:
        os.remove(path)
        # rmdir only removes an empty directory, so nothing else can be
        # deleted by accident.
        os.rmdir(os.path.dirname(path))
    except OSError:
        # Cleanup is opportunistic; a leftover temp file must not fail the request.
        pass


def process_input(input_type, url_input, file_input, text_input, voice, speed, lang):
    """Turn the selected input into Markdown and synthesized speech.

    Args:
        input_type: ``"URL"`` or ``"File"`` to convert a document; any other
            value uses ``text_input`` verbatim.
        url_input: Page address, used when ``input_type`` is ``"URL"``.
        file_input: Uploaded file, used when ``input_type`` is ``"File"``.
        text_input: Raw text, used for every other ``input_type``.
        voice: Voice identifier forwarded to ``text_to_speech``.
        speed: Speech speed forwarded to ``text_to_speech``.
        lang: Language code forwarded to ``text_to_speech``.

    Returns:
        A ``(markdown, audio_path)`` tuple: the Markdown (or raw text) that
        was read, and the path of the generated audio file.

    Raises:
        gr.Error: If the required URL or file is missing, or there is no
            text to speak.
    """
    if input_type in ("URL", "File"):
        from_url = input_type == "URL"
        if from_url:
            if not url_input or not url_input.strip():
                raise gr.Error("Please enter a URL.")
            filepath = create_temp_html_from_url(url_input.strip())
        else:
            if not file_input:
                raise gr.Error("Please upload a file.")
            filepath = file_input
        print(filepath)
        try:
            markdown = md.convert(filepath).text_content
        finally:
            # The downloaded copy is only needed for the conversion above;
            # uploaded files are left alone.
            if from_url:
                _remove_downloaded_page(filepath)
        text = markdown2text(markdown)
    else:
        markdown = text_input
        text = text_input
    if not text or not text.strip():
        raise gr.Error("No text to convert to speech.")
    audio_path = text_to_speech(text, voice, speed, lang)
    return markdown, audio_path


# Build the UI: input selection and voice settings on the left, the matching
# input widget on the right, outputs and the submit button below.
with gr.Blocks() as demo:
    gr.Markdown(
        "# Local TTS demo 🗣️ \nProvide a URL or upload a file to convert its content into speech using [Markitdown](https://github.com/microsoft/markitdown) and [Kokoro-ONNX](https://github.com/thewh1teagle/kokoro-onnx)."
    )

    with gr.Row():
        with gr.Column():
            # Start on "URL" so the selection agrees with the URL textbox,
            # which is the only input widget visible at load time. Without
            # an initial value, converting right away would ignore the URL.
            input_type = gr.Radio(["URL", "File", "Custom Text"], label="Input Type", value="URL")
            with gr.Row():
                speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed")
                lang = gr.Dropdown(choices=list(voices), label="Language", value="en-us")
                voice = gr.Dropdown(choices=voices[lang.value], label="Voice", value=voices[lang.value][0])
        with gr.Column():
            url_input = gr.Textbox(label="Enter URL")
            file_input = gr.File(label="Upload File", visible=False)
            text_input = gr.Textbox(label="Text", visible=False, lines=5, placeholder="Enter text here", show_label=False, interactive=True)

    def toggle_file_input(input_type):
        """Return visibility updates for the file, URL and text widgets (in that order)."""
        return gr.update(visible=(input_type == "File")), gr.update(
            visible=(input_type == "URL"),
        ), gr.update(visible=(input_type == "Custom Text"))

    def update_lang(lang):
        """Return a voice dropdown listing the voices of *lang*, with the first one selected."""
        return gr.Dropdown(choices=voices[lang], label="Voice", value=voices[lang][0])

    # Show only the widget that belongs to the chosen input type.
    input_type.change(toggle_file_input, input_type, [file_input, url_input, text_input])
    # Keep the voice list in sync with the chosen language.
    lang.change(update_lang, lang, [voice])

    with gr.Accordion("Markdown output", open=False):
        output_markdown = gr.Markdown("Parsed markdown will appear here", label="Parsed Text", show_copy_button=True)
    output_audio = gr.Audio(label="Generated Audio")
    submit_button = gr.Button("Convert")

    submit_button.click(
        process_input,
        inputs=[input_type, url_input, file_input, text_input, voice, speed, lang],
        outputs=[output_markdown, output_audio],
    )

demo.launch()