|
import os |
|
import spaces |
|
import tempfile |
|
import soundfile as sf |
|
import requests |
|
from markdown import Markdown |
|
from io import StringIO |
|
|
|
import gradio as gr |
|
from kokoro_onnx import Kokoro |
|
from markitdown import MarkItDown |
|
|
|
# Document converter; its convert() result exposes the extracted content
# through `.text_content`.
md = MarkItDown()

# Kokoro ONNX speech engine, built from the model file and the voices file
# named here (both resolved relative to the working directory).
kokoro = Kokoro("kokoro-v0_19.onnx", "voices.json")

# Voice identifiers offered in the UI, grouped by language code.
voices = {
    "en-us": [
        "af",
        "af_bella",
        "af_nicole",
        "af_sarah",
        "af_sky",
        "am_adam",
        "am_michael",
    ],
    "en-gb": [
        "bf_emma",
        "bf_isabella",
        "bm_george",
        "bm_lewis",
    ],
}
|
|
|
def unmark_element(element, stream=None):
    """Flatten an element tree into plain text.

    Writes the element's own text, then every child recursively (in document
    order), then the element's tail text into ``stream``.

    Args:
        element: Tree node exposing ``text``, ``tail`` and iteration over its
            children (the ElementTree element interface).
        stream: Text buffer to append to. A new ``StringIO`` is created when
            omitted; recursive calls share the caller's buffer.

    Returns:
        The full contents of the buffer after this element has been written.
    """
    sink = stream if stream is not None else StringIO()

    leading = element.text
    if leading:
        sink.write(leading)

    for child in element:
        unmark_element(child, sink)

    trailing = element.tail
    if trailing:
        sink.write(trailing)

    return sink.getvalue()
|
|
|
|
|
|
|
# Register `unmark_element` as an extra serializer named "plain" so a Markdown
# converter can emit bare text instead of markup.
Markdown.output_formats.update(plain=unmark_element)

# Module-wide converter that uses the "plain" serializer registered above.
__md = Markdown(output_format="plain")

# Plain-text output has no wrapper tag around it, so top-level tag stripping
# is switched off.
__md.stripTopLevelTags = False
|
|
|
|
|
def markdown2text(text):
    """Convert Markdown source into plain text.

    Args:
        text: Markdown-formatted string.

    Returns:
        The text content of the rendered document, without markup.
    """
    # The converter instance is shared across calls; reset() clears parser
    # state left over from the previous document (e.g. link reference
    # definitions) so one conversion cannot leak into the next.
    return __md.reset().convert(text)
|
|
|
|
|
@spaces.GPU
def text_to_speech(text, voice, speed, lang):
    """Synthesize ``text`` with Kokoro and save the audio as a WAV file.

    Args:
        text: Text to speak.
        voice: Kokoro voice identifier (one of the entries in ``voices``).
        speed: Speed multiplier; coerced to ``float`` before synthesis.
        lang: Language code forwarded to Kokoro.

    Returns:
        Path to ``output.wav`` inside a freshly created temporary directory.

    Raises:
        gr.Error: If synthesis or writing the audio file fails.
    """
    try:
        samples, sample_rate = kokoro.create(
            text,
            voice=voice,
            speed=float(speed),
            lang=lang,
        )

        temp_dir = tempfile.mkdtemp()
        temp_path = os.path.join(temp_dir, "output.wav")
        sf.write(temp_path, samples, sample_rate)
    except Exception as e:
        # The return value is wired to a gr.Audio output, which would treat a
        # returned "Error: ..." string as a file path. Raise a Gradio error
        # instead so the message itself is shown to the user.
        raise gr.Error(f"Error: {e}") from e
    return temp_path
|
|
|
|
|
def create_temp_html_from_url(url: str, timeout: float = 30.0) -> str:
    """Download ``url`` and store the response body in a temporary HTML file.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive
            host.

    Returns:
        Path to ``output.html`` inside a freshly created temporary directory.

    Raises:
        requests.HTTPError: If the request fails, the server answers with an
            error status, or the file cannot be written. The original
            exception is chained as the cause.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        html = response.text
        temp_dir = tempfile.mkdtemp()
        temp_path = os.path.join(temp_dir, "output.html")

        # Write as UTF-8 explicitly: the platform default encoding may be
        # unable to represent characters found in arbitrary web pages.
        with open(temp_path, "w", encoding="utf-8") as f:
            f.write(html)
    except Exception as e:
        raise requests.HTTPError(f"Error fetching URL: {str(e)}") from e
    return temp_path
|
|
|
|
|
def process_input(input_type, url_input, file_input, text_input, voice, speed, lang):
    """Turn the selected input into Markdown and synthesized speech.

    For ``"URL"`` the page is downloaded to a temporary HTML file; for
    ``"File"`` the uploaded file is used directly. Either is converted to
    Markdown and then reduced to plain text before synthesis. Any other
    ``input_type`` uses ``text_input`` verbatim for both outputs.

    Args:
        input_type: ``"URL"``, ``"File"`` or anything else (custom text).
        url_input: URL to fetch when ``input_type`` is ``"URL"``.
        file_input: Path of the uploaded file when ``input_type`` is ``"File"``.
        text_input: Raw text used for every other ``input_type``.
        voice: Voice identifier forwarded to ``text_to_speech``.
        speed: Speed multiplier forwarded to ``text_to_speech``.
        lang: Language code forwarded to ``text_to_speech``.

    Returns:
        Tuple ``(markdown, audio_path)``.

    Raises:
        gr.Error: If the selected input type has no URL / no file supplied.
    """
    if input_type in ("URL", "File"):
        if input_type == "URL":
            # Fail with a readable message instead of a wrapped requests error.
            if not url_input or not url_input.strip():
                raise gr.Error("Please enter a URL.")
            filepath = create_temp_html_from_url(url_input)
        else:
            # Without an upload the converter would be handed None.
            if not file_input:
                raise gr.Error("Please upload a file.")
            filepath = file_input
        markdown = md.convert(filepath).text_content
        text = markdown2text(markdown)
    else:
        markdown = text_input
        text = text_input

    audio_path = text_to_speech(text, voice, speed, lang)
    return markdown, audio_path
|
|
|
|
|
# Gradio UI: input selection and voice settings on top, outputs below.
with gr.Blocks() as demo:
    gr.Markdown(
        "# Local TTS demo 🗣️ \nProvide a URL or upload a file to convert its content into speech using [Markitdown](https://github.com/microsoft/markitdown) and [Kokoro-ONNX](https://github.com/thewh1teagle/kokoro-onnx)."
    )

    with gr.Row():
        with gr.Column():
            # Default to "URL" so the selection matches the URL textbox, which
            # is the only input visible on load. Without a default the first
            # submit would send input_type=None and take the custom-text path.
            input_type = gr.Radio(
                ["URL", "File", "Custom Text"],
                label="Input Type",
                value="URL",
            )
            with gr.Row():
                speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed")
                # list(...) hands Gradio a concrete list rather than a dict view.
                lang = gr.Dropdown(choices=list(voices), label="Language", value="en-us")
                voice = gr.Dropdown(choices=voices[lang.value], label="Voice", value=voices[lang.value][0])
        with gr.Column():
            url_input = gr.Textbox(label="Enter URL")
            file_input = gr.File(label="Upload File", visible=False)
            text_input = gr.Textbox(label="Text", visible=False, lines=5, placeholder="Enter text here", show_label=False, interactive=True)

    def toggle_file_input(input_type):
        """Show only the input widget matching the selected input type.

        Returns visibility updates in the order (file, URL, text), matching
        the outputs list of the `input_type.change` listener below.
        """
        return (
            gr.update(visible=(input_type == "File")),
            gr.update(visible=(input_type == "URL")),
            gr.update(visible=(input_type == "Custom Text")),
        )

    def update_lang(lang):
        """Rebuild the voice dropdown for the chosen language, selecting its first voice."""
        return gr.Dropdown(choices=voices[lang], label="Voice", value=voices[lang][0])

    input_type.change(toggle_file_input, input_type, [file_input, url_input, text_input])
    lang.change(update_lang, lang, [voice])

    with gr.Accordion("Markdown output", open=False):
        output_markdown = gr.Markdown("Parsed markdown will appear here", label="Parsed Text", show_copy_button=True)
    output_audio = gr.Audio(label="Generated Audio")
    submit_button = gr.Button("Convert")

    submit_button.click(
        process_input,
        inputs=[input_type, url_input, file_input, text_input, voice, speed, lang],
        outputs=[output_markdown, output_audio],
    )

demo.launch()
|
|