# File size: 4,464 Bytes (revision ecc3537)
import os
import spaces
import tempfile
import soundfile as sf
import requests
from markdown import Markdown
from io import StringIO
import gradio as gr
from kokoro_onnx import Kokoro
from markitdown import MarkItDown
# Shared MarkItDown converter used to turn URLs/files into markdown.
md = MarkItDown()

# Shared Kokoro text-to-speech engine, built from the ONNX model file and
# the voices file named here.
kokoro = Kokoro("kokoro-v0_19.onnx", "voices.json")

# Voice identifiers offered in the UI, grouped by language code.
voices = {
    "en-us": [
        "af",
        "af_bella",
        "af_nicole",
        "af_sarah",
        "af_sky",
        "am_adam",
        "am_michael",
    ],
    "en-gb": [
        "bf_emma",
        "bf_isabella",
        "bm_george",
        "bm_lewis",
    ],
}
def unmark_element(element, stream=None):
    """Collect the text of *element* and all of its descendants.

    Writes, in document order, the element's ``text``, the output of every
    child (recursively), and finally the element's ``tail``.

    Args:
        element: An ElementTree-style node exposing ``text``, ``tail`` and
            iteration over its children.
        stream: Optional writable buffer with ``write``/``getvalue``; a new
            ``StringIO`` is created when omitted.

    Returns:
        The full contents of the buffer after this element was written.
    """
    out = StringIO() if stream is None else stream

    leading = element.text
    if leading:
        out.write(leading)

    for child in element:
        # Children share the same buffer, so their text lands in order.
        unmark_element(child, out)

    trailing = element.tail
    if trailing:
        out.write(trailing)

    return out.getvalue()
# Patch Markdown: register a "plain" output format whose serializer is
# unmark_element. This must happen before the converter below is built,
# because it is constructed with output_format="plain".
Markdown.output_formats.update({"plain": unmark_element})

# Module-wide converter instance used by markdown2text.
__md = Markdown(output_format="plain")
# Turn off top-level tag stripping on this converter
# (presumably because the plain output has no wrapper tags to strip).
__md.stripTopLevelTags = False
def markdown2text(text):
    """Convert Markdown *text* with the module's "plain" converter.

    Args:
        text: Markdown source string.

    Returns:
        Whatever the shared ``__md`` converter produces for *text*.
    """
    plain = __md.convert(text)
    return plain
@spaces.GPU
def text_to_speech(text, voice, speed, lang):
    """Synthesize *text* with Kokoro and save the audio as a WAV file.

    Args:
        text: Text to speak.
        voice: Kokoro voice identifier.
        speed: Speaking-rate multiplier; converted with ``float()``.
        lang: Language code passed through to Kokoro.

    Returns:
        Path of the generated ``output.wav`` inside a new temporary folder.

    Raises:
        gr.Error: If *text* is empty or synthesis/saving fails. The previous
            version returned the message as a string, which the caller then
            handed to the audio output as if it were a file path.
    """
    if not text or not text.strip():
        raise gr.Error("Error: no text to convert to speech")

    try:
        # Generate audio
        samples, sample_rate = kokoro.create(
            text,
            voice=voice,
            speed=float(speed),
            lang=lang,
        )

        # Save into a fresh temporary folder so concurrent requests never
        # overwrite each other's output.wav.
        temp_dir = tempfile.mkdtemp()
        temp_path = os.path.join(temp_dir, "output.wav")
        sf.write(temp_path, samples, sample_rate)
    except Exception as e:
        # Surface the failure in the UI instead of returning a fake path.
        raise gr.Error(f"Error: {str(e)}") from e

    return temp_path
# Upper bound (seconds) for the download so a stalled server cannot hang a request.
_URL_FETCH_TIMEOUT = 30


def create_temp_html_from_url(url: str) -> str:
    """Download *url* and store the response body in a temporary HTML file.

    Args:
        url: Address of the page to fetch.

    Returns:
        Path of ``output.html`` inside a newly created temporary folder.

    Raises:
        requests.HTTPError: If the download or the file write fails for any
            reason; the original exception is chained as the cause.
    """
    try:
        response = requests.get(url, timeout=_URL_FETCH_TIMEOUT)
        response.raise_for_status()
        html = response.text

        temp_dir = tempfile.mkdtemp()
        temp_path = os.path.join(temp_dir, "output.html")
        # Explicit UTF-8: the platform default encoding may be unable to
        # represent characters found in the page.
        with open(temp_path, "w", encoding="utf-8") as f:
            f.write(html)
    except Exception as e:
        raise requests.HTTPError(f"Error fetching URL: {str(e)}") from e
    return temp_path
def _discard_temp_download(path):
    """Best-effort removal of a downloaded temp file and its then-empty folder."""
    for remove, target in ((os.remove, path), (os.rmdir, os.path.dirname(path))):
        try:
            remove(target)
        except OSError:
            # Cleanup must never mask the real result or error of the request.
            # os.rmdir only removes empty folders, so nothing else is at risk.
            pass


def process_input(input_type, url_input, file_input, text_input, voice, speed, lang):
    """Turn the selected input into markdown and spoken audio.

    Args:
        input_type: ``"URL"``, ``"File"``; any other value uses *text_input*.
        url_input: Page address, used when *input_type* is ``"URL"``.
        file_input: Path of the uploaded file, used for ``"File"``.
        text_input: Raw text, used for every other input type.
        voice: Voice identifier forwarded to ``text_to_speech``.
        speed: Speaking-rate value forwarded to ``text_to_speech``.
        lang: Language code forwarded to ``text_to_speech``.

    Returns:
        Tuple ``(markdown, audio_path)``: the markdown (or raw text) shown to
        the user and the value returned by ``text_to_speech``.

    Raises:
        gr.Error: If the required input is missing, the URL cannot be
            fetched, or there is no text left to speak.
    """
    if input_type == "URL":
        if not url_input or not url_input.strip():
            raise gr.Error("Please enter a URL.")
        try:
            filepath = create_temp_html_from_url(url_input.strip())
        except requests.HTTPError as e:
            # Show the fetch failure to the user instead of a generic error.
            raise gr.Error(str(e)) from e
        try:
            markdown = md.convert(filepath).text_content
        finally:
            # The download is only needed for the conversion above.
            _discard_temp_download(filepath)
        text = markdown2text(markdown)
    elif input_type == "File":
        if not file_input:
            raise gr.Error("Please upload a file.")
        markdown = md.convert(file_input).text_content
        text = markdown2text(markdown)
    else:
        markdown = text_input
        text = text_input

    if not text or not text.strip():
        raise gr.Error("There is no text to convert to speech.")

    audio_path = text_to_speech(text, voice, speed, lang)
    return markdown, audio_path
# NOTE(review): the nesting of the layout containers below is reconstructed;
# confirm it against the intended page layout.
with gr.Blocks() as demo:
    gr.Markdown(
        "# Local TTS demo 🗣️ \nProvide a URL or upload a file to convert its content into speech using [Markitdown](https://github.com/microsoft/markitdown) and [Kokoro-ONNX](https://github.com/thewh1teagle/kokoro-onnx)."
    )

    with gr.Row():
        with gr.Column():
            # Start on "URL" so the selection matches the URL box, which is
            # the only input visible at start-up. Without a value the first
            # click on Convert sends None and the typed URL is ignored.
            input_type = gr.Radio(
                ["URL", "File", "Custom Text"],
                label="Input Type",
                value="URL",
            )
            with gr.Row():
                speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed")
                # Pass a real list rather than a dict view as the choices.
                lang = gr.Dropdown(choices=list(voices), label="Language", value="en-us")
                voice = gr.Dropdown(
                    choices=voices[lang.value],
                    label="Voice",
                    value=voices[lang.value][0],
                )
        with gr.Column():
            url_input = gr.Textbox(label="Enter URL")
            file_input = gr.File(label="Upload File", visible=False)
            text_input = gr.Textbox(
                label="Text",
                visible=False,
                lines=5,
                placeholder="Enter text here",
                show_label=False,
                interactive=True,
            )

    def toggle_file_input(input_type):
        """Show only the input widget that matches the chosen input type.

        Returns updates in the order (file_input, url_input, text_input).
        """
        return (
            gr.update(visible=(input_type == "File")),
            gr.update(visible=(input_type == "URL")),
            gr.update(visible=(input_type == "Custom Text")),
        )

    def update_lang(lang):
        """Rebuild the voice dropdown for *lang*, selecting its first voice."""
        return gr.Dropdown(choices=voices[lang], label="Voice", value=voices[lang][0])

    input_type.change(toggle_file_input, input_type, [file_input, url_input, text_input])
    lang.change(update_lang, lang, [voice])

    with gr.Accordion("Markdown output", open=False):
        output_markdown = gr.Markdown(
            "Parsed markdown will appear here",
            label="Parsed Text",
            show_copy_button=True,
        )
    output_audio = gr.Audio(label="Generated Audio")

    submit_button = gr.Button("Convert")
    submit_button.click(
        process_input,
        inputs=[input_type, url_input, file_input, text_input, voice, speed, lang],
        outputs=[output_markdown, output_audio],
    )

demo.launch()
|