# local-tts / app.py
# Uploaded by gsarti — commit ecc3537 ("Add to LFS"), 4.46 kB
import os
import spaces
import tempfile
import soundfile as sf
import requests
from markdown import Markdown
from io import StringIO
import gradio as gr
from kokoro_onnx import Kokoro
from markitdown import MarkItDown
# Document converter used to turn the fetched page or uploaded file into Markdown.
md = MarkItDown()

# Kokoro TTS engine, created once at import time from the model and voice
# files referenced by these relative paths.
kokoro = Kokoro("kokoro-v0_19.onnx", "voices.json")

# Voice identifiers offered in the UI, keyed by language code.
voices = {
    "en-us": [
        "af",
        "af_bella",
        "af_nicole",
        "af_sarah",
        "af_sky",
        "am_adam",
        "am_michael",
    ],
    "en-gb": [
        "bf_emma",
        "bf_isabella",
        "bm_george",
        "bm_lewis",
    ],
}
def unmark_element(element, stream=None):
    """Collect the text content of an element tree, dropping all tags.

    Walks ``element`` depth-first and writes, in document order, each node's
    ``text``, then its children, then its ``tail``.

    Args:
        element: Tree node exposing ``text``, ``tail`` and iteration over its
            children (e.g. an ``xml.etree.ElementTree.Element``).
        stream: Optional writable buffer with ``write``/``getvalue``. A new
            ``StringIO`` is created when omitted.

    Returns:
        The full contents of the buffer after the walk (including anything
        that was already in a caller-supplied ``stream``).
    """
    sink = StringIO() if stream is None else stream

    def emit(node):
        # text comes before the children, tail after them
        if node.text:
            sink.write(node.text)
        for child in node:
            emit(child)
        if node.tail:
            sink.write(node.tail)

    emit(element)
    return sink.getvalue()
# Patch the Markdown class: register a "plain" output format whose serializer
# is unmark_element, so conversion yields tag-free text instead of HTML.
Markdown.output_formats["plain"] = unmark_element
# Module-wide converter instance used by markdown2text().
__md = Markdown(output_format="plain")
# NOTE(review): presumably needed because the plain-text output has no wrapper
# tag for the converter to strip — verify against the Python-Markdown docs.
__md.stripTopLevelTags = False
def markdown2text(text):
    """Convert Markdown source into plain text with the markup removed.

    Args:
        text: Markdown-formatted string.

    Returns:
        The text produced by the module-level converter, which is configured
        with the "plain" output format.
    """
    # The converter instance is shared by every request. reset() clears the
    # per-document state (link reference definitions, stashed raw HTML) so
    # one document cannot influence the rendering of the next.
    return __md.reset().convert(text)
@spaces.GPU
def text_to_speech(text, voice, speed, lang):
    """Synthesize ``text`` with Kokoro and write the audio to a WAV file.

    Args:
        text: Text to speak.
        voice: Voice identifier passed to ``kokoro.create``.
        speed: Speaking speed; converted with ``float()`` before use.
        lang: Language code passed to ``kokoro.create``.

    Returns:
        Path of the generated ``output.wav`` inside a fresh temporary
        directory.

    Raises:
        gr.Error: If synthesis or writing the file fails. The message keeps
            the ``"Error: ..."`` text that used to be returned as a string.
    """
    try:
        # Generate audio
        samples, sample_rate = kokoro.create(
            text,
            voice=voice,
            speed=float(speed),
            lang=lang,
        )
        # Each call gets its own directory so concurrent requests never
        # overwrite each other's output.wav.
        temp_dir = tempfile.mkdtemp()
        temp_path = os.path.join(temp_dir, "output.wav")
        sf.write(temp_path, samples, sample_rate)
    except Exception as e:
        # Returning the message as a string would make the caller treat it
        # as an audio file path; raise so the UI shows the real error.
        raise gr.Error(f"Error: {str(e)}") from e
    return temp_path
def create_temp_html_from_url(url: str) -> str:
    """Download ``url`` and save the response body to a temporary HTML file.

    Args:
        url: Address of the page to fetch.

    Returns:
        Path of the written ``output.html`` inside a fresh temporary
        directory.

    Raises:
        requests.HTTPError: If the request fails, the server answers with an
            error status, or the file cannot be written. The original
            exception is chained as the cause.
    """
    try:
        # Without a timeout requests waits indefinitely on an unresponsive host.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        html = response.text
        temp_dir = tempfile.mkdtemp()
        temp_path = os.path.join(temp_dir, "output.html")
        # Explicit UTF-8: the locale default encoding may be unable to
        # represent characters found in the page.
        with open(temp_path, "w", encoding="utf-8") as f:
            f.write(html)
    except Exception as e:
        raise requests.HTTPError(f"Error fetching URL: {str(e)}") from e
    return temp_path
def process_input(input_type, url_input, file_input, text_input, voice, speed, lang):
    """Turn the selected input into Markdown and synthesized speech.

    For "URL" the page is downloaded to a temporary file; for "File" the
    uploaded file is used. Either one is converted to Markdown with
    ``md.convert`` and then stripped to plain text. Any other ``input_type``
    uses ``text_input`` unchanged as both the Markdown and the spoken text.

    Args:
        input_type: "URL", "File", or anything else (treated as custom text).
        url_input: URL to fetch when ``input_type`` is "URL".
        file_input: Value of the file upload component; passed as-is to
            ``md.convert`` when ``input_type`` is "File".
        text_input: Raw text used for the custom-text case.
        voice: Voice identifier forwarded to ``text_to_speech``.
        speed: Speaking speed forwarded to ``text_to_speech``.
        lang: Language code forwarded to ``text_to_speech``.

    Returns:
        Tuple ``(markdown, audio_path)``.

    Raises:
        gr.Error: If the required input is missing, the URL cannot be
            fetched, or there is no text to speak.
    """
    if input_type in ("URL", "File"):
        if input_type == "URL":
            if not url_input or not url_input.strip():
                raise gr.Error("Please enter a URL.")
            try:
                filepath = create_temp_html_from_url(url_input.strip())
            except requests.HTTPError as e:
                # Surface the fetch failure in the UI instead of a generic error.
                raise gr.Error(str(e)) from e
        else:
            if file_input is None:
                raise gr.Error("Please upload a file.")
            filepath = file_input
        markdown = md.convert(filepath).text_content
        text = markdown2text(markdown)
    else:
        markdown = text_input
        text = text_input
    # Nothing to speak: stop here rather than failing inside the TTS engine.
    if not text or not text.strip():
        raise gr.Error("No text to convert. Select an input type and provide some content.")
    audio_path = text_to_speech(text, voice, speed, lang)
    return markdown, audio_path
# Gradio UI: input selection and voice settings, then the parsed Markdown
# and generated audio outputs.
# NOTE(review): the nesting below was reconstructed from a listing without
# indentation — confirm it matches the intended layout.
with gr.Blocks() as demo:
    gr.Markdown(
        "# Local TTS demo 🗣️ \nProvide a URL or upload a file to convert its content into speech using [Markitdown](https://github.com/microsoft/markitdown) and [Kokoro-ONNX](https://github.com/thewh1teagle/kokoro-onnx)."
    )
    with gr.Row():
        with gr.Column():
            # Start on "URL": the URL textbox is the only input visible on
            # load, so the selection must match it. Without a value the first
            # click would send input_type=None and the URL would be ignored.
            input_type = gr.Radio(["URL", "File", "Custom Text"], label="Input Type", value="URL")
            with gr.Row():
                speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed")
                # list(...) hands the component a plain list, not a dict view.
                lang = gr.Dropdown(choices=list(voices), label="Language", value="en-us")
                voice = gr.Dropdown(choices=voices[lang.value], label="Voice", value=voices[lang.value][0])
        with gr.Column():
            url_input = gr.Textbox(label="Enter URL")
            file_input = gr.File(label="Upload File", visible=False)
            text_input = gr.Textbox(label="Text", visible=False, lines=5, placeholder="Enter text here", show_label=False, interactive=True)

    def toggle_file_input(input_type):
        """Show only the input widget that matches the selected input type.

        Returns updates for (file_input, url_input, text_input), in that order.
        """
        return gr.update(visible=(input_type == "File")), gr.update(
            visible=(input_type == "URL"),
        ), gr.update(visible=(input_type == "Custom Text"))

    def update_lang(lang):
        """Rebuild the voice dropdown for ``lang``, selecting its first voice."""
        return gr.Dropdown(choices=voices[lang], label="Voice", value=voices[lang][0])

    input_type.change(toggle_file_input, input_type, [file_input, url_input, text_input])
    lang.change(update_lang, lang, [voice])

    with gr.Accordion("Markdown output", open=False):
        output_markdown = gr.Markdown("Parsed markdown will appear here", label="Parsed Text", show_copy_button=True)
    output_audio = gr.Audio(label="Generated Audio")
    submit_button = gr.Button("Convert")
    submit_button.click(
        process_input,
        inputs=[input_type, url_input, file_input, text_input, voice, speed, lang],
        outputs=[output_markdown, output_audio],
    )

demo.launch()