import os
import io
import spaces
import tempfile
#import soundfile as sf
import numpy as np
from pydub import AudioSegment
import requests
from markdown import Markdown
from io import StringIO
import gradio as gr
from kokoro_onnx import Kokoro
from markitdown import MarkItDown
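# MarkItDown converts uploaded files and saved HTML pages to Markdown;
# Kokoro loads the ONNX TTS model and its voice embeddings from local files.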
md = MarkItDown()
kokoro = Kokoro("kokoro-v0_19.onnx", "voices.json")
voices = {
"en-us": ['af', 'af_bella', 'af_nicole', 'af_sarah', 'af_sky', 'am_adam', 'am_michael'],
"en-gb": ['bf_emma', 'bf_isabella', 'bm_george', 'bm_lewis']
}
def numpy_to_mp3(audio_array, sampling_rate):
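    """Convert a mono numpy waveform to MP3 bytes via pydub/ffmpeg."""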
    # Normalize floating-point audio into the 16-bit PCM range expected by pydub
    if np.issubdtype(audio_array.dtype, np.floating):
        max_val = np.max(np.abs(audio_array))
        if max_val > 0:  # avoid division by zero on silent chunks
            audio_array = (audio_array / max_val) * 32767
        audio_array = audio_array.astype(np.int16)
    # Create an audio segment from the numpy array
    audio_segment = AudioSegment(
        audio_array.tobytes(),
        frame_rate=sampling_rate,
        sample_width=audio_array.dtype.itemsize,
        channels=1
    )
    # Export the audio segment to MP3 bytes - use a high bitrate to maximise quality
    mp3_io = io.BytesIO()
    audio_segment.export(mp3_io, format="mp3", bitrate="320k")
    # Get the MP3 bytes
    mp3_bytes = mp3_io.getvalue()
    mp3_io.close()
    return mp3_bytes
def unmark_element(element, stream=None):
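    """Recursively collect the plain text of an ElementTree element, dropping all Markdown formatting."""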
    if stream is None:
        stream = StringIO()
    if element.text:
        stream.write(element.text)
    for sub in element:
        unmark_element(sub, stream)
    if element.tail:
        stream.write(element.tail)
    return stream.getvalue()
# Patch python-markdown with a "plain" output format that strips all formatting
Markdown.output_formats["plain"] = unmark_element
__md = Markdown(output_format="plain")
__md.stripTopLevelTags = False
def markdown2text(text):
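    """Render Markdown to plain text using the patched "plain" output format."""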
    return __md.convert(text)
def create_temp_html_from_url(url: str) -> str:
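    """Download a URL and write its HTML to a temporary file, returning the file path."""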
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        html = response.text
        temp_dir = tempfile.mkdtemp()
        temp_path = os.path.join(temp_dir, "output.html")
        with open(temp_path, "w", encoding="utf-8") as f:
            f.write(html)
    except Exception as e:
        raise requests.HTTPError(f"Error fetching URL: {str(e)}") from e
    return temp_path
def parse(input_type, url_input, file_input, text_input):
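    """Convert the selected input (URL, uploaded file, or raw text) to Markdown with MarkItDown."""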
    if input_type in ["URL", "File"]:
        if input_type == "URL":
            filepath = create_temp_html_from_url(url_input)
        else:
            filepath = file_input
        print(filepath)
        markdown = md.convert(filepath).text_content
    else:
        markdown = text_input
    return markdown
def clean(output_markdown):
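    """Strip Markdown formatting so only plain text is sent to the TTS model."""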
    return markdown2text(output_markdown)
@spaces.GPU
async def text_to_speech(output_text, voice, speed, lang):
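    """Stream Kokoro TTS for the given text, yielding MP3 chunks as they are generated."""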
    stream = kokoro.create_stream(
        output_text,
        voice=voice,
        speed=float(speed),
        lang=lang
    )
    async for samples, sample_rate in stream:
        yield numpy_to_mp3(samples, sampling_rate=sample_rate)
with gr.Blocks() as demo:
    gr.Markdown(
        "# Stream Local TTS with Kokoro-82M 🗣️\n"
        "Provide a URL or upload a file to convert its content into speech using [Markitdown](https://github.com/microsoft/markitdown) and [Kokoro-ONNX](https://github.com/thewh1teagle/kokoro-onnx)."
    )
    with gr.Row():
        with gr.Column():
            input_type = gr.Radio(["URL", "File", "Custom Text"], label="Input Type")
            with gr.Row():
                speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed")
                lang = gr.Dropdown(choices=list(voices.keys()), label="Language", value="en-us")
                voice = gr.Dropdown(choices=voices[lang.value], label="Voice", value=voices[lang.value][0])
        with gr.Column():
            url_input = gr.Textbox(label="Enter URL", lines=1)
            file_input = gr.File(label="Upload File", visible=False)
            text_input = gr.Textbox(label="Text", visible=False, lines=5, placeholder="Enter text here", show_label=False, interactive=True)

    def toggle_file_input(input_type):
        # Show only the input widget that matches the selected input type
        return (
            gr.update(visible=(input_type == "File")),
            gr.update(visible=(input_type == "URL")),
            gr.update(visible=(input_type == "Custom Text")),
        )

    def update_lang(lang):
        # Repopulate the voice dropdown with the voices available for the selected language
        return gr.Dropdown(choices=voices[lang], label="Voice", value=voices[lang][0])

    input_type.change(toggle_file_input, input_type, [file_input, url_input, text_input])
    lang.change(update_lang, lang, [voice])

    with gr.Accordion("Markdown output", open=False):
        output_text = gr.Textbox(visible=False)
        output_markdown = gr.Markdown("Parsed markdown will appear here", label="Parsed Text", show_copy_button=True)

    output_audio = gr.Audio(label="Generated Audio", streaming=True, autoplay=True, loop=False)
    submit_button = gr.Button("Convert")

    # Pipeline: parse input to Markdown, strip formatting to plain text, then stream TTS audio
    submit_button.click(
        parse,
        inputs=[input_type, url_input, file_input, text_input],
        outputs=[output_markdown],
    ).success(
        clean,
        inputs=[output_markdown],
        outputs=[output_text],
    ).success(
        text_to_speech,
        inputs=[output_text, voice, speed, lang],
        outputs=[output_audio],
    )
demo.launch()