gsarti committed on
Commit
2efd326
·
1 Parent(s): 885609d

Added streaming

Browse files
Files changed (2) hide show
  1. app.py +63 -35
  2. requirements.txt +2 -1
app.py CHANGED
@@ -1,7 +1,10 @@
1
  import os
 
2
  import spaces
3
  import tempfile
4
- import soundfile as sf
 
 
5
  import requests
6
  from markdown import Markdown
7
  from io import StringIO
@@ -17,6 +20,31 @@ voices = {
17
  "en-gb": ['bf_emma', 'bf_isabella', 'bm_george', 'bm_lewis']
18
  }
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  def unmark_element(element, stream=None):
21
  if stream is None:
22
  stream = StringIO()
@@ -39,28 +67,6 @@ def markdown2text(text):
39
  return __md.convert(text)
40
 
41
 
42
- @spaces.GPU
43
- def text_to_speech(text, voice, speed, lang):
44
- try:
45
- # Generate audio
46
- samples, sample_rate = kokoro.create(
47
- text,
48
- voice=voice,
49
- speed=float(speed),
50
- lang=lang
51
- )
52
-
53
- # Create temporary file
54
- temp_dir = tempfile.mkdtemp()
55
- temp_path = os.path.join(temp_dir, "output.wav")
56
-
57
- # Save to temporary file
58
- sf.write(temp_path, samples, sample_rate)
59
- return temp_path
60
- except Exception as e:
61
- return f"Error: {str(e)}"
62
-
63
-
64
  def create_temp_html_from_url(url: str) -> str:
65
  try:
66
  response = requests.get(url)
@@ -68,7 +74,6 @@ def create_temp_html_from_url(url: str) -> str:
68
  html = response.text
69
  temp_dir = tempfile.mkdtemp()
70
  temp_path = os.path.join(temp_dir, "output.html")
71
-
72
  with open(temp_path, "w") as f:
73
  f.write(html)
74
  except Exception as e:
@@ -76,7 +81,7 @@ def create_temp_html_from_url(url: str) -> str:
76
  return temp_path
77
 
78
 
79
- def process_input(input_type, url_input, file_input, text_input, voice, speed, lang):
80
  if input_type in ["URL", "File"]:
81
  if input_type == "URL":
82
  filepath = create_temp_html_from_url(url_input)
@@ -84,17 +89,31 @@ def process_input(input_type, url_input, file_input, text_input, voice, speed, l
84
  filepath = file_input
85
  print(filepath)
86
  markdown = md.convert(filepath).text_content
87
- text = markdown2text(markdown)
88
  else:
89
  markdown = text_input
90
- text = text_input
91
- audio_path = text_to_speech(text, voice, speed, lang)
92
- return markdown, audio_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
 
95
  with gr.Blocks() as demo:
96
  gr.Markdown(
97
- "# Local TTS demo 🗣️ \nProvide a URL or upload a file to convert its content into speech using [Markitdown](https://github.com/microsoft/markitdown) and [Kokoro-ONNX](https://github.com/thewh1teagle/kokoro-onnx)."
 
98
  )
99
 
100
  with gr.Row():
@@ -105,7 +124,7 @@ with gr.Blocks() as demo:
105
  lang = gr.Dropdown(choices=voices.keys(), label="Language", value="en-us")
106
  voice = gr.Dropdown(choices=voices[lang.value], label="Voice", value=voices[lang.value][0])
107
  with gr.Column():
108
- url_input = gr.Textbox(label="Enter URL")
109
  file_input = gr.File(label="Upload File", visible=False)
110
  text_input = gr.Textbox(label="Text", visible=False, lines=5, placeholder="Enter text here", show_label=False, interactive=True)
111
 
@@ -121,14 +140,23 @@ with gr.Blocks() as demo:
121
  lang.change(update_lang, lang, [voice])
122
 
123
  with gr.Accordion("Markdown output", open=False):
 
124
  output_markdown = gr.Markdown("Parsed markdown will appear here", label="Parsed Text", show_copy_button=True)
125
- output_audio = gr.Audio(label="Generated Audio")
126
  submit_button = gr.Button("Convert")
127
 
128
  submit_button.click(
129
- process_input,
130
- inputs=[input_type, url_input, file_input, text_input, voice, speed, lang],
131
- outputs=[output_markdown, output_audio],
 
 
 
 
 
 
 
 
132
  )
133
 
134
  demo.launch()
 
1
  import os
2
+ import io
3
  import spaces
4
  import tempfile
5
+ #import soundfile as sf
6
+ import numpy as np
7
+ from pydub import AudioSegment
8
  import requests
9
  from markdown import Markdown
10
  from io import StringIO
 
20
  "en-gb": ['bf_emma', 'bf_isabella', 'bm_george', 'bm_lewis']
21
  }
22
 
23
def numpy_to_mp3(audio_array, sampling_rate):
    """Encode a single-channel NumPy audio buffer as MP3 and return the bytes.

    Floating-point input is peak-normalized to the 16-bit PCM range and cast
    to ``int16``. Input of any other dtype is handed to pydub unchanged, with
    its item size used as the sample width.

    Args:
        audio_array: NumPy array of audio samples, treated as one channel.
        sampling_rate: Sample rate passed to pydub as the frame rate.

    Returns:
        The MP3-encoded audio as ``bytes``.
    """
    if np.issubdtype(audio_array.dtype, np.floating):
        # Silent (all-zero) or empty chunks have no peak to scale by; dividing
        # by zero would fill the buffer with NaN and produce garbage samples
        # after the int16 cast, and np.max raises on an empty array.
        peak = np.max(np.abs(audio_array)) if audio_array.size else 0.0
        if peak > 0:
            audio_array = (audio_array / peak) * 32767  # scale to 16-bit range
        audio_array = audio_array.astype(np.int16)

    # Wrap the raw PCM bytes in a pydub segment.
    audio_segment = AudioSegment(
        audio_array.tobytes(),
        frame_rate=sampling_rate,
        sample_width=audio_array.dtype.itemsize,
        channels=1,
    )

    # Export to an in-memory buffer; a high bitrate is used to maximise quality.
    with io.BytesIO() as mp3_io:
        audio_segment.export(mp3_io, format="mp3", bitrate="320k")
        return mp3_io.getvalue()
47
+
48
  def unmark_element(element, stream=None):
49
  if stream is None:
50
  stream = StringIO()
 
67
  return __md.convert(text)
68
 
69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  def create_temp_html_from_url(url: str) -> str:
71
  try:
72
  response = requests.get(url)
 
74
  html = response.text
75
  temp_dir = tempfile.mkdtemp()
76
  temp_path = os.path.join(temp_dir, "output.html")
 
77
  with open(temp_path, "w") as f:
78
  f.write(html)
79
  except Exception as e:
 
81
  return temp_path
82
 
83
 
84
+ def parse(input_type, url_input, file_input, text_input):
85
  if input_type in ["URL", "File"]:
86
  if input_type == "URL":
87
  filepath = create_temp_html_from_url(url_input)
 
89
  filepath = file_input
90
  print(filepath)
91
  markdown = md.convert(filepath).text_content
 
92
  else:
93
  markdown = text_input
94
+ return markdown
95
+
96
+
97
def clean(output_markdown):
    """Return the result of passing the parsed markdown through ``markdown2text``."""
    plain_text = markdown2text(output_markdown)
    return plain_text
99
+
100
+
101
@spaces.GPU
async def text_to_speech(output_text, voice, speed, lang):
    """Stream synthesized speech for ``output_text`` as MP3-encoded chunks.

    Opens a Kokoro stream for the text and, for every ``(samples,
    sample_rate)`` pair it produces, yields the chunk encoded by
    ``numpy_to_mp3``.

    Args:
        output_text: Text to synthesize.
        voice: Voice identifier forwarded to ``kokoro.create_stream``.
        speed: Speaking speed; converted to ``float`` before being forwarded.
        lang: Language code forwarded to ``kokoro.create_stream``.

    Yields:
        The MP3 bytes returned by ``numpy_to_mp3`` for each streamed chunk.
    """
    speech_chunks = kokoro.create_stream(
        output_text,
        voice=voice,
        speed=float(speed),
        lang=lang,
    )
    async for pcm_samples, pcm_rate in speech_chunks:
        mp3_chunk = numpy_to_mp3(pcm_samples, sampling_rate=pcm_rate)
        yield mp3_chunk
111
 
112
 
113
  with gr.Blocks() as demo:
114
  gr.Markdown(
115
+ "# Stream Local TTS with Kokoro-82M 🗣️\n"
116
+ "Provide a URL or upload a file to convert its content into speech using [Markitdown](https://github.com/microsoft/markitdown) and [Kokoro-ONNX](https://github.com/thewh1teagle/kokoro-onnx)."
117
  )
118
 
119
  with gr.Row():
 
124
  lang = gr.Dropdown(choices=voices.keys(), label="Language", value="en-us")
125
  voice = gr.Dropdown(choices=voices[lang.value], label="Voice", value=voices[lang.value][0])
126
  with gr.Column():
127
+ url_input = gr.Textbox(label="Enter URL", lines=1)
128
  file_input = gr.File(label="Upload File", visible=False)
129
  text_input = gr.Textbox(label="Text", visible=False, lines=5, placeholder="Enter text here", show_label=False, interactive=True)
130
 
 
140
  lang.change(update_lang, lang, [voice])
141
 
142
  with gr.Accordion("Markdown output", open=False):
143
+ output_text = gr.Textbox(visible=False)
144
  output_markdown = gr.Markdown("Parsed markdown will appear here", label="Parsed Text", show_copy_button=True)
145
+ output_audio = gr.Audio(label="Generated Audio", streaming=True, autoplay=True, loop=False)
146
  submit_button = gr.Button("Convert")
147
 
148
  submit_button.click(
149
+ parse,
150
+ inputs=[input_type, url_input, file_input, text_input],
151
+ outputs=[output_markdown],
152
+ ).success(
153
+ clean,
154
+ inputs=[output_markdown],
155
+ outputs=[output_text],
156
+ ).success(
157
+ text_to_speech,
158
+ inputs=[output_text, voice, speed, lang],
159
+ outputs=[output_audio],
160
  )
161
 
162
  demo.launch()
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  spaces
2
  gradio>=5.3.0
3
  kokoro-onnx>=0.2.7
4
- markitdown>=0.0.1a3
 
 
1
  spaces
2
  gradio>=5.3.0
3
  kokoro-onnx>=0.2.7
4
+ markitdown>=0.0.1a3
5
+ soundfile>=0.13.0