github-actions[bot] committed on
Commit
c322dc7
·
1 Parent(s): 5af6319

Sync with https://github.com/mozilla-ai/document-to-podcast

Browse files
Files changed (2) hide show
  1. Dockerfile +3 -0
  2. app.py +15 -6
Dockerfile CHANGED
@@ -8,6 +8,8 @@ RUN apt-get update && apt-get install --no-install-recommends -y \
8
  git \
9
  && apt-get clean && rm -rf /var/lib/apt/lists/*
10
 
 
 
11
  RUN useradd -m -u 1000 user
12
 
13
  USER user
@@ -18,6 +20,7 @@ ENV HOME=/home/user \
18
  WORKDIR $HOME/app
19
 
20
  RUN pip3 install https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.4-cu122/llama_cpp_python-0.3.4-cp310-cp310-linux_x86_64.whl
 
21
  RUN pip3 install document-to-podcast
22
 
23
  COPY --chown=user . $HOME/app
 
8
  git \
9
  && apt-get clean && rm -rf /var/lib/apt/lists/*
10
 
11
+ RUN apt-get install espeak-ng -y
12
+
13
  RUN useradd -m -u 1000 user
14
 
15
  USER user
 
20
  WORKDIR $HOME/app
21
 
22
  RUN pip3 install https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.4-cu122/llama_cpp_python-0.3.4-cp310-cp310-linux_x86_64.whl
23
+ RUN pip3 install phonemizer
24
  RUN pip3 install document-to-podcast
25
 
26
  COPY --chown=user . $HOME/app
app.py CHANGED
@@ -1,8 +1,9 @@
1
  """Streamlit app for converting documents to podcasts."""
2
 
 
 
3
  import re
4
  from pathlib import Path
5
- import io
6
 
7
  import numpy as np
8
  import soundfile as sf
@@ -22,13 +23,16 @@ from document_to_podcast.utils import stack_audio_segments
22
  @st.cache_resource
23
  def load_text_to_text_model():
24
  return load_llama_cpp_model(
25
- model_id="allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf"
26
  )
27
 
28
 
29
  @st.cache_resource
30
  def load_text_to_speech_model():
31
- return load_tts_model("OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf")
 
 
 
32
 
33
 
34
  def numpy_to_wav(audio_array: np.ndarray, sample_rate: int) -> io.BytesIO:
@@ -115,10 +119,15 @@ if "clean_text" in st.session_state:
115
  text_model = load_text_to_text_model()
116
  speech_model = load_text_to_speech_model()
117
 
 
 
 
 
 
118
  st.markdown(
119
  "For this demo, we are using the following models: \n"
120
- "- [OLMoE-1B-7B-0924-Instruct](https://huggingface.co/allenai/OLMoE-1B-7B-0924-Instruct-GGUF)\n"
121
- "- [OuteAI/OuteTTS-0.2-500M](https://huggingface.co/OuteAI/OuteTTS-0.2-500M-GGUF)"
122
  )
123
  st.markdown(
124
  "You can check the [Customization Guide](https://mozilla-ai.github.io/document-to-podcast/customization/)"
@@ -187,7 +196,7 @@ if "clean_text" in st.session_state:
187
 
188
  if st.session_state[gen_button]:
189
  audio_np = stack_audio_segments(
190
- st.session_state.audio, speech_model.sample_rate
191
  )
192
  audio_wav = numpy_to_wav(audio_np, speech_model.sample_rate)
193
  if st.download_button(
 
1
  """Streamlit app for converting documents to podcasts."""
2
 
3
+ import io
4
+ import os
5
  import re
6
  from pathlib import Path
 
7
 
8
  import numpy as np
9
  import soundfile as sf
 
23
  @st.cache_resource
24
  def load_text_to_text_model():
25
  return load_llama_cpp_model(
26
+ model_id="bartowski/Qwen2.5-3B-Instruct-GGUF/Qwen2.5-3B-Instruct-f16.gguf"
27
  )
28
 
29
 
30
  @st.cache_resource
31
  def load_text_to_speech_model():
32
+ if os.environ.get("HF_SPACE") == "TRUE":
33
+ return load_tts_model("hexgrad/Kokoro-82M/kokoro-v0_19.pth")
34
+ else:
35
+ return load_tts_model("OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf")
36
 
37
 
38
  def numpy_to_wav(audio_array: np.ndarray, sample_rate: int) -> io.BytesIO:
 
119
  text_model = load_text_to_text_model()
120
  speech_model = load_text_to_speech_model()
121
 
122
+ if os.environ.get("HF_SPACE") == "TRUE":
123
+ tts_link = "- [hexgrad/Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M)"
124
+ else:
125
+ tts_link = "- [OuteAI/OuteTTS-0.2-500M](https://huggingface.co/OuteAI/OuteTTS-0.2-500M-GGUF)"
126
+
127
  st.markdown(
128
  "For this demo, we are using the following models: \n"
129
+ "- [Qwen2.5-3B-Instruct](https://huggingface.co/bartowski/Qwen2.5-3B-Instruct-GGUF)\n"
130
+ f"{tts_link}\n"
131
  )
132
  st.markdown(
133
  "You can check the [Customization Guide](https://mozilla-ai.github.io/document-to-podcast/customization/)"
 
196
 
197
  if st.session_state[gen_button]:
198
  audio_np = stack_audio_segments(
199
+ st.session_state.audio, speech_model.sample_rate, silence_pad=0.0
200
  )
201
  audio_wav = numpy_to_wav(audio_np, speech_model.sample_rate)
202
  if st.download_button(