github-actions[bot] committed on
Commit
c322dc7
·
1 Parent(s): 5af6319

Sync with https://github.com/mozilla-ai/document-to-podcast

Browse files
Files changed (2) hide show
  1. Dockerfile +3 -0
  2. app.py +15 -6
Dockerfile CHANGED
@@ -8,6 +8,8 @@ RUN apt-get update && apt-get install --no-install-recommends -y \
8
  git \
9
  && apt-get clean && rm -rf /var/lib/apt/lists/*
10
 
 
 
11
  RUN useradd -m -u 1000 user
12
 
13
  USER user
@@ -18,6 +20,7 @@ ENV HOME=/home/user \
18
  WORKDIR $HOME/app
19
 
20
  RUN pip3 install https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.4-cu122/llama_cpp_python-0.3.4-cp310-cp310-linux_x86_64.whl
 
21
  RUN pip3 install document-to-podcast
22
 
23
  COPY --chown=user . $HOME/app
 
8
  git \
9
  && apt-get clean && rm -rf /var/lib/apt/lists/*
10
 
11
+ RUN apt-get install espeak-ng -y
12
+
13
  RUN useradd -m -u 1000 user
14
 
15
  USER user
 
20
  WORKDIR $HOME/app
21
 
22
  RUN pip3 install https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.4-cu122/llama_cpp_python-0.3.4-cp310-cp310-linux_x86_64.whl
23
+ RUN pip3 install phonemizer
24
  RUN pip3 install document-to-podcast
25
 
26
  COPY --chown=user . $HOME/app
app.py CHANGED
@@ -1,8 +1,9 @@
1
  """Streamlit app for converting documents to podcasts."""
2
 
 
 
3
  import re
4
  from pathlib import Path
5
- import io
6
 
7
  import numpy as np
8
  import soundfile as sf
@@ -22,13 +23,16 @@ from document_to_podcast.utils import stack_audio_segments
22
  @st.cache_resource
23
  def load_text_to_text_model():
24
  return load_llama_cpp_model(
25
- model_id="allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf"
26
  )
27
 
28
 
29
  @st.cache_resource
30
  def load_text_to_speech_model():
31
- return load_tts_model("OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf")
 
 
 
32
 
33
 
34
  def numpy_to_wav(audio_array: np.ndarray, sample_rate: int) -> io.BytesIO:
@@ -115,10 +119,15 @@ if "clean_text" in st.session_state:
115
  text_model = load_text_to_text_model()
116
  speech_model = load_text_to_speech_model()
117
 
 
 
 
 
 
118
  st.markdown(
119
  "For this demo, we are using the following models: \n"
120
- "- [OLMoE-1B-7B-0924-Instruct](https://huggingface.co/allenai/OLMoE-1B-7B-0924-Instruct-GGUF)\n"
121
- "- [OuteAI/OuteTTS-0.2-500M](https://huggingface.co/OuteAI/OuteTTS-0.2-500M-GGUF)"
122
  )
123
  st.markdown(
124
  "You can check the [Customization Guide](https://mozilla-ai.github.io/document-to-podcast/customization/)"
@@ -187,7 +196,7 @@ if "clean_text" in st.session_state:
187
 
188
  if st.session_state[gen_button]:
189
  audio_np = stack_audio_segments(
190
- st.session_state.audio, speech_model.sample_rate
191
  )
192
  audio_wav = numpy_to_wav(audio_np, speech_model.sample_rate)
193
  if st.download_button(
 
1
  """Streamlit app for converting documents to podcasts."""
2
 
3
+ import io
4
+ import os
5
  import re
6
  from pathlib import Path
 
7
 
8
  import numpy as np
9
  import soundfile as sf
 
23
  @st.cache_resource
24
  def load_text_to_text_model():
25
  return load_llama_cpp_model(
26
+ model_id="bartowski/Qwen2.5-3B-Instruct-GGUF/Qwen2.5-3B-Instruct-f16.gguf"
27
  )
28
 
29
 
30
  @st.cache_resource
31
  def load_text_to_speech_model():
32
+ if os.environ.get("HF_SPACE") == "TRUE":
33
+ return load_tts_model("hexgrad/Kokoro-82M/kokoro-v0_19.pth")
34
+ else:
35
+ return load_tts_model("OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf")
36
 
37
 
38
  def numpy_to_wav(audio_array: np.ndarray, sample_rate: int) -> io.BytesIO:
 
119
  text_model = load_text_to_text_model()
120
  speech_model = load_text_to_speech_model()
121
 
122
+ if os.environ.get("HF_SPACE") == "TRUE":
123
+ tts_link = "- [hexgrad/Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M)"
124
+ else:
125
+ tts_link = "- [OuteAI/OuteTTS-0.2-500M](https://huggingface.co/OuteAI/OuteTTS-0.2-500M-GGUF)"
126
+
127
  st.markdown(
128
  "For this demo, we are using the following models: \n"
129
+ "- [Qwen2.5-3B-Instruct](https://huggingface.co/bartowski/Qwen2.5-3B-Instruct-GGUF)\n"
130
+ f"{tts_link}\n"
131
  )
132
  st.markdown(
133
  "You can check the [Customization Guide](https://mozilla-ai.github.io/document-to-podcast/customization/)"
 
196
 
197
  if st.session_state[gen_button]:
198
  audio_np = stack_audio_segments(
199
+ st.session_state.audio, speech_model.sample_rate, silence_pad=0.0
200
  )
201
  audio_wav = numpy_to_wav(audio_np, speech_model.sample_rate)
202
  if st.download_button(