"""Streamlit app for converting documents to podcasts."""
import io
import os
import re
from pathlib import Path
import numpy as np
import soundfile as sf
import streamlit as st
from document_to_podcast.inference.text_to_speech import text_to_speech
from document_to_podcast.preprocessing import DATA_LOADERS, DATA_CLEANERS
from document_to_podcast.inference.model_loaders import (
load_llama_cpp_model,
load_tts_model,
)
from document_to_podcast.config import DEFAULT_PROMPT, DEFAULT_SPEAKERS, Speaker
from document_to_podcast.inference.text_to_text import text_to_text_stream
from document_to_podcast.utils import stack_audio_segments
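# Cache the loaded models with st.cache_resource so Streamlit reuses the same
# in-memory model across reruns instead of reloading the weights on every interaction.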
@st.cache_resource
def load_text_to_text_model():
return load_llama_cpp_model(
model_id="bartowski/Qwen2.5-3B-Instruct-GGUF/Qwen2.5-3B-Instruct-f16.gguf"
)
@st.cache_resource
def load_text_to_speech_model():
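    # On the hosted Space (HF_SPACE=TRUE) load the Kokoro voice model;
    # otherwise fall back to the OuteTTS GGUF model for local runs.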
if os.environ.get("HF_SPACE") == "TRUE":
return load_tts_model("hexgrad/Kokoro-82M/kokoro-v0_19.pth")
else:
return load_tts_model("OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf")
def numpy_to_wav(audio_array: np.ndarray, sample_rate: int) -> io.BytesIO:
"""
Convert a numpy array to audio bytes in .wav format, ready to save into a file.
"""
wav_io = io.BytesIO()
sf.write(wav_io, audio_array, sample_rate, format="WAV")
wav_io.seek(0)
return wav_io
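# Session-state keys used to persist the generated script, the audio segments,
# and the "generate" button state across Streamlit reruns.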
script = "script"
audio = "audio"
gen_button = "generate podcast button"
if script not in st.session_state:
st.session_state[script] = ""
if audio not in st.session_state:
st.session_state.audio = []
if gen_button not in st.session_state:
st.session_state[gen_button] = False
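# Callback for the "Generate Podcast" button: flips the session-state flag so the
# download section at the bottom of the page is rendered after generation.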
def gen_button_clicked():
st.session_state[gen_button] = True
st.title("Document To Podcast")
st.header("Upload a File")
uploaded_file = st.file_uploader(
"Choose a file", type=["pdf", "html", "txt", "docx", "md"]
)
st.header("Or Enter a Website URL")
url = st.text_input("URL", placeholder="https://blog.mozilla.ai/...")
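# Step 1: load the uploaded file (or the given URL) and clean the extracted text.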
if uploaded_file is not None or url:
st.divider()
st.header("Loading and Cleaning Data")
st.markdown(
"[Docs for this Step](https://mozilla-ai.github.io/document-to-podcast/step-by-step-guide/#step-1-document-pre-processing)"
)
st.divider()
if uploaded_file:
extension = Path(uploaded_file.name).suffix
raw_text = DATA_LOADERS[extension](uploaded_file)
else:
extension = ".html"
raw_text = DATA_LOADERS["url"](url)
col1, col2 = st.columns(2)
with col1:
st.subheader("Raw Text")
st.text_area(
f"Number of characters before cleaning: {len(raw_text)}",
f"{raw_text[:500]} . . .",
)
clean_text = DATA_CLEANERS[extension](raw_text)
with col2:
st.subheader("Cleaned Text")
st.text_area(
f"Number of characters after cleaning: {len(clean_text)}",
f"{clean_text[:500]} . . .",
)
st.session_state["clean_text"] = clean_text
st.divider()
if "clean_text" in st.session_state:
clean_text = st.session_state["clean_text"]
st.divider()
st.header("Downloading and Loading models")
st.markdown(
"[Docs for this Step](https://mozilla-ai.github.io/document-to-podcast/step-by-step-guide/#step-2-podcast-script-generation)"
)
st.divider()
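    # Step 2: load the (cached) text-to-text and text-to-speech models.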
text_model = load_text_to_text_model()
speech_model = load_text_to_speech_model()
if os.environ.get("HF_SPACE") == "TRUE":
tts_link = "- [hexgrad/Kokoro-82M](https://huggingface.co./hexgrad/Kokoro-82M)"
SPEAKERS = [
{
"id": 1,
"name": "Sarah",
"description": "The main host. She explains topics clearly using anecdotes and analogies, teaching in an engaging and captivating way.",
"voice_profile": "af_sarah",
},
{
"id": 2,
"name": "Michael",
"description": "The co-host. He keeps the conversation on track, asks curious follow-up questions, and reacts with excitement or confusion, often using interjections like hmm or umm.",
"voice_profile": "am_michael",
},
]
else:
tts_link = "- [OuteAI/OuteTTS-0.2-500M](https://huggingface.co./OuteAI/OuteTTS-0.2-500M-GGUF)"
        SPEAKERS = DEFAULT_SPEAKERS
st.markdown(
"For this demo, we are using the following models: \n"
"- [Qwen2.5-3B-Instruct](https://huggingface.co./bartowski/Qwen2.5-3B-Instruct-GGUF)\n"
f"{tts_link}\n"
)
st.markdown(
"You can check the [Customization Guide](https://mozilla-ai.github.io/document-to-podcast/customization/)"
" for more information on how to use different models."
)
# ~4 characters per token is considered a reasonable default.
max_characters = text_model.n_ctx() * 4
if len(clean_text) > max_characters:
st.warning(
f"Input text is too big ({len(clean_text)})."
f" Using only a subset of it ({max_characters})."
)
clean_text = clean_text[:max_characters]
st.divider()
st.header("Podcast generation")
st.markdown(
"[Docs for this Step](https://mozilla-ai.github.io/document-to-podcast/step-by-step-guide/#step-3-audio-podcast-generation)"
)
st.divider()
st.subheader("Speaker configuration")
for s in SPEAKERS:
s.pop("id", None)
speakers = st.data_editor(SPEAKERS, num_rows="dynamic")
if st.button("Generate Podcast", on_click=gen_button_clicked):
for n, speaker in enumerate(speakers):
speaker["id"] = n + 1
speakers_str = "\n".join(
str(Speaker.model_validate(speaker))
for speaker in speakers
if all(
speaker.get(x, None) for x in ["name", "description", "voice_profile"]
)
)
system_prompt = DEFAULT_PROMPT.replace("{SPEAKERS}", speakers_str)
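        # Step 3: stream the script from the LLM and, each time a completed line
        # mentions a speaker, synthesize that turn to audio right away.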
with st.spinner("Generating Podcast..."):
text = ""
for chunk in text_to_text_stream(
clean_text, text_model, system_prompt=system_prompt.strip()
):
text += chunk
if text.endswith("\n") and "Speaker" in text:
st.session_state.script += text
st.write(text)
speaker_id = re.search(r"Speaker (\d+)", text).group(1)
voice_profile = next(
speaker["voice_profile"]
for speaker in speakers
if speaker["id"] == int(speaker_id)
)
with st.spinner("Generating Audio..."):
speech = text_to_speech(
text.split(f'"Speaker {speaker_id}":')[-1],
speech_model,
voice_profile,
)
st.audio(speech, sample_rate=speech_model.sample_rate)
st.session_state.audio.append(speech)
text = ""
st.session_state.script += "}"
if st.session_state[gen_button]:
audio_np = stack_audio_segments(
st.session_state.audio, speech_model.sample_rate, silence_pad=0.0
)
audio_wav = numpy_to_wav(audio_np, speech_model.sample_rate)
if st.download_button(
label="Save Podcast to audio file",
data=audio_wav,
file_name="podcast.wav",
):
st.markdown("Podcast saved to disk!")
if st.download_button(
label="Save Podcast script to text file",
data=st.session_state.script,
file_name="script.txt",
):
st.markdown("Script saved to disk!")