File size: 7,843 Bytes
9d0299c
 
c322dc7
 
9d0299c
 
 
5af6319
9d0299c
 
 
9ae4486
9d0299c
 
 
9ae4486
9d0299c
 
 
6b39985
9d0299c
 
 
 
 
c322dc7
9d0299c
 
 
 
 
c322dc7
 
 
 
9d0299c
 
5af6319
 
 
 
 
 
 
 
 
 
9d0299c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a4a7f53
 
 
 
9d0299c
 
 
 
 
 
 
a4a7f53
 
 
 
 
 
9d0299c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c322dc7
 
f63a450
 
 
 
 
 
 
 
 
 
 
 
 
 
c322dc7
 
f63a450
c322dc7
9d0299c
 
c322dc7
 
9d0299c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f63a450
9d0299c
f63a450
9d0299c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9ae4486
 
9d0299c
 
5af6319
9d0299c
 
5af6319
c322dc7
5af6319
 
 
 
 
 
 
9d0299c
 
5af6319
 
 
 
 
9d0299c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
"""Streamlit app for converting documents to podcasts."""

import io
import os
import re
from pathlib import Path

import numpy as np
import soundfile as sf
import streamlit as st

from document_to_podcast.inference.text_to_speech import text_to_speech
from document_to_podcast.preprocessing import DATA_LOADERS, DATA_CLEANERS
from document_to_podcast.inference.model_loaders import (
    load_llama_cpp_model,
    load_tts_model,
)
from document_to_podcast.config import DEFAULT_PROMPT, DEFAULT_SPEAKERS, Speaker
from document_to_podcast.inference.text_to_text import text_to_text_stream
from document_to_podcast.utils import stack_audio_segments


@st.cache_resource
def load_text_to_text_model():
    """Load the llama.cpp text-to-text model used to write the podcast script.

    The function is wrapped in ``st.cache_resource``, so the loaded model is
    cached by Streamlit rather than being loaded again on each call.
    """
    model_id = "bartowski/Qwen2.5-3B-Instruct-GGUF/Qwen2.5-3B-Instruct-f16.gguf"
    return load_llama_cpp_model(model_id=model_id)


@st.cache_resource
def load_text_to_speech_model():
    """Load the text-to-speech model that fits the current environment.

    Kokoro is loaded when the ``HF_SPACE`` environment variable equals
    ``"TRUE"``; in every other case the OuteTTS GGUF model is loaded.
    """
    on_hf_space = os.environ.get("HF_SPACE") == "TRUE"
    model_id = (
        "hexgrad/Kokoro-82M/kokoro-v0_19.pth"
        if on_hf_space
        else "OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf"
    )
    return load_tts_model(model_id)


def numpy_to_wav(audio_array: np.ndarray, sample_rate: int) -> io.BytesIO:
    """
    Encode a numpy audio array as WAV data held in an in-memory buffer.

    The returned buffer is rewound to its start, so it can be read (or saved
    into a file) right away.
    """
    buffer = io.BytesIO()
    sf.write(buffer, audio_array, sample_rate, format="WAV")
    # Writing leaves the cursor at the end of the data; rewind for readers.
    buffer.seek(0)
    return buffer


# Keys under which this app keeps its data in ``st.session_state``.
script = "script"
audio = "audio"
gen_button = "generate podcast button"

# Seed each key only when it is missing, so existing values are left alone.
for _state_key, _initial_value in ((script, ""), (audio, []), (gen_button, False)):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value


def gen_button_clicked() -> None:
    """Record in session state that "Generate Podcast" has been clicked.

    Used as the ``on_click`` callback of the "Generate Podcast" button; the
    flag is later read to decide whether the download buttons are shown.
    """
    st.session_state[gen_button] = True


st.title("Document To Podcast")

st.header("Upload a File")

uploaded_file = st.file_uploader(
    "Choose a file", type=["pdf", "html", "txt", "docx", "md"]
)

st.header("Or Enter a Website URL")
url = st.text_input("URL", placeholder="https://blog.mozilla.ai/...")

# Step 1: load the raw document and clean it. An uploaded file takes
# precedence over a URL when both are provided.
if uploaded_file is not None or url:
    st.divider()
    st.header("Loading and Cleaning Data")
    st.markdown(
        "[Docs for this Step](https://mozilla-ai.github.io/document-to-podcast/step-by-step-guide/#step-1-document-pre-processing)"
    )
    st.divider()

    if uploaded_file is not None:
        # Lowercase the suffix so that e.g. "REPORT.PDF" resolves to the same
        # loader/cleaner as "report.pdf" instead of raising KeyError.
        # NOTE(review): assumes DATA_LOADERS / DATA_CLEANERS are keyed by
        # lowercase dotted suffixes (as the ".html" key below suggests) - confirm.
        extension = Path(uploaded_file.name).suffix.lower()
        raw_text = DATA_LOADERS[extension](uploaded_file)
    else:
        # Content fetched from a URL is cleaned with the ".html" cleaner.
        extension = ".html"
        raw_text = DATA_LOADERS["url"](url)

    col1, col2 = st.columns(2)

    # Left column: preview of the first 500 characters before cleaning.
    with col1:
        st.subheader("Raw Text")
        st.text_area(
            f"Number of characters before cleaning: {len(raw_text)}",
            f"{raw_text[:500]} . . .",
        )

    clean_text = DATA_CLEANERS[extension](raw_text)
    # Right column: the same preview after cleaning.
    with col2:
        st.subheader("Cleaned Text")
        st.text_area(
            f"Number of characters after cleaning: {len(clean_text)}",
            f"{clean_text[:500]} . . .",
        )
    # Stored in session state; the generation step reads it back from there.
    st.session_state["clean_text"] = clean_text

st.divider()

# Steps 2 and 3: load the models, generate the script line by line, synthesise
# audio for each line, and offer the results for download.
if "clean_text" in st.session_state:
    clean_text = st.session_state["clean_text"]

    st.divider()
    st.header("Downloading and Loading models")
    st.markdown(
        "[Docs for this Step](https://mozilla-ai.github.io/document-to-podcast/step-by-step-guide/#step-2-podcast-script-generation)"
    )
    st.divider()

    text_model = load_text_to_text_model()
    speech_model = load_text_to_speech_model()

    # The speaker presets are selected with the same HF_SPACE check that
    # load_text_to_speech_model() uses to pick the TTS model.
    if os.environ.get("HF_SPACE") == "TRUE":
        tts_link = "- [hexgrad/Kokoro-82M](https://huggingface.co./hexgrad/Kokoro-82M)"
        SPEAKERS = [
            {
                "id": 1,
                "name": "Sarah",
                "description": "The main host. She explains topics clearly using anecdotes and analogies, teaching in an engaging and captivating way.",
                "voice_profile": "af_sarah",
            },
            {
                "id": 2,
                "name": "Michael",
                "description": "The co-host. He keeps the conversation on track, asks curious follow-up questions, and reacts with excitement or confusion, often using interjections like hmm or umm.",
                "voice_profile": "am_michael",
            },
        ]
    else:
        tts_link = "- [OuteAI/OuteTTS-0.2-500M](https://huggingface.co./OuteAI/OuteTTS-0.2-500M-GGUF)"
        # Fixed: this used to assign to the misspelled name `SPEARES`, which
        # left SPEAKERS undefined (NameError below) outside of HF Spaces.
        SPEAKERS = DEFAULT_SPEAKERS

    st.markdown(
        "For this demo, we are using the following models: \n"
        "- [Qwen2.5-3B-Instruct](https://huggingface.co./bartowski/Qwen2.5-3B-Instruct-GGUF)\n"
        f"{tts_link}\n"
    )
    st.markdown(
        "You can check the [Customization Guide](https://mozilla-ai.github.io/document-to-podcast/customization/)"
        " for more information on how to use different models."
    )

    # ~4 characters per token is considered a reasonable default.
    max_characters = text_model.n_ctx() * 4
    if len(clean_text) > max_characters:
        st.warning(
            f"Input text is too big ({len(clean_text)})."
            f" Using only a subset of it ({max_characters})."
        )
        clean_text = clean_text[:max_characters]

    st.divider()
    st.header("Podcast generation")
    st.markdown(
        "[Docs for this Step](https://mozilla-ai.github.io/document-to-podcast/step-by-step-guide/#step-3-audio-podcast-generation)"
    )
    st.divider()

    st.subheader("Speaker configuration")
    # Show the speakers without their "id" field; ids are re-assigned from the
    # row order once generation starts. Copies are built so that the shared
    # DEFAULT_SPEAKERS entries are not modified in place.
    editable_speakers = [
        {key: value for key, value in s.items() if key != "id"} for s in SPEAKERS
    ]
    speakers = st.data_editor(editable_speakers, num_rows="dynamic")

    if st.button("Generate Podcast", on_click=gen_button_clicked):
        # Start from a clean slate so that a new run is not appended to the
        # script and audio of a previous run.
        st.session_state.script = ""
        st.session_state.audio = []

        for n, speaker in enumerate(speakers):
            speaker["id"] = n + 1
        # Only rows with all three fields filled in are described to the model.
        speakers_str = "\n".join(
            str(Speaker.model_validate(speaker))
            for speaker in speakers
            if all(
                speaker.get(x, None) for x in ["name", "description", "voice_profile"]
            )
        )
        system_prompt = DEFAULT_PROMPT.replace("{SPEAKERS}", speakers_str)

        # Lookup table from the ids assigned above to the voice of each row.
        voice_profiles = {
            speaker["id"]: speaker.get("voice_profile") for speaker in speakers
        }
        speaker_pattern = re.compile(r"Speaker (\d+)")

        with st.spinner("Generating Podcast..."):
            text = ""
            for chunk in text_to_text_stream(
                clean_text, text_model, system_prompt=system_prompt.strip()
            ):
                text += chunk
                # Keep buffering until a full line naming its speaker has
                # arrived. Requiring the numbered match (not just the word
                # "Speaker") avoids calling .group() on a failed search.
                if not text.endswith("\n"):
                    continue
                speaker_match = speaker_pattern.search(text)
                if speaker_match is None:
                    continue

                st.session_state.script += text
                st.write(text)

                speaker_id = speaker_match.group(1)
                voice_profile = voice_profiles.get(int(speaker_id))
                if not voice_profile:
                    # The model referred to a speaker that is not configured
                    # (or has no voice); keep the script line, skip its audio.
                    st.warning(
                        f"No voice profile configured for Speaker {speaker_id};"
                        " skipping audio for this line."
                    )
                    text = ""
                    continue

                with st.spinner("Generating Audio..."):
                    speech = text_to_speech(
                        # Speak only what follows the '"Speaker N":' prefix.
                        text.split(f'"Speaker {speaker_id}":')[-1],
                        speech_model,
                        voice_profile,
                    )
                st.audio(speech, sample_rate=speech_model.sample_rate)

                st.session_state.audio.append(speech)
                text = ""
        # Appended once after the stream ends.
        # NOTE(review): presumably closes a JSON object whose final "}" line
        # never passes the speaker check above - confirm against the prompt.
        st.session_state.script += "}"

    # The flag is set by the button callback and kept in session state, so the
    # download buttons remain available on reruns after generation.
    if st.session_state[gen_button]:
        audio_np = stack_audio_segments(
            st.session_state.audio, speech_model.sample_rate, silence_pad=0.0
        )
        audio_wav = numpy_to_wav(audio_np, speech_model.sample_rate)
        if st.download_button(
            label="Save Podcast to audio file",
            data=audio_wav,
            file_name="podcast.wav",
        ):
            st.markdown("Podcast saved to disk!")

        if st.download_button(
            label="Save Podcast script to text file",
            data=st.session_state.script,
            file_name="script.txt",
        ):
            st.markdown("Script saved to disk!")