merge dev into main (#13)
* add description to readme
* sound effects v2 (#15)
* use TTS with timestamps
* ask LLM to place sound effects in arbitrary place in the text
* lots of refactoring
* Improve sound effect prompt (#21)
Improve sound effects generation:
* upd prompt
* add fade-in and fade-out
* make effects quieter
* make effects at least 1 second long
* bugfix in effects regex
* Feature/emotions checking (#23)
- update text preprocessing for TTS
- add left and right text contexts to TTS model
- upd TTS params selection
- upd effects prompt
- upd OPENAI_MAX_PARALLEL
* Visualization (#22)
update visualizations
---------
Co-authored-by: Skidan Olya <[email protected]>
Co-authored-by: Maksim Liutisch <[email protected]>
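
The "Improve sound effects generation" items above (fade-in and fade-out, quieter effects, a minimum length of 1 second) describe post-processing applied to each generated effect clip. Below is a minimal, illustrative sketch of that kind of post-processing using pydub; the constants and the function name are assumptions for illustration, not the repository's actual code.

    # Illustrative sketch only: post-process a generated sound effect as the
    # commit message describes (fade-in/fade-out, lower volume, >= 1 s length).
    # Constants and the function name are assumptions, not the repo's code.
    from pydub import AudioSegment

    MIN_EFFECT_MS = 1_000   # make effects at least 1 second long
    FADE_MS = 300           # fade-in / fade-out duration
    GAIN_REDUCTION_DB = -6  # make effects quieter than the speech track

    def postprocess_effect(path: str) -> AudioSegment:
        effect = AudioSegment.from_file(path)
        # pad short clips with silence so they last at least MIN_EFFECT_MS
        if len(effect) < MIN_EFFECT_MS:
            effect = effect + AudioSegment.silent(duration=MIN_EFFECT_MS - len(effect))
        # lower the volume and smooth the edges
        effect = effect.apply_gain(GAIN_REDUCTION_DB)
        return effect.fade_in(FADE_MS).fade_out(FADE_MS)
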
- .gitignore +1 -2
- README.md +3 -17
- app.py +56 -34
- data/11labs_available_tts_voices.reviewed.csv +1 -1
- reviewed_voices.xlsx → data/reviewed_voices.xlsx +0 -0
- voices_to_consider.xlsx → data/voices_to_consider.xlsx +0 -0
- makefile +8 -0
- pg.ipynb → notebooks/eda_voices.ipynb +18 -467
- filter_voices.ipynb → notebooks/filter_voices.ipynb +9 -0
- notebooks/playground.ipynb +0 -0
- pyproject.toml +8 -0
- requirements.txt +3 -1
- scripts/add_voices.py +3 -8
- scripts/export_available_voices.py +1 -4
- src/audio_generators.py +0 -315
- src/builder.py +567 -21
- src/config.py +14 -4
- src/emotions/generation.py +0 -208
- src/emotions/prompts.py +0 -160
- src/emotions/utils.py +0 -75
- src/generate_emotional_voice.py +15 -21
- src/lc_callbacks.py +3 -7
- src/preprocess_tts_emotions_chain.py +73 -0
- src/prompts.py +236 -89
- src/schemas.py +234 -0
- src/select_voice_chain.py +7 -20
- src/sound_effects_design.py +99 -0
- src/text_modification_chain.py +34 -0
- src/text_split_chain.py +4 -56
- src/tts.py +39 -31
- src/utils.py +146 -8
- src/web/constructor.py +45 -0
- src/web/utils.py +345 -0
- src/web/variables.py +517 -0
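
The "add left and right text contexts to TTS model" item in the commit message refers to conditioning each phrase's synthesis on the text surrounding it, so voice and intonation stay consistent across phrase boundaries. A minimal sketch of the idea with the ElevenLabs Python SDK follows; treating previous_text/next_text as accepted request parameters is an assumption here, as are the helper name and the context window sizes.

    # Illustrative sketch only: pass left/right text context to the TTS request.
    # The previous_text / next_text parameters are assumed to be accepted by the
    # ElevenLabs SDK's convert() call; names and window sizes are illustrative.
    from elevenlabs.client import ElevenLabs

    client = ElevenLabs(api_key="...")  # placeholder key

    def synthesize_with_context(phrases: list[str], idx: int, voice_id: str) -> bytes:
        audio_chunks = client.text_to_speech.convert(
            voice_id=voice_id,
            text=phrases[idx],
            previous_text=" ".join(phrases[:idx])[-300:],  # left context
            next_text=" ".join(phrases[idx + 1:])[:300],   # right context
        )
        return b"".join(audio_chunks)  # convert() yields audio bytes in chunks
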
.gitignore
CHANGED
@@ -4,7 +4,6 @@ venv
 .python-version
 .DS_Store
 
-data
-data/audiobooks
+data/**/
 
 .env
README.md
CHANGED
@@ -10,22 +10,8 @@ pinned: false
 python_version: 3.11
 ---
 
-
-- add context
-- audio effects
-- add context
-- filter, apply only for long phrases
-- improve UI
-- show character parts
-- testing
-- eval current execution time
-- optimizations
-- combine sequential phrases of same character in single phrase
-- support large texts. use batching. problem: how to ensure same characters?
-- can detect characters in first prompt, then split text in each batch into character phrases
-- probably split large phrases into smaller ones
-- identify unknown characters
-- use LLM to recognize characters for a given text and provide descriptions detailed enough to select appropriate voice
-
+## Description
+
+Automatically generate audiobooks from the text input. Automatically detect characters and map them to appropriate voices. Use text-to-speech models combined with text-to-audio-effect models to create an immersive listening experience
+
+This project focuses on the automatic generation of audiobooks from text input, offering an immersive experience. The system intelligently detects characters in the text and assigns them distinct, appropriate voices using Large Language Model. To enhance the auditory experience, the project incorporates text-to-audio-effect models, adding relevant background sounds and audio effects that match the context of the narrative. The combination of natural-sounding speech synthesis and environmental sound design creates a rich, engaging audiobook experience that adapts seamlessly to different genres and styles of writing, making the storytelling more vivid and captivating for listeners.
app.py
CHANGED
@@ -1,6 +1,5 @@
 import os
 from pathlib import Path
-from typing import List
 
 import gradio as gr
 from dotenv import load_dotenv
@@ -8,9 +7,11 @@ from langchain_community.document_loaders import PyPDFLoader
 
 load_dotenv()
 
-from src.builder import AudiobookBuilder
-from src.config import logger, FILE_SIZE_MAX, MAX_TEXT_LEN, DESCRIPTION
 from data import samples_to_split as samples
+from src.builder import AudiobookBuilder
+from src.config import FILE_SIZE_MAX, MAX_TEXT_LEN, logger
+from src.web.utils import create_status_html
+from src.web.variables import DESCRIPTION_JS, GRADIO_THEME, STATUS_DISPLAY_HTML, VOICE_UPLOAD_JS
 
 
 def get_auth_params():
@@ -27,13 +28,10 @@ def parse_pdf(file_path):
 
 
 def load_text_from_file(uploaded_file):
-    # Save the uploaded file temporarily to check its size
     temp_file_path = uploaded_file.name
 
     if os.path.getsize(temp_file_path) > FILE_SIZE_MAX * 1024 * 1024:
-        raise ValueError(
-            f"The uploaded file exceeds the size limit of {FILE_SIZE_MAX} MB."
-        )
+        raise ValueError(f"The uploaded file exceeds the size limit of {FILE_SIZE_MAX} MB.")
 
     if uploaded_file.name.endswith(".txt"):
         with open(temp_file_path, "r", encoding="utf-8") as file:
@@ -46,45 +44,58 @@ def load_text_from_file(uploaded_file):
     return text
 
 
+async def audiobook_builder(
     text: str,
     uploaded_file,
     generate_effects: bool,
+    use_user_voice: bool,
+    voice_id: str | None = None,
+):
+    builder = AudiobookBuilder()
+
     if uploaded_file is not None:
         try:
             text = load_text_from_file(uploaded_file=uploaded_file)
         except Exception as e:
             logger.exception(e)
+            msg = "Failed to load text from the provided document"
+            gr.Warning(msg)
+            yield None, str(e), builder.html_generator.generate_error(msg)
+            return
+
+    if not text:
+        logger.info(f"No text was passed. can't generate an audiobook")
+        msg = 'Please provide the text to generate audiobook from'
+        gr.Warning(msg)
+        yield None, "", builder.html_generator.generate_error(msg)
+        return
 
     if (text_len := len(text)) > MAX_TEXT_LEN:
+        msg = (
            f"Input text length of {text_len} characters "
            f"exceeded current limit of {MAX_TEXT_LEN} characters. "
            "Please input a shorter text."
        )
+        logger.info(msg)
+        gr.Warning(msg)
+        yield None, "", builder.html_generator.generate_error(msg)
+        return
 
+    async for stage in builder.run(text, generate_effects, use_user_voice, voice_id):
+        yield stage
 
 
 def refresh():
-    return None, None, None
+    return None, None, None, STATUS_DISPLAY_HTML
 
 
-with gr.Blocks(title="Audiobooks Generation") as ui:
-    gr.Markdown(DESCRIPTION)
+with gr.Blocks(js=DESCRIPTION_JS, theme=GRADIO_THEME) as ui:
     with gr.Row(variant="panel"):
         text_input = gr.Textbox(label="Enter the book text here", lines=15)
         file_input = gr.File(
             label="Upload a text file or PDF",
             file_types=[".txt", ".pdf"],
+            visible=True,
         )
 
     examples = gr.Examples(
@@ -104,33 +115,49 @@ with gr.Blocks(title="Audiobooks Generation") as ui:
         ],
     )
 
-    audio_output = gr.Audio(
-        label='Generated audio. Please wait for the waveform to appear, before hitting "Play"',
-        type="filepath",
-    )
-    # error output is hidden initially
     error_output = gr.Textbox(label="Error Message", interactive=False, visible=False)
 
     effects_generation_checkbox = gr.Checkbox(
+        label="Add sound effects",
         value=False,
         info="Select if you want to add occasional sound effect to the audiobook",
     )
 
+    use_voice_checkbox = gr.Checkbox(
+        label="Use my voice",
+        value=False,
+        info="Select if you want to use your voice for whole or part of the audiobook (Generations may take longer than usual)",
+    )
+
+    submit_button = gr.Button("Generate the audiobook", variant="primary")
+
     with gr.Row(variant="panel"):
+        add_voice_btn = gr.Button("Add my voice", variant="primary")
         refresh_button = gr.Button("Refresh", variant="secondary")
 
+    voice_result = gr.Textbox(visible=False, interactive=False, label="Processed Result")
+    status_display = gr.HTML(value=STATUS_DISPLAY_HTML, label="Generation Status")
+    audio_output = gr.Audio(
+        label='Generated audio. Please wait for the waveform to appear, before hitting "Play"',
+        type="filepath",
+    )
+
+    # callbacks
+
+    add_voice_btn.click(fn=None, inputs=None, outputs=voice_result, js=VOICE_UPLOAD_JS)
     submit_button.click(
+        fn=audiobook_builder,
        inputs=[
            text_input,
            file_input,
            effects_generation_checkbox,
+            use_voice_checkbox,
+            voice_result,
        ],  # Include the uploaded file as an input
        outputs=[
            audio_output,
            error_output,
+            status_display,
        ],  # Include the audio output and error message output
     )
     refresh_button.click(
@@ -142,21 +169,16 @@ with gr.Blocks(title="Audiobooks Generation") as ui:
             file_input,
         ],  # Reset audio output, error message, and uploaded file
     )
-
-    # Hide error message dynamically when input is received
     text_input.change(
         fn=lambda _: gr.update(visible=False),  # Hide the error field
         inputs=[text_input],
         outputs=error_output,
     )
-
     file_input.change(
         fn=lambda _: gr.update(visible=False),  # Hide the error field
         inputs=[file_input],
         outputs=error_output,
     )
-
-    # To clear error field when refreshing
     refresh_button.click(
         fn=lambda _: gr.update(visible=False),  # Hide the error field
         inputs=[],
data/11labs_available_tts_voices.reviewed.csv
CHANGED
@@ -19,7 +19,7 @@ teAOBFSeynXfbyNgq6Ec,Ally - Curious and Chill,https://storage.googleapis.com/ele
 IKne3meq5aSn9XLyUdCD,Charlie,https://storage.googleapis.com/eleven-public-prod/premade/voices/IKne3meq5aSn9XLyUdCD/102de6f2-22ed-43e0-a1f1-111fa75c5481.mp3,ok,,,FALSE,FALSE,australian,natural,middle_aged,male,conversational,,
 cjVigY5qzO86Huf0OWal,Eric,https://storage.googleapis.com/eleven-public-prod/premade/voices/cjVigY5qzO86Huf0OWal/d098fda0-6456-4030-b3d8-63aa048c9070.mp3,medium,,,FALSE,FALSE,american,friendly,middle_aged,male,conversational,,
 BFUk567oZITYKwOqegEq,Riley - loud and intense,https://storage.googleapis.com/eleven-public-prod/UwDtqCF44YaL77wxb8DVQlHT5Gp1/voices/60G0VdAP3WBQQbE6tSkT/ecc00def-2543-4b50-b93d-5d4b6c7dca33.mp3,very bad,,admin,FALSE,FALSE,american,,middle_aged,male,conversational,,intense
-EkuRA6XL9UbflTWEtNbQ,Middle age Southern Male,https://storage.googleapis.com/eleven-public-prod/0gh9bWjaVmNOvQJVcRddxeYIS2z1/voices/t5Oo3tZSuEZt6BD2VGV4/5c0177c5-46bd-414c-abfd-6cd6d5677f08.mp3,
+EkuRA6XL9UbflTWEtNbQ,Middle age Southern Male,https://storage.googleapis.com/eleven-public-prod/0gh9bWjaVmNOvQJVcRddxeYIS2z1/voices/t5Oo3tZSuEZt6BD2VGV4/5c0177c5-46bd-414c-abfd-6cd6d5677f08.mp3,bad,,admin,FALSE,FALSE,american,,middle_aged,male,conversational,,casual
 MP7UPhn7eVWqCGJGIh6Q,Aaron Patrick - Fun-Upbeat,https://storage.googleapis.com/eleven-public-prod/database/user/ktIm5hvnGlc2TVlwOiZmbmw9kHy2/voices/MP7UPhn7eVWqCGJGIh6Q/NFiMZncqQJ0IFTzFGbwQ.mp3,ok,,admin,FALSE,FALSE,american,,middle_aged,male,conversational,en,upbeat
 RPEIZnKMqlQiZyZd1Dae,Christopher - friendly guy next door,https://storage.googleapis.com/eleven-public-prod/database/user/HURZYaLa4shZEqiT75qd5tyEsSr1/voices/RPEIZnKMqlQiZyZd1Dae/FwLtZ4mCBHV0eLjbUM8Y.mp3,ok,,admin,FALSE,FALSE,american,,middle_aged,male,conversational,en,casual
 Tx7VLgfksXHVnoY6jDGU,"Conversational Joe - A chatty casual voice, British RP male",https://storage.googleapis.com/eleven-public-prod/database/user/wf6Rmje05ZbqeHYfK82ThsPKouC2/voices/Tx7VLgfksXHVnoY6jDGU/ab4X4F9RcNSeTwBS8KS9.mp3,ok,,admin,FALSE,FALSE,british,,middle_aged,male,conversational,en,casual
reviewed_voices.xlsx → data/reviewed_voices.xlsx
RENAMED
File without changes

voices_to_consider.xlsx → data/voices_to_consider.xlsx
RENAMED
File without changes
makefile
ADDED
@@ -0,0 +1,8 @@
+# install python dependencies in current environment
+install:
+	pip install -r requirements.txt
+
+# format python files
+format:
+	black .
+	isort .
pg.ipynb β notebooks/eda_voices.ipynb
RENAMED
@@ -1,5 +1,21 @@
|
|
1 |
{
|
2 |
"cells": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
"execution_count": 1,
|
@@ -12,29 +28,12 @@
|
|
12 |
},
|
13 |
{
|
14 |
"cell_type": "code",
|
15 |
-
"execution_count":
|
16 |
"metadata": {},
|
17 |
"outputs": [],
|
18 |
"source": [
|
19 |
-
"import os\n",
|
20 |
-
"\n",
|
21 |
"import dotenv\n",
|
22 |
-
"import pandas as pd
|
23 |
-
"from httpx import Timeout\n",
|
24 |
-
"from pydantic import BaseModel\n",
|
25 |
-
"from langchain_core.prompts import (\n",
|
26 |
-
" ChatPromptTemplate,\n",
|
27 |
-
" SystemMessagePromptTemplate,\n",
|
28 |
-
" HumanMessagePromptTemplate,\n",
|
29 |
-
")\n",
|
30 |
-
"from langchain_openai import ChatOpenAI\n",
|
31 |
-
"from langchain_community.callbacks import get_openai_callback\n",
|
32 |
-
"\n",
|
33 |
-
"import data.samples_to_split as samples\n",
|
34 |
-
"\n",
|
35 |
-
"from src.lc_callbacks import LCMessageLoggerAsync\n",
|
36 |
-
"from src.utils import GPTModels\n",
|
37 |
-
"from src.text_split_chain import create_split_text_chain"
|
38 |
]
|
39 |
},
|
40 |
{
|
@@ -830,268 +829,6 @@
|
|
830 |
"outputs": [],
|
831 |
"source": []
|
832 |
},
|
833 |
-
{
|
834 |
-
"cell_type": "markdown",
|
835 |
-
"metadata": {},
|
836 |
-
"source": [
|
837 |
-
"## split text into character phrases"
|
838 |
-
]
|
839 |
-
},
|
840 |
-
{
|
841 |
-
"cell_type": "code",
|
842 |
-
"execution_count": 4,
|
843 |
-
"metadata": {},
|
844 |
-
"outputs": [
|
845 |
-
{
|
846 |
-
"name": "stderr",
|
847 |
-
"output_type": "stream",
|
848 |
-
"text": [
|
849 |
-
"2024-10-10 02:34:52,755 [INFO] audio-books (lc_callbacks.py): call to <failed to determine LLM> with 2 messages:\n",
|
850 |
-
"{'role': 'system', 'content': 'you are provided with the book sample.\\nplease rewrite it and insert xml tags indicating character to whom current phrase belongs.\\nfor example: <narrator>I looked at her</narrator><Jill>What are you looking at?</Jill>\\n\\nNotes:\\n- sometimes narrator is one of characters taking part in the action.\\nin this case use narrator\\'s name (if available) instead of \"narrator\"\\n- if it\\'s impossible to identify character name from the text provided, use codes \"c1\", \"c2\", etc,\\nwhere \"c\" prefix means character and number is used to enumerate unknown characters\\n- all quotes of direct speech must be attributed to characters, for example:\\n<Tom>βSheβs a nice girl,β</Tom><narrator>said Tom after a moment.</narrator>\\nmind that sometimes narrator could also be a character.\\n- use ALL available context to determine the character.\\nsometimes the character name becomes clear from the following phrases\\n- DO NOT include in your response anything except for the original text with character xml tags!!!\\n'}\n",
|
851 |
-
"{'role': 'human', 'content': 'Here is the book sample:\\n---\\nInside, the crimson room bloomed with light. Tom and Miss Baker sat at\\neither end of the long couch and she read aloud to him from the\\nSaturday Evening Postβthe words, murmurous and uninflected, running\\ntogether in a soothing tune. The lamplight, bright on his boots and\\ndull on the autumn-leaf yellow of her hair, glinted along the paper as\\nshe turned a page with a flutter of slender muscles in her arms.\\n\\nWhen we came in she held us silent for a moment with a lifted hand.\\n\\nβTo be continued,β she said, tossing the magazine on the table, βin\\nour very next issue.β\\n\\nHer body asserted itself with a restless movement of her knee, and she\\nstood up.\\n\\nβTen oβclock,β she remarked, apparently finding the time on the\\nceiling. βTime for this good girl to go to bed.β\\n\\nβJordanβs going to play in the tournament tomorrow,β explained Daisy,\\nβover at Westchester.β\\n\\nβOhβyouβre Jordan Baker.β\\n\\nI knew now why her face was familiarβits pleasing contemptuous\\nexpression had looked out at me from many rotogravure pictures of the\\nsporting life at Asheville and Hot Springs and Palm Beach. I had heard\\nsome story of her too, a critical, unpleasant story, but what it was I\\nhad forgotten long ago.\\n\\nβGood night,β she said softly. βWake me at eight, wonβt you.β\\n\\nβIf youβll get up.β\\n\\nβI will. Good night, Mr. Carraway. See you anon.β\\n\\nβOf course you will,β confirmed Daisy. βIn fact I think Iβll arrange a\\nmarriage. Come over often, Nick, and Iβll sort ofβohβfling you\\ntogether. You knowβlock you up accidentally in linen closets and push\\nyou out to sea in a boat, and all that sort of thingββ\\n\\nβGood night,β called Miss Baker from the stairs. βI havenβt heard a\\nword.β\\n\\nβSheβs a nice girl,β said Tom after a moment. βThey oughtnβt to let\\nher run around the country this way.β\\n\\nβWho oughtnβt to?β inquired Daisy coldly.\\n\\nβHer family.β\\n\\nβHer family is one aunt about a thousand years old. Besides, Nickβs\\ngoing to look after her, arenβt you, Nick? Sheβs going to spend lots\\nof weekends out here this summer. I think the home influence will be\\nvery good for her.β\\n\\nDaisy and Tom looked at each other for a moment in silence.\\n\\nβIs she from New York?β I asked quickly.\\n\\nβFrom Louisville. Our white girlhood was passed together there. Our\\nbeautiful whiteββ\\n\\nβDid you give Nick a little heart to heart talk on the veranda?β\\ndemanded Tom suddenly.\\n\\nβDid I?β She looked at me. βI canβt seem to remember, but I think we\\ntalked about the Nordic race. Yes, Iβm sure we did. It sort of crept\\nup on us and first thing you knowββ\\n\\nβDonβt believe everything you hear, Nick,β he advised me.\\n'}\n",
|
852 |
-
"2024-10-10 02:35:04,369 [INFO] httpx (_client.py): HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
|
853 |
-
"2024-10-10 02:35:04,383 [INFO] audio-books (lc_callbacks.py): raw LLM response: \"<narrator>Inside, the crimson room bloomed with light. Tom and Miss Baker sat at either end of the long couch and she read aloud to him from the Saturday Evening Postβthe words, murmurous and uninflected, running together in a soothing tune. The lamplight, bright on his boots and dull on the autumn-leaf yellow of her hair, glinted along the paper as she turned a page with a flutter of slender muscles in her arms.</narrator>\n",
|
854 |
-
"\n",
|
855 |
-
"<narrator>When we came in she held us silent for a moment with a lifted hand.</narrator>\n",
|
856 |
-
"\n",
|
857 |
-
"<Jordan>βTo be continued,β</Jordan> <narrator>she said, tossing the magazine on the table,</narrator> <Jordan>βin our very next issue.β</Jordan>\n",
|
858 |
-
"\n",
|
859 |
-
"<narrator>Her body asserted itself with a restless movement of her knee, and she stood up.</narrator>\n",
|
860 |
-
"\n",
|
861 |
-
"<Jordan>βTen oβclock,β</Jordan> <narrator>she remarked, apparently finding the time on the ceiling.</narrator> <Jordan>βTime for this good girl to go to bed.β</Jordan>\n",
|
862 |
-
"\n",
|
863 |
-
"<Daisy>βJordanβs going to play in the tournament tomorrow,β</Daisy> <narrator>explained Daisy,</narrator> <Daisy>βover at Westchester.β</Daisy>\n",
|
864 |
-
"\n",
|
865 |
-
"<narrator>βOhβyouβre Jordan Baker.β</narrator>\n",
|
866 |
-
"\n",
|
867 |
-
"<narrator>I knew now why her face was familiarβits pleasing contemptuous expression had looked out at me from many rotogravure pictures of the sporting life at Asheville and Hot Springs and Palm Beach. I had heard some story of her too, a critical, unpleasant story, but what it was I had forgotten long ago.</narrator>\n",
|
868 |
-
"\n",
|
869 |
-
"<Jordan>βGood night,β</Jordan> <narrator>she said softly.</narrator> <Jordan>βWake me at eight, wonβt you.β</Jordan>\n",
|
870 |
-
"\n",
|
871 |
-
"<Daisy>βIf youβll get up.β</Daisy>\n",
|
872 |
-
"\n",
|
873 |
-
"<Jordan>βI will. Good night, Mr. Carraway. See you anon.β</Jordan>\n",
|
874 |
-
"\n",
|
875 |
-
"<Daisy>βOf course you will,β</Daisy> <narrator>confirmed Daisy.</narrator> <Daisy>βIn fact I think Iβll arrange a marriage. Come over often, Nick, and Iβll sort ofβohβfling you together. You knowβlock you up accidentally in linen closets and push you out to sea in a boat, and all that sort of thingββ</Daisy>\n",
|
876 |
-
"\n",
|
877 |
-
"<Jordan>βGood night,β</Jordan> <narrator>called Miss Baker from the stairs.</narrator> <Jordan>βI havenβt heard a word.β</Jordan>\n",
|
878 |
-
"\n",
|
879 |
-
"<Tom>βSheβs a nice girl,β</Tom> <narrator>said Tom after a moment.</narrator> <Tom>βThey oughtnβt to let her run around the country this way.β</Tom>\n",
|
880 |
-
"\n",
|
881 |
-
"<Daisy>βWho oughtnβt to?β</Daisy> <narrator>inquired Daisy coldly.</narrator>\n",
|
882 |
-
"\n",
|
883 |
-
"<Tom>βHer family.β</Tom>\n",
|
884 |
-
"\n",
|
885 |
-
"<Daisy>βHer family is one aunt about a thousand years old. Besides, Nickβs going to look after her, arenβt you, Nick? Sheβs going to spend lots of weekends out here this summer. I think the home influence will be very good for her.β</Daisy>\n",
|
886 |
-
"\n",
|
887 |
-
"<narrator>Daisy and Tom looked at each other for a moment in silence.</narrator>\n",
|
888 |
-
"\n",
|
889 |
-
"<narrator>βIs she from New York?β</narrator> <narrator>I asked quickly.</narrator>\n",
|
890 |
-
"\n",
|
891 |
-
"<Daisy>βFrom Louisville. Our white girlhood was passed together there. Our beautiful whiteββ</Daisy>\n",
|
892 |
-
"\n",
|
893 |
-
"<Tom>βDid you give Nick a little heart to heart talk on the veranda?β</Tom> <narrator>demanded Tom suddenly.</narrator>\n",
|
894 |
-
"\n",
|
895 |
-
"<Daisy>βDid I?β</Daisy> <narrator>She looked at me.</narrator> <Daisy>βI canβt seem to remember, but I think we talked about the Nordic race. Yes, Iβm sure we did. It sort of crept up on us and first thing you knowββ</Daisy>\n",
|
896 |
-
"\n",
|
897 |
-
"<Tom>βDonβt believe everything you hear, Nick,β</Tom> <narrator>he advised me.</narrator>\"\n"
|
898 |
-
]
|
899 |
-
}
|
900 |
-
],
|
901 |
-
"source": [
|
902 |
-
"chain = create_split_text_chain(llm_model=GPTModels.GPT_4o)\n",
|
903 |
-
"# chain = create_split_text_chain(llm_model=GPTModels.GPT_4_TURBO_2024_04_09)\n",
|
904 |
-
"with get_openai_callback() as cb:\n",
|
905 |
-
" res = chain.invoke(\n",
|
906 |
-
" {\"text\": samples.GATSBY_2}, config={\"callbacks\": [LCMessageLoggerAsync()]}\n",
|
907 |
-
" )"
|
908 |
-
]
|
909 |
-
},
|
910 |
-
{
|
911 |
-
"cell_type": "code",
|
912 |
-
"execution_count": 5,
|
913 |
-
"metadata": {},
|
914 |
-
"outputs": [
|
915 |
-
{
|
916 |
-
"data": {
|
917 |
-
"text/plain": [
|
918 |
-
"SplitTextOutput(text_raw='Inside, the crimson room bloomed with light. Tom and Miss Baker sat at\\neither end of the long couch and she read aloud to him from the\\nSaturday Evening Postβthe words, murmurous and uninflected, running\\ntogether in a soothing tune. The lamplight, bright on his boots and\\ndull on the autumn-leaf yellow of her hair, glinted along the paper as\\nshe turned a page with a flutter of slender muscles in her arms.\\n\\nWhen we came in she held us silent for a moment with a lifted hand.\\n\\nβTo be continued,β she said, tossing the magazine on the table, βin\\nour very next issue.β\\n\\nHer body asserted itself with a restless movement of her knee, and she\\nstood up.\\n\\nβTen oβclock,β she remarked, apparently finding the time on the\\nceiling. βTime for this good girl to go to bed.β\\n\\nβJordanβs going to play in the tournament tomorrow,β explained Daisy,\\nβover at Westchester.β\\n\\nβOhβyouβre Jordan Baker.β\\n\\nI knew now why her face was familiarβits pleasing contemptuous\\nexpression had looked out at me from many rotogravure pictures of the\\nsporting life at Asheville and Hot Springs and Palm Beach. I had heard\\nsome story of her too, a critical, unpleasant story, but what it was I\\nhad forgotten long ago.\\n\\nβGood night,β she said softly. βWake me at eight, wonβt you.β\\n\\nβIf youβll get up.β\\n\\nβI will. Good night, Mr. Carraway. See you anon.β\\n\\nβOf course you will,β confirmed Daisy. βIn fact I think Iβll arrange a\\nmarriage. Come over often, Nick, and Iβll sort ofβohβfling you\\ntogether. You knowβlock you up accidentally in linen closets and push\\nyou out to sea in a boat, and all that sort of thingββ\\n\\nβGood night,β called Miss Baker from the stairs. βI havenβt heard a\\nword.β\\n\\nβSheβs a nice girl,β said Tom after a moment. βThey oughtnβt to let\\nher run around the country this way.β\\n\\nβWho oughtnβt to?β inquired Daisy coldly.\\n\\nβHer family.β\\n\\nβHer family is one aunt about a thousand years old. Besides, Nickβs\\ngoing to look after her, arenβt you, Nick? Sheβs going to spend lots\\nof weekends out here this summer. I think the home influence will be\\nvery good for her.β\\n\\nDaisy and Tom looked at each other for a moment in silence.\\n\\nβIs she from New York?β I asked quickly.\\n\\nβFrom Louisville. Our white girlhood was passed together there. Our\\nbeautiful whiteββ\\n\\nβDid you give Nick a little heart to heart talk on the veranda?β\\ndemanded Tom suddenly.\\n\\nβDid I?β She looked at me. βI canβt seem to remember, but I think we\\ntalked about the Nordic race. Yes, Iβm sure we did. It sort of crept\\nup on us and first thing you knowββ\\n\\nβDonβt believe everything you hear, Nick,β he advised me.\\n', text_annotated='<narrator>Inside, the crimson room bloomed with light. Tom and Miss Baker sat at either end of the long couch and she read aloud to him from the Saturday Evening Postβthe words, murmurous and uninflected, running together in a soothing tune. 
The lamplight, bright on his boots and dull on the autumn-leaf yellow of her hair, glinted along the paper as she turned a page with a flutter of slender muscles in her arms.</narrator>\\n\\n<narrator>When we came in she held us silent for a moment with a lifted hand.</narrator>\\n\\n<Jordan>βTo be continued,β</Jordan> <narrator>she said, tossing the magazine on the table,</narrator> <Jordan>βin our very next issue.β</Jordan>\\n\\n<narrator>Her body asserted itself with a restless movement of her knee, and she stood up.</narrator>\\n\\n<Jordan>βTen oβclock,β</Jordan> <narrator>she remarked, apparently finding the time on the ceiling.</narrator> <Jordan>βTime for this good girl to go to bed.β</Jordan>\\n\\n<Daisy>βJordanβs going to play in the tournament tomorrow,β</Daisy> <narrator>explained Daisy,</narrator> <Daisy>βover at Westchester.β</Daisy>\\n\\n<narrator>βOhβyouβre Jordan Baker.β</narrator>\\n\\n<narrator>I knew now why her face was familiarβits pleasing contemptuous expression had looked out at me from many rotogravure pictures of the sporting life at Asheville and Hot Springs and Palm Beach. I had heard some story of her too, a critical, unpleasant story, but what it was I had forgotten long ago.</narrator>\\n\\n<Jordan>βGood night,β</Jordan> <narrator>she said softly.</narrator> <Jordan>βWake me at eight, wonβt you.β</Jordan>\\n\\n<Daisy>βIf youβll get up.β</Daisy>\\n\\n<Jordan>βI will. Good night, Mr. Carraway. See you anon.β</Jordan>\\n\\n<Daisy>βOf course you will,β</Daisy> <narrator>confirmed Daisy.</narrator> <Daisy>βIn fact I think Iβll arrange a marriage. Come over often, Nick, and IοΏ½οΏ½οΏ½ll sort ofβohβfling you together. You knowβlock you up accidentally in linen closets and push you out to sea in a boat, and all that sort of thingββ</Daisy>\\n\\n<Jordan>βGood night,β</Jordan> <narrator>called Miss Baker from the stairs.</narrator> <Jordan>βI havenβt heard a word.β</Jordan>\\n\\n<Tom>βSheβs a nice girl,β</Tom> <narrator>said Tom after a moment.</narrator> <Tom>βThey oughtnβt to let her run around the country this way.β</Tom>\\n\\n<Daisy>βWho oughtnβt to?β</Daisy> <narrator>inquired Daisy coldly.</narrator>\\n\\n<Tom>βHer family.β</Tom>\\n\\n<Daisy>βHer family is one aunt about a thousand years old. Besides, Nickβs going to look after her, arenβt you, Nick? Sheβs going to spend lots of weekends out here this summer. I think the home influence will be very good for her.β</Daisy>\\n\\n<narrator>Daisy and Tom looked at each other for a moment in silence.</narrator>\\n\\n<narrator>βIs she from New York?β</narrator> <narrator>I asked quickly.</narrator>\\n\\n<Daisy>βFrom Louisville. Our white girlhood was passed together there. Our beautiful whiteββ</Daisy>\\n\\n<Tom>βDid you give Nick a little heart to heart talk on the veranda?β</Tom> <narrator>demanded Tom suddenly.</narrator>\\n\\n<Daisy>βDid I?β</Daisy> <narrator>She looked at me.</narrator> <Daisy>βI canβt seem to remember, but I think we talked about the Nordic race. Yes, Iβm sure we did. It sort of crept up on us and first thing you knowββ</Daisy>\\n\\n<Tom>βDonβt believe everything you hear, Nick,β</Tom> <narrator>he advised me.</narrator>')"
|
919 |
-
]
|
920 |
-
},
|
921 |
-
"execution_count": 5,
|
922 |
-
"metadata": {},
|
923 |
-
"output_type": "execute_result"
|
924 |
-
}
|
925 |
-
],
|
926 |
-
"source": [
|
927 |
-
"res"
|
928 |
-
]
|
929 |
-
},
|
930 |
-
{
|
931 |
-
"cell_type": "code",
|
932 |
-
"execution_count": 6,
|
933 |
-
"metadata": {},
|
934 |
-
"outputs": [
|
935 |
-
{
|
936 |
-
"data": {
|
937 |
-
"text/plain": [
|
938 |
-
"['Tom', 'Jordan', 'Daisy', 'narrator']"
|
939 |
-
]
|
940 |
-
},
|
941 |
-
"execution_count": 6,
|
942 |
-
"metadata": {},
|
943 |
-
"output_type": "execute_result"
|
944 |
-
}
|
945 |
-
],
|
946 |
-
"source": [
|
947 |
-
"res.characters"
|
948 |
-
]
|
949 |
-
},
|
950 |
-
{
|
951 |
-
"cell_type": "code",
|
952 |
-
"execution_count": 7,
|
953 |
-
"metadata": {},
|
954 |
-
"outputs": [
|
955 |
-
{
|
956 |
-
"name": "stdout",
|
957 |
-
"output_type": "stream",
|
958 |
-
"text": [
|
959 |
-
"<narrator>Inside, the crimson room bloomed with light. Tom and Miss Baker sat at either end of the long couch and she read aloud to him from the Saturday Evening Postβthe words, murmurous and uninflected, running together in a soothing tune. The lamplight, bright on his boots and dull on the autumn-leaf yellow of her hair, glinted along the paper as she turned a page with a flutter of slender muscles in her arms.</narrator>\n",
|
960 |
-
"\n",
|
961 |
-
"<narrator>When we came in she held us silent for a moment with a lifted hand.</narrator>\n",
|
962 |
-
"\n",
|
963 |
-
"<Jordan>βTo be continued,β</Jordan> <narrator>she said, tossing the magazine on the table,</narrator> <Jordan>βin our very next issue.β</Jordan>\n",
|
964 |
-
"\n",
|
965 |
-
"<narrator>Her body asserted itself with a restless movement of her knee, and she stood up.</narrator>\n",
|
966 |
-
"\n",
|
967 |
-
"<Jordan>βTen oβclock,β</Jordan> <narrator>she remarked, apparently finding the time on the ceiling.</narrator> <Jordan>βTime for this good girl to go to bed.β</Jordan>\n",
|
968 |
-
"\n",
|
969 |
-
"<Daisy>βJordanβs going to play in the tournament tomorrow,β</Daisy> <narrator>explained Daisy,</narrator> <Daisy>βover at Westchester.β</Daisy>\n",
|
970 |
-
"\n",
|
971 |
-
"<narrator>βOhβyouβre Jordan Baker.β</narrator>\n",
|
972 |
-
"\n",
|
973 |
-
"<narrator>I knew now why her face was familiarβits pleasing contemptuous expression had looked out at me from many rotogravure pictures of the sporting life at Asheville and Hot Springs and Palm Beach. I had heard some story of her too, a critical, unpleasant story, but what it was I had forgotten long ago.</narrator>\n",
|
974 |
-
"\n",
|
975 |
-
"<Jordan>βGood night,β</Jordan> <narrator>she said softly.</narrator> <Jordan>βWake me at eight, wonβt you.β</Jordan>\n",
|
976 |
-
"\n",
|
977 |
-
"<Daisy>βIf youβll get up.β</Daisy>\n",
|
978 |
-
"\n",
|
979 |
-
"<Jordan>βI will. Good night, Mr. Carraway. See you anon.β</Jordan>\n",
|
980 |
-
"\n",
|
981 |
-
"<Daisy>βOf course you will,β</Daisy> <narrator>confirmed Daisy.</narrator> <Daisy>βIn fact I think Iβll arrange a marriage. Come over often, Nick, and Iβll sort ofβohβfling you together. You knowβlock you up accidentally in linen closets and push you out to sea in a boat, and all that sort of thingββ</Daisy>\n",
|
982 |
-
"\n",
|
983 |
-
"<Jordan>βGood night,β</Jordan> <narrator>called Miss Baker from the stairs.</narrator> <Jordan>βI havenβt heard a word.β</Jordan>\n",
|
984 |
-
"\n",
|
985 |
-
"<Tom>βSheβs a nice girl,β</Tom> <narrator>said Tom after a moment.</narrator> <Tom>βThey oughtnβt to let her run around the country this way.β</Tom>\n",
|
986 |
-
"\n",
|
987 |
-
"<Daisy>βWho oughtnβt to?β</Daisy> <narrator>inquired Daisy coldly.</narrator>\n",
|
988 |
-
"\n",
|
989 |
-
"<Tom>βHer family.β</Tom>\n",
|
990 |
-
"\n",
|
991 |
-
"<Daisy>βHer family is one aunt about a thousand years old. Besides, Nickβs going to look after her, arenβt you, Nick? Sheβs going to spend lots of weekends out here this summer. I think the home influence will be very good for her.β</Daisy>\n",
|
992 |
-
"\n",
|
993 |
-
"<narrator>Daisy and Tom looked at each other for a moment in silence.</narrator>\n",
|
994 |
-
"\n",
|
995 |
-
"<narrator>βIs she from New York?β</narrator> <narrator>I asked quickly.</narrator>\n",
|
996 |
-
"\n",
|
997 |
-
"<Daisy>βFrom Louisville. Our white girlhood was passed together there. Our beautiful whiteββ</Daisy>\n",
|
998 |
-
"\n",
|
999 |
-
"<Tom>βDid you give Nick a little heart to heart talk on the veranda?β</Tom> <narrator>demanded Tom suddenly.</narrator>\n",
|
1000 |
-
"\n",
|
1001 |
-
"<Daisy>βDid I?β</Daisy> <narrator>She looked at me.</narrator> <Daisy>βI canβt seem to remember, but I think we talked about the Nordic race. Yes, Iβm sure we did. It sort of crept up on us and first thing you knowββ</Daisy>\n",
|
1002 |
-
"\n",
|
1003 |
-
"<Tom>βDonβt believe everything you hear, Nick,β</Tom> <narrator>he advised me.</narrator>\n"
|
1004 |
-
]
|
1005 |
-
}
|
1006 |
-
],
|
1007 |
-
"source": [
|
1008 |
-
"print(res.text_annotated)"
|
1009 |
-
]
|
1010 |
-
},
|
1011 |
-
{
|
1012 |
-
"cell_type": "code",
|
1013 |
-
"execution_count": 8,
|
1014 |
-
"metadata": {},
|
1015 |
-
"outputs": [
|
1016 |
-
{
|
1017 |
-
"name": "stdout",
|
1018 |
-
"output_type": "stream",
|
1019 |
-
"text": [
|
1020 |
-
"characters: ['Tom', 'Jordan', 'Daisy', 'narrator']\n",
|
1021 |
-
"--------------------\n",
|
1022 |
-
"[narrator] Inside, the crimson room bloomed with light. Tom and Miss Baker sat at either end of the long couch and she read aloud to him from the Saturday Evening Postβthe words, murmurous and uninflected, running together in a soothing tune. The lamplight, bright on his boots and dull on the autumn-leaf yellow of her hair, glinted along the paper as she turned a page with a flutter of slender muscles in her arms.\n",
|
1023 |
-
"[narrator] When we came in she held us silent for a moment with a lifted hand.\n",
|
1024 |
-
"[Jordan] βTo be continued,β\n",
|
1025 |
-
"[narrator] she said, tossing the magazine on the table,\n",
|
1026 |
-
"[Jordan] βin our very next issue.β\n",
|
1027 |
-
"[narrator] Her body asserted itself with a restless movement of her knee, and she stood up.\n",
|
1028 |
-
"[Jordan] βTen oβclock,β\n",
|
1029 |
-
"[narrator] she remarked, apparently finding the time on the ceiling.\n",
|
1030 |
-
"[Jordan] βTime for this good girl to go to bed.β\n",
|
1031 |
-
"[Daisy] βJordanβs going to play in the tournament tomorrow,β\n",
|
1032 |
-
"[narrator] explained Daisy,\n",
|
1033 |
-
"[Daisy] βover at Westchester.β\n",
|
1034 |
-
"[narrator] βOhβyouβre Jordan Baker.β\n",
|
1035 |
-
"[narrator] I knew now why her face was familiarβits pleasing contemptuous expression had looked out at me from many rotogravure pictures of the sporting life at Asheville and Hot Springs and Palm Beach. I had heard some story of her too, a critical, unpleasant story, but what it was I had forgotten long ago.\n",
|
1036 |
-
"[Jordan] βGood night,β\n",
|
1037 |
-
"[narrator] she said softly.\n",
|
1038 |
-
"[Jordan] βWake me at eight, wonβt you.β\n",
|
1039 |
-
"[Daisy] βIf youβll get up.β\n",
|
1040 |
-
"[Jordan] βI will. Good night, Mr. Carraway. See you anon.β\n",
|
1041 |
-
"[Daisy] βOf course you will,β\n",
|
1042 |
-
"[narrator] confirmed Daisy.\n",
|
1043 |
-
"[Daisy] βIn fact I think Iβll arrange a marriage. Come over often, Nick, and Iβll sort ofβohβfling you together. You knowβlock you up accidentally in linen closets and push you out to sea in a boat, and all that sort of thingββ\n",
|
1044 |
-
"[Jordan] βGood night,β\n",
|
1045 |
-
"[narrator] called Miss Baker from the stairs.\n",
|
1046 |
-
"[Jordan] βI havenβt heard a word.β\n",
|
1047 |
-
"[Tom] βSheβs a nice girl,β\n",
|
1048 |
-
"[narrator] said Tom after a moment.\n",
|
1049 |
-
"[Tom] βThey oughtnβt to let her run around the country this way.β\n",
|
1050 |
-
"[Daisy] βWho oughtnβt to?β\n",
|
1051 |
-
"[narrator] inquired Daisy coldly.\n",
|
1052 |
-
"[Tom] βHer family.β\n",
|
1053 |
-
"[Daisy] βHer family is one aunt about a thousand years old. Besides, Nickβs going to look after her, arenβt you, Nick? Sheβs going to spend lots of weekends out here this summer. I think the home influence will be very good for her.β\n",
|
1054 |
-
"[narrator] Daisy and Tom looked at each other for a moment in silence.\n",
|
1055 |
-
"[narrator] βIs she from New York?β\n",
|
1056 |
-
"[narrator] I asked quickly.\n",
|
1057 |
-
"[Daisy] βFrom Louisville. Our white girlhood was passed together there. Our beautiful whiteββ\n",
|
1058 |
-
"[Tom] βDid you give Nick a little heart to heart talk on the veranda?β\n",
|
1059 |
-
"[narrator] demanded Tom suddenly.\n",
|
1060 |
-
"[Daisy] βDid I?β\n",
|
1061 |
-
"[narrator] She looked at me.\n",
|
1062 |
-
"[Daisy] βI canβt seem to remember, but I think we talked about the Nordic race. Yes, Iβm sure we did. It sort of crept up on us and first thing you knowββ\n",
|
1063 |
-
"[Tom] βDonβt believe everything you hear, Nick,β\n",
|
1064 |
-
"[narrator] he advised me.\n"
|
1065 |
-
]
|
1066 |
-
}
|
1067 |
-
],
|
1068 |
-
"source": [
|
1069 |
-
"print(res.to_pretty_text())"
|
1070 |
-
]
|
1071 |
-
},
|
1072 |
-
{
|
1073 |
-
"cell_type": "code",
|
1074 |
-
"execution_count": 9,
|
1075 |
-
"metadata": {},
|
1076 |
-
"outputs": [
|
1077 |
-
{
|
1078 |
-
"name": "stdout",
|
1079 |
-
"output_type": "stream",
|
1080 |
-
"text": [
|
1081 |
-
"LLM usage:\n",
|
1082 |
-
"\n",
|
1083 |
-
"Tokens Used: 1817\n",
|
1084 |
-
"\tPrompt Tokens: 877\n",
|
1085 |
-
"\tCompletion Tokens: 940\n",
|
1086 |
-
"Successful Requests: 1\n",
|
1087 |
-
"Total Cost (USD): $0.0115925\n"
|
1088 |
-
]
|
1089 |
-
}
|
1090 |
-
],
|
1091 |
-
"source": [
|
1092 |
-
"print(f'LLM usage:\\n\\n{cb}')"
|
1093 |
-
]
|
1094 |
-
},
|
1095 |
{
|
1096 |
"cell_type": "code",
|
1097 |
"execution_count": null,
|
@@ -1099,192 +836,6 @@
|
|
1099 |
"outputs": [],
|
1100 |
"source": []
|
1101 |
},
|
1102 |
-
{
|
1103 |
-
"cell_type": "markdown",
|
1104 |
-
"metadata": {},
|
1105 |
-
"source": [
|
1106 |
-
"## map characters to voices"
|
1107 |
-
]
|
1108 |
-
},
|
1109 |
-
{
|
1110 |
-
"cell_type": "code",
|
1111 |
-
"execution_count": 10,
|
1112 |
-
"metadata": {},
|
1113 |
-
"outputs": [],
|
1114 |
-
"source": [
|
1115 |
-
"from src.select_voice_chain import create_voice_mapping_chain"
|
1116 |
-
]
|
1117 |
-
},
|
1118 |
-
{
|
1119 |
-
"cell_type": "code",
|
1120 |
-
"execution_count": 11,
|
1121 |
-
"metadata": {},
|
1122 |
-
"outputs": [],
|
1123 |
-
"source": [
|
1124 |
-
"chain = create_voice_mapping_chain(llm_model=GPTModels.GPT_4_TURBO_2024_04_09)"
|
1125 |
-
]
|
1126 |
-
},
|
1127 |
-
{
|
1128 |
-
"cell_type": "code",
|
1129 |
-
"execution_count": 12,
|
1130 |
-
"metadata": {},
|
1131 |
-
"outputs": [
|
1132 |
-
{
|
1133 |
-
"data": {
|
1134 |
-
"text/plain": [
|
1135 |
-
"ChatPromptTemplate(input_variables=['characters', 'text'], input_types={}, partial_variables={'available_genders': '\"male\", \"female\"', 'available_age_groups': '\"old\", \"middle_aged\", \"young\"', 'format_instructions': 'The output should be formatted as a JSON instance that conforms to the JSON schema below.\\n\\nAs an example, for the schema {\"properties\": {\"foo\": {\"title\": \"Foo\", \"description\": \"a list of strings\", \"type\": \"array\", \"items\": {\"type\": \"string\"}}}, \"required\": [\"foo\"]}\\nthe object {\"foo\": [\"bar\", \"baz\"]} is a well-formatted instance of the schema. The object {\"properties\": {\"foo\": [\"bar\", \"baz\"]}} is not well-formatted.\\n\\nHere is the output schema:\\n```\\n{\"$defs\": {\"CharacterProperties\": {\"properties\": {\"gender\": {\"title\": \"Gender\", \"type\": \"string\"}, \"age_group\": {\"title\": \"Age Group\", \"type\": \"string\"}}, \"required\": [\"gender\", \"age_group\"], \"title\": \"CharacterProperties\", \"type\": \"object\"}}, \"properties\": {\"character2props\": {\"additionalProperties\": {\"$ref\": \"#/$defs/CharacterProperties\"}, \"title\": \"Character2Props\", \"type\": \"object\"}}, \"required\": [\"character2props\"]}\\n```'}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['available_age_groups', 'available_genders', 'format_instructions'], input_types={}, partial_variables={}, template='You are a helpful assistant proficient in literature and psychology.\\nOur goal is to create an audio book from the given text.\\nFor that we need to hire voice actors.\\nPlease help us to find the right actor for each character present in the text.\\n\\nYou are provided with the text split by the characters\\nto whom text parts belong to.\\n\\nYour task is to assign available properties to each character provided.\\nList of available properties:\\n- gender: {available_genders}\\n- age_group: {available_age_groups}\\n\\nNOTES:\\n- assign EXACTLY ONE property value for each property\\n- select properties values ONLY from the list of AVAILABLE property values\\n- fill properties for ALL characters from the list provided\\n- DO NOT include any characters absent in the list provided\\n\\n{format_instructions}\\n'), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['characters', 'text'], input_types={}, partial_variables={}, template='<text>\\n{text}\\n</text>\\n\\n<characters>\\n{characters}\\n</characters>\\n'), additional_kwargs={})])\n",
|
1136 |
-
"| RunnableBinding(bound=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x174a82d80>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x174a812e0>, root_client=<openai.OpenAI object at 0x174a82d50>, root_async_client=<openai.AsyncOpenAI object at 0x174a81730>, model_name='gpt-4-turbo-2024-04-09', temperature=0.0, model_kwargs={}, openai_api_key=SecretStr('**********'), request_timeout=Timeout(connect=4, read=60, write=60, pool=60)), kwargs={'response_format': {'type': 'json_object'}}, config={}, config_factories=[])\n",
|
1137 |
-
"| PydanticOutputParser(pydantic_object=<class 'src.select_voice_chain.AllCharactersProperties'>)"
|
1138 |
-
]
|
1139 |
-
},
|
1140 |
-
"execution_count": 12,
|
1141 |
-
"metadata": {},
|
1142 |
-
"output_type": "execute_result"
|
1143 |
-
}
|
1144 |
-
],
|
1145 |
-
"source": [
|
1146 |
-
"chain"
|
1147 |
-
]
|
1148 |
-
},
|
1149 |
-
{
|
1150 |
-
"cell_type": "code",
|
1151 |
-
"execution_count": 14,
|
1152 |
-
"metadata": {},
|
1153 |
-
"outputs": [
|
1154 |
-
{
|
1155 |
-
"name": "stderr",
|
1156 |
-
"output_type": "stream",
|
1157 |
-
"text": [
|
1158 |
-
"2024-10-10 02:37:46,347 [INFO] audio-books (lc_callbacks.py): call to gpt-4-turbo-2024-04-09 with 2 messages:\n",
|
1159 |
-
"{'role': 'system', 'content': 'You are a helpful assistant proficient in literature and psychology.\\nOur goal is to create an audio book from the given text.\\nFor that we need to hire voice actors.\\nPlease help us to find the right actor for each character present in the text.\\n\\nYou are provided with the text split by the characters\\nto whom text parts belong to.\\n\\nYour task is to assign available properties to each character provided.\\nList of available properties:\\n- gender: \"male\", \"female\"\\n- age_group: \"old\", \"middle_aged\", \"young\"\\n\\nNOTES:\\n- assign EXACTLY ONE property value for each property\\n- select properties values ONLY from the list of AVAILABLE property values\\n- fill properties for ALL characters from the list provided\\n- DO NOT include any characters absent in the list provided\\n\\nThe output should be formatted as a JSON instance that conforms to the JSON schema below.\\n\\nAs an example, for the schema {\"properties\": {\"foo\": {\"title\": \"Foo\", \"description\": \"a list of strings\", \"type\": \"array\", \"items\": {\"type\": \"string\"}}}, \"required\": [\"foo\"]}\\nthe object {\"foo\": [\"bar\", \"baz\"]} is a well-formatted instance of the schema. The object {\"properties\": {\"foo\": [\"bar\", \"baz\"]}} is not well-formatted.\\n\\nHere is the output schema:\\n```\\n{\"$defs\": {\"CharacterProperties\": {\"properties\": {\"gender\": {\"title\": \"Gender\", \"type\": \"string\"}, \"age_group\": {\"title\": \"Age Group\", \"type\": \"string\"}}, \"required\": [\"gender\", \"age_group\"], \"title\": \"CharacterProperties\", \"type\": \"object\"}}, \"properties\": {\"character2props\": {\"additionalProperties\": {\"$ref\": \"#/$defs/CharacterProperties\"}, \"title\": \"Character2Props\", \"type\": \"object\"}}, \"required\": [\"character2props\"]}\\n```\\n'}\n",
|
1160 |
-
"{'role': 'human', 'content': \"<text>\\n<narrator>Inside, the crimson room bloomed with light. Tom and Miss Baker sat at either end of the long couch and she read aloud to him from the Saturday Evening Postβthe words, murmurous and uninflected, running together in a soothing tune. The lamplight, bright on his boots and dull on the autumn-leaf yellow of her hair, glinted along the paper as she turned a page with a flutter of slender muscles in her arms.</narrator>\\n\\n<narrator>When we came in she held us silent for a moment with a lifted hand.</narrator>\\n\\n<Jordan>βTo be continued,β</Jordan> <narrator>she said, tossing the magazine on the table,</narrator> <Jordan>βin our very next issue.β</Jordan>\\n\\n<narrator>Her body asserted itself with a restless movement of her knee, and she stood up.</narrator>\\n\\n<Jordan>βTen oβclock,β</Jordan> <narrator>she remarked, apparently finding the time on the ceiling.</narrator> <Jordan>βTime for this good girl to go to bed.β</Jordan>\\n\\n<Daisy>βJordanβs going to play in the tournament tomorrow,β</Daisy> <narrator>explained Daisy,</narrator> <Daisy>βover at Westchester.β</Daisy>\\n\\n<narrator>βOhβyouβre Jordan Baker.β</narrator>\\n\\n<narrator>I knew now why her face was familiarβits pleasing contemptuous expression had looked out at me from many rotogravure pictures of the sporting life at Asheville and Hot Springs and Palm Beach. I had heard some story of her too, a critical, unpleasant story, but what it was I had forgotten long ago.</narrator>\\n\\n<Jordan>βGood night,β</Jordan> <narrator>she said softly.</narrator> <Jordan>βWake me at eight, wonβt you.β</Jordan>\\n\\n<Daisy>βIf youβll get up.β</Daisy>\\n\\n<Jordan>βI will. Good night, Mr. Carraway. See you anon.β</Jordan>\\n\\n<Daisy>βOf course you will,β</Daisy> <narrator>confirmed Daisy.</narrator> <Daisy>βIn fact I think Iβll arrange a marriage. Come over often, Nick, and Iβll sort ofβohβfling you together. You knowβlock you up accidentally in linen closets and push you out to sea in a boat, and all that sort of thingββ</Daisy>\\n\\n<Jordan>βGood night,β</Jordan> <narrator>called Miss Baker from the stairs.</narrator> <Jordan>βI havenβt heard a word.β</Jordan>\\n\\n<Tom>βSheβs a nice girl,β</Tom> <narrator>said Tom after a moment.</narrator> <Tom>βThey oughtnβt to let her run around the country this way.β</Tom>\\n\\n<Daisy>βWho oughtnβt to?β</Daisy> <narrator>inquired Daisy coldly.</narrator>\\n\\n<Tom>βHer family.β</Tom>\\n\\n<Daisy>βHer family is one aunt about a thousand years old. Besides, Nickβs going to look after her, arenβt you, Nick? Sheβs going to spend lots of weekends out here this summer. I think the home influence will be very good for her.β</Daisy>\\n\\n<narrator>Daisy and Tom looked at each other for a moment in silence.</narrator>\\n\\n<narrator>βIs she from New York?β</narrator> <narrator>I asked quickly.</narrator>\\n\\n<Daisy>βFrom Louisville. Our white girlhood was passed together there. Our beautiful whiteββ</Daisy>\\n\\n<Tom>βDid you give Nick a little heart to heart talk on the veranda?β</Tom> <narrator>demanded Tom suddenly.</narrator>\\n\\n<Daisy>βDid I?β</Daisy> <narrator>She looked at me.</narrator> <Daisy>βI canβt seem to remember, but I think we talked about the Nordic race. Yes, Iβm sure we did. It sort of crept up on us and first thing you knowββ</Daisy>\\n\\n<Tom>βDonβt believe everything you hear, Nick,β</Tom> <narrator>he advised me.</narrator>\\n</text>\\n\\n<characters>\\n['Tom', 'Jordan', 'Daisy', 'narrator']\\n</characters>\\n\"}\n",
|
1161 |
-
"2024-10-10 02:37:52,060 [INFO] httpx (_client.py): HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
|
1162 |
-
"2024-10-10 02:37:52,063 [INFO] audio-books (lc_callbacks.py): raw LLM response: \"{\n",
|
1163 |
-
" \"character2props\": {\n",
|
1164 |
-
" \"Tom\": {\n",
|
1165 |
-
" \"gender\": \"male\",\n",
|
1166 |
-
" \"age_group\": \"middle_aged\"\n",
|
1167 |
-
" },\n",
|
1168 |
-
" \"Jordan\": {\n",
|
1169 |
-
" \"gender\": \"female\",\n",
|
1170 |
-
" \"age_group\": \"young\"\n",
|
1171 |
-
" },\n",
|
1172 |
-
" \"Daisy\": {\n",
|
1173 |
-
" \"gender\": \"female\",\n",
|
1174 |
-
" \"age_group\": \"young\"\n",
|
1175 |
-
" },\n",
|
1176 |
-
" \"narrator\": {\n",
|
1177 |
-
" \"gender\": \"male\",\n",
|
1178 |
-
" \"age_group\": \"middle_aged\"\n",
|
1179 |
-
" }\n",
|
1180 |
-
" }\n",
|
1181 |
-
"}\"\n"
|
1182 |
-
]
|
1183 |
-
}
|
1184 |
-
],
|
1185 |
-
"source": [
|
1186 |
-
"res2 = chain.invoke(\n",
|
1187 |
-
" {\"text\": res.text_annotated, \"characters\": res.characters},\n",
|
1188 |
-
" config={\"callbacks\": [LCMessageLoggerAsync()]},\n",
|
1189 |
-
")"
|
1190 |
-
]
|
1191 |
-
},
|
1192 |
-
{
|
1193 |
-
"cell_type": "code",
|
1194 |
-
"execution_count": 15,
|
1195 |
-
"metadata": {},
|
1196 |
-
"outputs": [
|
1197 |
-
{
|
1198 |
-
"data": {
|
1199 |
-
"text/plain": [
|
1200 |
-
"AllCharactersProperties(character2props={'Tom': CharacterProperties(gender='male', age_group='middle_aged'), 'Jordan': CharacterProperties(gender='female', age_group='young'), 'Daisy': CharacterProperties(gender='female', age_group='young'), 'narrator': CharacterProperties(gender='male', age_group='middle_aged')})"
|
1201 |
-
]
|
1202 |
-
},
|
1203 |
-
"execution_count": 15,
|
1204 |
-
"metadata": {},
|
1205 |
-
"output_type": "execute_result"
|
1206 |
-
}
|
1207 |
-
],
|
1208 |
-
"source": [
|
1209 |
-
"res2"
|
1210 |
-
]
|
1211 |
-
},
|
1212 |
-
{
|
1213 |
-
"cell_type": "code",
|
1214 |
-
"execution_count": null,
|
1215 |
-
"metadata": {},
|
1216 |
-
"outputs": [
|
1217 |
-
{
|
1218 |
-
"name": "stdout",
|
1219 |
-
"output_type": "stream",
|
1220 |
-
"text": [
|
1221 |
-
"<class 'pandas.core.frame.DataFrame'>\n",
|
1222 |
-
"RangeIndex: 22 entries, 0 to 21\n",
|
1223 |
-
"Data columns (total 14 columns):\n",
|
1224 |
-
" # Column Non-Null Count Dtype \n",
|
1225 |
-
"--- ------ -------------- ----- \n",
|
1226 |
-
" 0 voice_id 22 non-null object \n",
|
1227 |
-
" 1 name 22 non-null object \n",
|
1228 |
-
" 2 preview_url 22 non-null object \n",
|
1229 |
-
" 3 owner_id 0 non-null float64\n",
|
1230 |
-
" 4 permission_on_resource 2 non-null object \n",
|
1231 |
-
" 5 is_legacy 22 non-null bool \n",
|
1232 |
-
" 6 is_mixed 22 non-null bool \n",
|
1233 |
-
" 7 accent 22 non-null object \n",
|
1234 |
-
" 8 description 20 non-null object \n",
|
1235 |
-
" 9 age 22 non-null object \n",
|
1236 |
-
" 10 gender 22 non-null object \n",
|
1237 |
-
" 11 category 22 non-null object \n",
|
1238 |
-
" 12 language 2 non-null object \n",
|
1239 |
-
" 13 descriptive 2 non-null object \n",
|
1240 |
-
"dtypes: bool(2), float64(1), object(11)\n",
|
1241 |
-
"memory usage: 2.2+ KB\n"
|
1242 |
-
]
|
1243 |
-
}
|
1244 |
-
],
|
1245 |
-
"source": [
|
1246 |
-
"voices = pd.read_csv(\"11labs_available_tts_voices.csv\")\n",
|
1247 |
-
"voices.info()"
|
1248 |
-
]
|
1249 |
-
},
|
1250 |
-
{
|
1251 |
-
"cell_type": "code",
|
1252 |
-
"execution_count": null,
|
1253 |
-
"metadata": {},
|
1254 |
-
"outputs": [
|
1255 |
-
{
|
1256 |
-
"data": {
|
1257 |
-
"text/plain": [
|
1258 |
-
"array(['middle_aged', 'young', 'old'], dtype=object)"
|
1259 |
-
]
|
1260 |
-
},
|
1261 |
-
"metadata": {},
|
1262 |
-
"output_type": "display_data"
|
1263 |
-
}
|
1264 |
-
],
|
1265 |
-
"source": [
|
1266 |
-
"voices[\"age\"].unique()"
|
1267 |
-
]
|
1268 |
-
},
|
1269 |
-
{
|
1270 |
-
"cell_type": "code",
|
1271 |
-
"execution_count": null,
|
1272 |
-
"metadata": {},
|
1273 |
-
"outputs": [
|
1274 |
-
{
|
1275 |
-
"data": {
|
1276 |
-
"text/plain": [
|
1277 |
-
"array(['female', 'male', 'non-binary', 'neutral'], dtype=object)"
|
1278 |
-
]
|
1279 |
-
},
|
1280 |
-
"metadata": {},
|
1281 |
-
"output_type": "display_data"
|
1282 |
-
}
|
1283 |
-
],
|
1284 |
-
"source": [
|
1285 |
-
"voices[\"gender\"].unique()"
|
1286 |
-
]
|
1287 |
-
},
|
1288 |
{
|
1289 |
"cell_type": "code",
|
1290 |
"execution_count": null,
|
|
|
1 |
{
|
2 |
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"metadata": {},
|
6 |
+
"source": [
|
7 |
+
"## initialize"
|
8 |
+
]
|
9 |
+
},
|
10 |
+
{
|
11 |
+
"cell_type": "code",
|
12 |
+
"execution_count": null,
|
13 |
+
"metadata": {},
|
14 |
+
"outputs": [],
|
15 |
+
"source": [
|
16 |
+
"%cd .."
|
17 |
+
]
|
18 |
+
},
|
19 |
{
|
20 |
"cell_type": "code",
|
21 |
"execution_count": 1,
|
|
|
28 |
},
|
29 |
{
|
30 |
"cell_type": "code",
|
31 |
+
"execution_count": 12,
|
32 |
"metadata": {},
|
33 |
"outputs": [],
|
34 |
"source": [
|
|
|
|
|
35 |
"import dotenv\n",
|
36 |
+
"import pandas as pd"
|
37 |
]
|
38 |
},
|
39 |
{
|
|
|
829 |
"outputs": [],
|
830 |
"source": []
|
831 |
},
|
832 |
{
|
833 |
"cell_type": "code",
|
834 |
"execution_count": null,
|
|
|
836 |
"outputs": [],
|
837 |
"source": []
|
838 |
},
|
839 |
{
|
840 |
"cell_type": "code",
|
841 |
"execution_count": null,
|
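The notebook above pairs the reviewed voices table (voice_id, name, accent, age, gender, category, etc.) with the character properties returned by the voice-mapping chain (gender plus age_group per character). Below is a minimal, illustrative sketch of how such properties could be matched against that CSV to shortlist candidate voices; it is not the project's selection logic (that lives in src/select_voice_chain.py), and the column names simply follow the voices.info() output shown above.

import pandas as pd

# Illustrative only: shortlist voices whose labels match the LLM-assigned
# character properties. Column names follow the voices.info() output above.
def candidate_voices(voices: pd.DataFrame, gender: str, age_group: str) -> pd.DataFrame:
    # observed 'age' values: 'middle_aged', 'young', 'old'
    # observed 'gender' values: 'female', 'male', 'non-binary', 'neutral'
    mask = (voices["gender"] == gender) & (voices["age"] == age_group)
    return voices.loc[mask, ["voice_id", "name", "accent", "description"]]

voices = pd.read_csv("data/11labs_available_tts_voices.reviewed.csv")
print(candidate_voices(voices, gender="female", age_group="young").head())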
filter_voices.ipynb → notebooks/filter_voices.ipynb
RENAMED
@@ -1,5 +1,14 @@
|
|
1 |
{
|
2 |
"cells": [
|
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
"execution_count": 1,
|
|
|
1 |
{
|
2 |
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": null,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"%cd .."
|
10 |
+
]
|
11 |
+
},
|
12 |
{
|
13 |
"cell_type": "code",
|
14 |
"execution_count": 1,
|
notebooks/playground.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
pyproject.toml
ADDED
@@ -0,0 +1,8 @@
|
|
1 |
+
[tool.black]
|
2 |
+
line-length = 100
|
3 |
+
target-version = ['py311']
|
4 |
+
skip-string-normalization = true
|
5 |
+
|
6 |
+
[tool.isort]
|
7 |
+
profile = "black"
|
8 |
+
line_length = 100
|
requirements.txt
CHANGED
@@ -9,4 +9,6 @@ elevenlabs
|
|
9 |
gradio
|
10 |
python-dotenv
|
11 |
streamlit
|
12 |
-
pypdf
|
|
|
|
|
|
9 |
gradio
|
10 |
python-dotenv
|
11 |
streamlit
|
12 |
+
pypdf
|
13 |
+
black
|
14 |
+
isort
|
scripts/add_voices.py
CHANGED
@@ -7,7 +7,6 @@ from elevenlabs import ElevenLabs
|
|
7 |
from elevenlabs.core import ApiError
|
8 |
from tqdm.auto import tqdm
|
9 |
|
10 |
-
|
11 |
logging.basicConfig(
|
12 |
level=logging.INFO,
|
13 |
format="%(asctime)s [%(levelname)s] %(name)s (%(filename)s): %(message)s",
|
@@ -23,9 +22,7 @@ load_dotenv()
|
|
23 |
@click.option("-i", "--input-csv-path", default="data/11labs_tts_voices.csv")
|
24 |
def main(*, api_key: str | None, input_csv_path: str) -> None:
|
25 |
if api_key is None:
|
26 |
-
raise OSError(
|
27 |
-
"Who's gonna set the `ELEVEN_LABS_API_KEY` environmental variable?"
|
28 |
-
)
|
29 |
|
30 |
client = ElevenLabs(api_key=api_key)
|
31 |
voices_to_import = pd.read_csv(input_csv_path)
|
@@ -39,13 +36,11 @@ def main(*, api_key: str | None, input_csv_path: str) -> None:
|
|
39 |
)
|
40 |
except ApiError:
|
41 |
logger.error(
|
42 |
-
f"Shared voice with `{public_user_id = }`, `{voice_id = }` "
|
43 |
-
"already added."
|
44 |
)
|
45 |
else:
|
46 |
logger.info(
|
47 |
-
f"Added shared voice with `{public_user_id = }`, `{voice_id = }`, "
|
48 |
-
f"`{name = }`."
|
49 |
)
|
50 |
|
51 |
|
|
|
7 |
from elevenlabs.core import ApiError
|
8 |
from tqdm.auto import tqdm
|
9 |
|
|
|
10 |
logging.basicConfig(
|
11 |
level=logging.INFO,
|
12 |
format="%(asctime)s [%(levelname)s] %(name)s (%(filename)s): %(message)s",
|
|
|
22 |
@click.option("-i", "--input-csv-path", default="data/11labs_tts_voices.csv")
|
23 |
def main(*, api_key: str | None, input_csv_path: str) -> None:
|
24 |
if api_key is None:
|
25 |
+
raise OSError("Who's gonna set the `ELEVEN_LABS_API_KEY` environmental variable?")
|
|
|
|
|
26 |
|
27 |
client = ElevenLabs(api_key=api_key)
|
28 |
voices_to_import = pd.read_csv(input_csv_path)
|
|
|
36 |
)
|
37 |
except ApiError:
|
38 |
logger.error(
|
39 |
+
f"Shared voice with `{public_user_id = }`, `{voice_id = }` " "already added."
|
|
|
40 |
)
|
41 |
else:
|
42 |
logger.info(
|
43 |
+
f"Added shared voice with `{public_user_id = }`, `{voice_id = }`, " f"`{name = }`."
|
|
|
44 |
)
|
45 |
|
46 |
|
scripts/export_available_voices.py
CHANGED
@@ -6,7 +6,6 @@ import pandas as pd
|
|
6 |
from dotenv import load_dotenv
|
7 |
from elevenlabs import ElevenLabs
|
8 |
|
9 |
-
|
10 |
logging.basicConfig(
|
11 |
level=logging.INFO,
|
12 |
format="%(asctime)s [%(levelname)s] %(name)s (%(filename)s): %(message)s",
|
@@ -22,9 +21,7 @@ load_dotenv()
|
|
22 |
@click.option("-o", "--output-csv-path", default="data/11labs_available_tts_voices.csv")
|
23 |
def main(*, api_key: str | None, output_csv_path: str) -> None:
|
24 |
if api_key is None:
|
25 |
-
raise OSError(
|
26 |
-
"Who's gonna set the `ELEVEN_LABS_API_KEY` environmental variable?"
|
27 |
-
)
|
28 |
|
29 |
client = ElevenLabs(api_key=api_key)
|
30 |
response = client.voices.get_all()
|
|
|
6 |
from dotenv import load_dotenv
|
7 |
from elevenlabs import ElevenLabs
|
8 |
|
|
|
9 |
logging.basicConfig(
|
10 |
level=logging.INFO,
|
11 |
format="%(asctime)s [%(levelname)s] %(name)s (%(filename)s): %(message)s",
|
|
|
21 |
@click.option("-o", "--output-csv-path", default="data/11labs_available_tts_voices.csv")
|
22 |
def main(*, api_key: str | None, output_csv_path: str) -> None:
|
23 |
if api_key is None:
|
24 |
+
raise OSError("Who's gonna set the `ELEVEN_LABS_API_KEY` environmental variable?")
|
|
|
|
|
25 |
|
26 |
client = ElevenLabs(api_key=api_key)
|
27 |
response = client.voices.get_all()
|
src/audio_generators.py
DELETED
@@ -1,315 +0,0 @@
|
|
1 |
-
import asyncio
|
2 |
-
import os
|
3 |
-
import re
|
4 |
-
from pathlib import Path
|
5 |
-
from uuid import uuid4
|
6 |
-
import random
|
7 |
-
|
8 |
-
from langchain_community.callbacks import get_openai_callback
|
9 |
-
from pydub import AudioSegment
|
10 |
-
|
11 |
-
from src.lc_callbacks import LCMessageLoggerAsync
|
12 |
-
from src.tts import tts_astream_consumed, sound_generation_consumed
|
13 |
-
from src.utils import consume_aiter
|
14 |
-
from src.emotions.generation import (
|
15 |
-
EffectGeneratorAsync,
|
16 |
-
TextPreparationForTTSTaskOutput,
|
17 |
-
)
|
18 |
-
from src.emotions.utils import add_overlay_for_audio
|
19 |
-
from src.config import ELEVENLABS_MAX_PARALLEL, logger, OPENAI_MAX_PARALLEL
|
20 |
-
from src.text_split_chain import SplitTextOutput
|
21 |
-
|
22 |
-
|
23 |
-
class AudioGeneratorSimple:
|
24 |
-
|
25 |
-
async def generate_audio(
|
26 |
-
self,
|
27 |
-
text_split: SplitTextOutput,
|
28 |
-
character_to_voice: dict[str, str],
|
29 |
-
) -> Path:
|
30 |
-
semaphore = asyncio.Semaphore(ELEVENLABS_MAX_PARALLEL)
|
31 |
-
|
32 |
-
async def tts_astream_with_semaphore(voice_id: str, text: str):
|
33 |
-
async with semaphore:
|
34 |
-
bytes_ = await tts_astream_consumed(voice_id=voice_id, text=text)
|
35 |
-
# bytes_ = await consume_aiter(iter_)
|
36 |
-
return bytes_
|
37 |
-
|
38 |
-
tasks = []
|
39 |
-
for character_phrase in text_split.phrases:
|
40 |
-
voice_id = character_to_voice[character_phrase.character]
|
41 |
-
task = tts_astream_with_semaphore(
|
42 |
-
voice_id=voice_id, text=character_phrase.text
|
43 |
-
)
|
44 |
-
tasks.append(task)
|
45 |
-
|
46 |
-
results = await asyncio.gather(*tasks)
|
47 |
-
|
48 |
-
save_dir = Path("data") / "books"
|
49 |
-
save_dir.mkdir(exist_ok=True)
|
50 |
-
audio_combined_fp = save_dir / f"{uuid4()}.wav"
|
51 |
-
|
52 |
-
logger.info(f'saving generated audio book to: "{audio_combined_fp}"')
|
53 |
-
with open(audio_combined_fp, "wb") as ab:
|
54 |
-
for result in results:
|
55 |
-
for chunk in result:
|
56 |
-
ab.write(chunk)
|
57 |
-
|
58 |
-
return audio_combined_fp
|
59 |
-
|
60 |
-
|
61 |
-
class AudioGeneratorWithEffects:
|
62 |
-
|
63 |
-
def __init__(self):
|
64 |
-
self.effect_generator = EffectGeneratorAsync(predict_duration=True)
|
65 |
-
self.semaphore = asyncio.Semaphore(ELEVENLABS_MAX_PARALLEL)
|
66 |
-
self.temp_files = []
|
67 |
-
|
68 |
-
async def generate_audio(
|
69 |
-
self,
|
70 |
-
text_split: SplitTextOutput,
|
71 |
-
character_to_voice: dict[str, str],
|
72 |
-
out_path: Path | None = None,
|
73 |
-
*,
|
74 |
-
generate_effects: bool = True,
|
75 |
-
) -> Path:
|
76 |
-
"""Main method to generate the audiobook with TTS, emotion, and sound effects."""
|
77 |
-
num_lines = len(text_split.phrases)
|
78 |
-
lines_for_sound_effect = self._select_lines_for_sound_effect(
|
79 |
-
num_lines,
|
80 |
-
fraction=float(0.2 * generate_effects),
|
81 |
-
)
|
82 |
-
logger.info(f"{generate_effects = }, {lines_for_sound_effect = }")
|
83 |
-
|
84 |
-
data_for_tts, data_for_sound_effects = await self._prepare_text_for_tts(
|
85 |
-
text_split, lines_for_sound_effect
|
86 |
-
)
|
87 |
-
|
88 |
-
tts_results, self.temp_files = await self._generate_tts_audio(
|
89 |
-
text_split, data_for_tts, character_to_voice
|
90 |
-
)
|
91 |
-
|
92 |
-
audio_chunks = await self._add_sound_effects(
|
93 |
-
tts_results, lines_for_sound_effect, data_for_sound_effects, self.temp_files
|
94 |
-
)
|
95 |
-
|
96 |
-
normalized_audio_chunks = self._normalize_audio_chunks(
|
97 |
-
audio_chunks, self.temp_files
|
98 |
-
)
|
99 |
-
|
100 |
-
final_output = self._merge_audio_files(
|
101 |
-
normalized_audio_chunks, save_path=out_path
|
102 |
-
)
|
103 |
-
|
104 |
-
self._cleanup_temp_files(self.temp_files)
|
105 |
-
|
106 |
-
return final_output
|
107 |
-
|
108 |
-
def _select_lines_for_sound_effect(
|
109 |
-
self, num_lines: int, fraction: float
|
110 |
-
) -> list[int]:
|
111 |
-
"""Select % of the lines randomly for sound effect generation."""
|
112 |
-
return random.sample(range(num_lines), k=int(fraction * num_lines))
|
113 |
-
|
114 |
-
async def _prepare_text_for_tts(
|
115 |
-
self, text_split: SplitTextOutput, lines_for_sound_effect: list[int]
|
116 |
-
) -> tuple[list[dict], list[dict]]:
|
117 |
-
semaphore = asyncio.Semaphore(OPENAI_MAX_PARALLEL)
|
118 |
-
|
119 |
-
async def run_task_with_semaphore(func, **params):
|
120 |
-
async with semaphore:
|
121 |
-
outputs = await func(**params)
|
122 |
-
return outputs
|
123 |
-
|
124 |
-
task_emotion_code = "add_emotion"
|
125 |
-
task_effects_code = "add_effects"
|
126 |
-
|
127 |
-
tasks = []
|
128 |
-
|
129 |
-
for idx, character_phrase in enumerate(text_split.phrases):
|
130 |
-
character_text = character_phrase.text.strip().lower()
|
131 |
-
|
132 |
-
tasks.append(
|
133 |
-
run_task_with_semaphore(
|
134 |
-
func=self.effect_generator.add_emotion_to_text,
|
135 |
-
text=character_text,
|
136 |
-
)
|
137 |
-
)
|
138 |
-
|
139 |
-
# If this line needs sound effects, generate parameters
|
140 |
-
if idx in lines_for_sound_effect:
|
141 |
-
tasks.append(
|
142 |
-
run_task_with_semaphore(
|
143 |
-
func=self.effect_generator.generate_parameters_for_sound_effect,
|
144 |
-
text=character_text,
|
145 |
-
)
|
146 |
-
)
|
147 |
-
|
148 |
-
tasks_results: list[TextPreparationForTTSTaskOutput] = []
|
149 |
-
tasks_results = await asyncio.gather(*tasks)
|
150 |
-
|
151 |
-
emotion_tasks_results = [
|
152 |
-
x.output for x in tasks_results if x.task == task_emotion_code
|
153 |
-
]
|
154 |
-
effects_tasks_results = [
|
155 |
-
x.output for x in tasks_results if x.task == task_effects_code
|
156 |
-
]
|
157 |
-
|
158 |
-
return emotion_tasks_results, effects_tasks_results
|
159 |
-
|
160 |
-
async def _generate_tts_audio(
|
161 |
-
self,
|
162 |
-
text_split: SplitTextOutput,
|
163 |
-
data_for_tts: list[dict],
|
164 |
-
character_to_voice: dict[str, str],
|
165 |
-
) -> tuple[list[str], list[str]]:
|
166 |
-
"""Generate TTS audio for modified text."""
|
167 |
-
tasks_for_tts = []
|
168 |
-
temp_files = []
|
169 |
-
|
170 |
-
async def tts_astream_with_semaphore(voice_id: str, text: str, params: dict):
|
171 |
-
async with self.semaphore:
|
172 |
-
bytes_ = await tts_astream_consumed(
|
173 |
-
voice_id=voice_id, text=text, params=params
|
174 |
-
)
|
175 |
-
# bytes_ = await consume_aiter(iter_)
|
176 |
-
return bytes_
|
177 |
-
|
178 |
-
for idx, (data_item, character_phrase) in enumerate(
|
179 |
-
zip(data_for_tts, text_split.phrases)
|
180 |
-
):
|
181 |
-
voice_id = character_to_voice[character_phrase.character]
|
182 |
-
|
183 |
-
task = tts_astream_with_semaphore(
|
184 |
-
voice_id=voice_id,
|
185 |
-
text=data_item["modified_text"],
|
186 |
-
params=data_item["params"],
|
187 |
-
)
|
188 |
-
tasks_for_tts.append(task)
|
189 |
-
|
190 |
-
tts_results = await asyncio.gather(*tasks_for_tts)
|
191 |
-
|
192 |
-
# Save the results to temporary files
|
193 |
-
tts_audio_files = []
|
194 |
-
for idx, tts_result in enumerate(tts_results):
|
195 |
-
tts_filename = f"tts_output_{idx}.wav"
|
196 |
-
with open(tts_filename, "wb") as ab:
|
197 |
-
for chunk in tts_result:
|
198 |
-
ab.write(chunk)
|
199 |
-
tts_audio_files.append(tts_filename)
|
200 |
-
temp_files.append(tts_filename)
|
201 |
-
|
202 |
-
return tts_audio_files, temp_files
|
203 |
-
|
204 |
-
async def _add_sound_effects(
|
205 |
-
self,
|
206 |
-
tts_audio_files: list[str],
|
207 |
-
lines_for_sound_effect: list[int],
|
208 |
-
data_for_sound_effects: list[dict],
|
209 |
-
temp_files: list[str],
|
210 |
-
) -> list[str]:
|
211 |
-
"""Add sound effects to the selected lines."""
|
212 |
-
|
213 |
-
semaphore = asyncio.Semaphore(ELEVENLABS_MAX_PARALLEL)
|
214 |
-
|
215 |
-
async def _process_single_phrase(
|
216 |
-
tts_filename: str,
|
217 |
-
sound_effect_data: dict | None,
|
218 |
-
sound_effect_filename: str,
|
219 |
-
):
|
220 |
-
if sound_effect_data is None:
|
221 |
-
return (tts_filename, [])
|
222 |
-
|
223 |
-
async with semaphore:
|
224 |
-
sound_result = await sound_generation_consumed(sound_effect_data)
|
225 |
-
|
226 |
-
# save to file
|
227 |
-
with open(sound_effect_filename, "wb") as ab:
|
228 |
-
for chunk in sound_result:
|
229 |
-
ab.write(chunk)
|
230 |
-
|
231 |
-
# overlay sound effect on TTS audio
|
232 |
-
tts_with_effects_filename = add_overlay_for_audio(
|
233 |
-
main_audio_filename=tts_filename,
|
234 |
-
sound_effect_filename=sound_effect_filename,
|
235 |
-
cycling_effect=True,
|
236 |
-
decrease_effect_volume=5,
|
237 |
-
)
|
238 |
-
tmp_files = [sound_effect_filename, tts_with_effects_filename]
|
239 |
-
return (tts_with_effects_filename, tmp_files)
|
240 |
-
|
241 |
-
tasks = []
|
242 |
-
for idx, tts_filename in enumerate(tts_audio_files):
|
243 |
-
sound_effect_filename = f"sound_effect_{idx}.wav"
|
244 |
-
|
245 |
-
if idx not in lines_for_sound_effect:
|
246 |
-
tasks.append(
|
247 |
-
_process_single_phrase(
|
248 |
-
tts_filename=tts_filename,
|
249 |
-
sound_effect_data=None,
|
250 |
-
sound_effect_filename=sound_effect_filename,
|
251 |
-
)
|
252 |
-
)
|
253 |
-
else:
|
254 |
-
sound_effect_data = data_for_sound_effects.pop(0)
|
255 |
-
tasks.append(
|
256 |
-
_process_single_phrase(
|
257 |
-
tts_filename=tts_filename,
|
258 |
-
sound_effect_data=sound_effect_data,
|
259 |
-
sound_effect_filename=sound_effect_filename,
|
260 |
-
)
|
261 |
-
)
|
262 |
-
|
263 |
-
outputs = await asyncio.gather(*tasks)
|
264 |
-
audio_chunks = [x[0] for x in outputs]
|
265 |
-
tmp_files_to_add = [item for x in outputs for item in x[1]]
|
266 |
-
temp_files.extend(tmp_files_to_add)
|
267 |
-
|
268 |
-
return audio_chunks
|
269 |
-
|
270 |
-
def _normalize_audio(
|
271 |
-
self, audio_segment: AudioSegment, target_dBFS: float = -20.0
|
272 |
-
) -> AudioSegment:
|
273 |
-
"""Normalize an audio segment to the target dBFS level."""
|
274 |
-
change_in_dBFS = target_dBFS - audio_segment.dBFS
|
275 |
-
return audio_segment.apply_gain(change_in_dBFS)
|
276 |
-
|
277 |
-
def _normalize_audio_chunks(
|
278 |
-
self, audio_filenames: list[str], temp_files, target_dBFS: float = -20.0
|
279 |
-
) -> list[str]:
|
280 |
-
"""Normalize all audio chunks to the target volume level."""
|
281 |
-
normalized_files = []
|
282 |
-
for audio_file in audio_filenames:
|
283 |
-
audio_segment = AudioSegment.from_file(audio_file)
|
284 |
-
normalized_audio = self._normalize_audio(audio_segment, target_dBFS)
|
285 |
-
|
286 |
-
normalized_filename = f"normalized_{Path(audio_file).stem}.wav"
|
287 |
-
normalized_audio.export(normalized_filename, format="wav")
|
288 |
-
normalized_files.append(normalized_filename)
|
289 |
-
temp_files.append(normalized_filename)
|
290 |
-
|
291 |
-
return normalized_files
|
292 |
-
|
293 |
-
def _merge_audio_files(
|
294 |
-
self, audio_filenames: list[str], save_path: Path | None = None
|
295 |
-
) -> Path:
|
296 |
-
"""Helper function to merge multiple audio files into one."""
|
297 |
-
combined = AudioSegment.from_file(audio_filenames[0])
|
298 |
-
for filename in audio_filenames[1:]:
|
299 |
-
next_audio = AudioSegment.from_file(filename)
|
300 |
-
combined += next_audio # Concatenate the audio
|
301 |
-
|
302 |
-
if save_path is None:
|
303 |
-
save_dir = Path("data") / "books"
|
304 |
-
save_dir.mkdir(exist_ok=True)
|
305 |
-
save_path = save_dir / f"{uuid4()}.wav"
|
306 |
-
combined.export(save_path, format="wav")
|
307 |
-
return Path(save_path)
|
308 |
-
|
309 |
-
def _cleanup_temp_files(self, temp_files: list[str]) -> None:
|
310 |
-
"""Helper function to delete all temporary files."""
|
311 |
-
for temp_file in temp_files:
|
312 |
-
try:
|
313 |
-
os.remove(temp_file)
|
314 |
-
except FileNotFoundError:
|
315 |
-
continue
|
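The removed AudioGeneratorWithEffects relied on an add_overlay_for_audio helper (cycling the effect and lowering its volume) to mix generated sound effects into the TTS audio. The sketch below shows the same overlay idea with plain pydub; the function name and defaults are illustrative, not the removed helper's actual signature, and the new pipeline in src/builder.py instead overlays effects at explicit start times via utils.overlay_multiple_audio.

from pydub import AudioSegment

# Illustrative sketch, not the removed helper: overlay a quieter, looped
# sound effect on top of a TTS segment and save the mix as WAV.
def overlay_effect(main_fp: str, effect_fp: str, out_fp: str,
                   effect_volume_drop_db: float = 5.0) -> str:
    main = AudioSegment.from_file(main_fp)
    effect = AudioSegment.from_file(effect_fp) - effect_volume_drop_db  # make the effect quieter
    mixed = main.overlay(effect, position=0, loop=True)  # loop so it covers the whole phrase
    mixed.export(out_fp, format="wav")
    return out_fp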
src/builder.py
CHANGED
@@ -1,32 +1,100 @@
|
|
1 |
from langchain_community.callbacks import get_openai_callback
|
|
|
|
|
2 |
|
3 |
-
from src
|
4 |
from src.lc_callbacks import LCMessageLoggerAsync
|
5 |
-
from src.
|
6 |
from src.text_split_chain import SplitTextOutput, create_split_text_chain
|
7 |
-
from src.utils import GPTModels
|
8 |
|
9 |
|
10 |
-
class
|
|
|
|
|
11 |
|
12 |
-
|
|
|
|
|
13 |
self.voice_selector = VoiceSelector()
|
14 |
-
self.
|
|
|
|
|
|
|
|
|
|
|
15 |
|
16 |
-
17 |
chain = create_split_text_chain(llm_model=GPTModels.GPT_4o)
|
18 |
with get_openai_callback() as cb:
|
19 |
chain_out = await chain.ainvoke(
|
20 |
{"text": text}, config={"callbacks": [LCMessageLoggerAsync()]}
|
21 |
)
|
|
|
22 |
return chain_out
|
23 |
|
24 |
-
|
25 |
self, text_split: SplitTextOutput
|
26 |
) -> SelectVoiceChainOutput:
|
27 |
-
chain = self.voice_selector.create_voice_mapping_chain(
|
28 |
-
llm_model=GPTModels.GPT_4o
|
29 |
-
)
|
30 |
with get_openai_callback() as cb:
|
31 |
chain_out = await chain.ainvoke(
|
32 |
{
|
@@ -35,17 +103,495 @@ class AudiobookBuilder:
|
|
35 |
},
|
36 |
config={"callbacks": [LCMessageLoggerAsync()]},
|
37 |
)
|
|
|
38 |
return chain_out
|
39 |
|
40 |
-
async def
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
)
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
|
|
50 |
)
|
51 |
-
return
|
1 |
+
import asyncio
|
2 |
+
import os
|
3 |
+
from asyncio import TaskGroup
|
4 |
+
from pathlib import Path
|
5 |
+
from typing import Any, Callable, List
|
6 |
+
from uuid import uuid4
|
7 |
+
|
8 |
from langchain_community.callbacks import get_openai_callback
|
9 |
+
from pydantic import BaseModel
|
10 |
+
from pydub import AudioSegment
|
11 |
|
12 |
+
from src import tts, utils
|
13 |
+
from src.config import (
|
14 |
+
CONTEXT_CHAR_LEN_FOR_TTS,
|
15 |
+
ELEVENLABS_MAX_PARALLEL,
|
16 |
+
OPENAI_MAX_PARALLEL,
|
17 |
+
logger,
|
18 |
+
)
|
19 |
from src.lc_callbacks import LCMessageLoggerAsync
|
20 |
+
from src.preprocess_tts_emotions_chain import TTSParamProcessor
|
21 |
+
from src.schemas import SoundEffectsParams, TTSParams, TTSTimestampsAlignment, TTSTimestampsResponse
|
22 |
+
from src.select_voice_chain import (
|
23 |
+
CharacterPropertiesNullable,
|
24 |
+
SelectVoiceChainOutput,
|
25 |
+
VoiceSelector,
|
26 |
+
)
|
27 |
+
from src.sound_effects_design import (
|
28 |
+
SoundEffectDescription,
|
29 |
+
SoundEffectsDesignOutput,
|
30 |
+
create_sound_effects_design_chain,
|
31 |
+
)
|
32 |
+
from src.text_modification_chain import modify_text_chain
|
33 |
from src.text_split_chain import SplitTextOutput, create_split_text_chain
|
34 |
+
from src.utils import GPTModels, prettify_unknown_character_label
|
35 |
+
from src.web.constructor import HTMLGenerator
|
36 |
+
from src.web.utils import (
|
37 |
+
create_status_html,
|
38 |
+
generate_text_split_inner_html_no_effect,
|
39 |
+
generate_text_split_inner_html_with_effects,
|
40 |
+
generate_voice_mapping_inner_html,
|
41 |
+
)
|
42 |
|
43 |
|
44 |
+
class TTSPhrasesGenerationOutput(BaseModel):
|
45 |
+
audio_fps: list[str]
|
46 |
+
char2time: TTSTimestampsAlignment
|
47 |
|
48 |
+
|
49 |
+
class AudiobookBuilder:
|
50 |
+
def __init__(self, rm_artifacts: bool = False):
|
51 |
self.voice_selector = VoiceSelector()
|
52 |
+
self.params_tts_processor = TTSParamProcessor()
|
53 |
+
self.rm_artifacts = rm_artifacts
|
54 |
+
self.min_sound_effect_duration_sec = 1
|
55 |
+
self.sound_effects_prompt_influence = 0.75 # seems to work nicely
|
56 |
+
self.html_generator = HTMLGenerator()
|
57 |
+
self.name = type(self).__name__
|
58 |
|
59 |
+
@staticmethod
|
60 |
+
async def _prepare_text_for_tts(text: str) -> str:
|
61 |
+
chain = modify_text_chain(llm_model=GPTModels.GPT_4o)
|
62 |
+
with get_openai_callback() as cb:
|
63 |
+
result = await chain.ainvoke(
|
64 |
+
{"text": text}, config={"callbacks": [LCMessageLoggerAsync()]}
|
65 |
+
)
|
66 |
+
logger.info(
|
67 |
+
f'End of modifying text with caps and symbols (?, !, ...). OpenAI callback stats: {cb}'
|
68 |
+
)
|
69 |
+
return result.text_modified
|
70 |
+
|
71 |
+
@staticmethod
|
72 |
+
async def _split_text(text: str) -> SplitTextOutput:
|
73 |
chain = create_split_text_chain(llm_model=GPTModels.GPT_4o)
|
74 |
with get_openai_callback() as cb:
|
75 |
chain_out = await chain.ainvoke(
|
76 |
{"text": text}, config={"callbacks": [LCMessageLoggerAsync()]}
|
77 |
)
|
78 |
+
logger.info(f'end of splitting text into characters. openai callback stats: {cb}')
|
79 |
return chain_out
|
80 |
|
81 |
+
@staticmethod
|
82 |
+
async def _design_sound_effects(text: str) -> SoundEffectsDesignOutput:
|
83 |
+
chain = create_sound_effects_design_chain(llm_model=GPTModels.GPT_4o)
|
84 |
+
with get_openai_callback() as cb:
|
85 |
+
res = await chain.ainvoke(
|
86 |
+
{"text": text}, config={"callbacks": [LCMessageLoggerAsync()]}
|
87 |
+
)
|
88 |
+
logger.info(
|
89 |
+
f'designed {len(res.sound_effects_descriptions)} sound effects. '
|
90 |
+
f'openai callback stats: {cb}'
|
91 |
+
)
|
92 |
+
return res
|
93 |
+
|
94 |
+
async def _map_characters_to_voices(
|
95 |
self, text_split: SplitTextOutput
|
96 |
) -> SelectVoiceChainOutput:
|
97 |
+
chain = self.voice_selector.create_voice_mapping_chain(llm_model=GPTModels.GPT_4o)
|
|
|
|
|
98 |
with get_openai_callback() as cb:
|
99 |
chain_out = await chain.ainvoke(
|
100 |
{
|
|
|
103 |
},
|
104 |
config={"callbacks": [LCMessageLoggerAsync()]},
|
105 |
)
|
106 |
+
logger.info(f'end of mapping characters to voices. openai callback stats: {cb}')
|
107 |
return chain_out
|
108 |
|
109 |
+
async def _prepare_params_for_tts(self, text_split: SplitTextOutput) -> list[TTSParams]:
|
110 |
+
semaphore = asyncio.Semaphore(OPENAI_MAX_PARALLEL)
|
111 |
+
|
112 |
+
async def run_task_with_semaphore(func, **params):
|
113 |
+
async with semaphore:
|
114 |
+
outputs = await func(**params)
|
115 |
+
return outputs
|
116 |
+
|
117 |
+
tasks = []
|
118 |
+
|
119 |
+
for character_phrase in text_split.phrases:
|
120 |
+
tasks.append(
|
121 |
+
run_task_with_semaphore(
|
122 |
+
func=self.params_tts_processor.run,
|
123 |
+
text=character_phrase.text,
|
124 |
+
)
|
125 |
+
)
|
126 |
+
|
127 |
+
tts_tasks_results = await asyncio.gather(*tasks)
|
128 |
+
|
129 |
+
return tts_tasks_results
|
130 |
+
|
131 |
+
@staticmethod
|
132 |
+
def _add_voice_ids_to_tts_params(
|
133 |
+
text_split: SplitTextOutput,
|
134 |
+
tts_params_list: list[TTSParams],
|
135 |
+
character2voice: dict[str, str],
|
136 |
+
) -> list[TTSParams]:
|
137 |
+
for character_phrase, params in zip(text_split.phrases, tts_params_list):
|
138 |
+
params.voice_id = character2voice[character_phrase.character]
|
139 |
+
return tts_params_list
|
140 |
+
|
141 |
+
@staticmethod
|
142 |
+
def _get_left_and_right_contexts_for_each_phrase(
|
143 |
+
phrases, context_length=CONTEXT_CHAR_LEN_FOR_TTS
|
144 |
+
):
|
145 |
+
"""
|
146 |
+
Return, for each phrase, the neighbouring phrases on its left and right, keeping each side's total length under `context_length`.
|
147 |
+
An approximate word/token budget can be estimated by dividing `context_length` by 5.
|
148 |
+
"""
|
149 |
+
# TODO: split first context phrase if it exceeds `context_length`, currently it's not added.
|
150 |
+
# TODO: optimize algorithm to linear time using sliding window on top of cumulative length sums.
|
151 |
+
left_right_contexts = []
|
152 |
+
for i in range(len(phrases)):
|
153 |
+
left_text, right_text = '', ''
|
154 |
+
for j in range(i - 1, -1, -1):
|
155 |
+
if len(left_text) + len(phrases[j].text) < context_length:
|
156 |
+
left_text = phrases[j].text + left_text
|
157 |
+
else:
|
158 |
+
break
|
159 |
+
for phrase in phrases[i + 1 :]:
|
160 |
+
if len(right_text) + len(phrase.text) < context_length:
|
161 |
+
right_text += phrase.text
|
162 |
+
else:
|
163 |
+
break
|
164 |
+
left_right_contexts.append((left_text, right_text))
|
165 |
+
return left_right_contexts
|
166 |
+
|
167 |
+
def _add_previous_and_next_context_to_tts_params(
|
168 |
+
self,
|
169 |
+
text_split: SplitTextOutput,
|
170 |
+
tts_params_list: list[TTSParams],
|
171 |
+
) -> list[TTSParams]:
|
172 |
+
left_right_contexts = self._get_left_and_right_contexts_for_each_phrase(text_split.phrases)
|
173 |
+
for cur_contexts, params in zip(left_right_contexts, tts_params_list):
|
174 |
+
left_context, right_context = cur_contexts
|
175 |
+
params.previous_text = left_context
|
176 |
+
params.next_text = right_context
|
177 |
+
return tts_params_list
|
178 |
+
|
179 |
+
@staticmethod
|
180 |
+
async def _generate_tts_audio(
|
181 |
+
tts_params_list: list[TTSParams],
|
182 |
+
out_dp: str,
|
183 |
+
) -> TTSPhrasesGenerationOutput:
|
184 |
+
semaphore = asyncio.Semaphore(ELEVENLABS_MAX_PARALLEL)
|
185 |
+
|
186 |
+
async def _tts_with_semaphore(params: TTSParams) -> TTSTimestampsResponse:
|
187 |
+
async with semaphore:
|
188 |
+
return await tts.tts_w_timestamps(params=params)
|
189 |
+
|
190 |
+
tasks = [_tts_with_semaphore(params=params) for params in tts_params_list]
|
191 |
+
tts_responses: list[TTSTimestampsResponse] = await asyncio.gather(*tasks)
|
192 |
+
|
193 |
+
tts_audio_fps = []
|
194 |
+
for ix, (params, res) in enumerate(zip(tts_params_list, tts_responses), start=1):
|
195 |
+
out_fp_no_ext = os.path.join(out_dp, f'tts_output_{ix}')
|
196 |
+
out_fp = res.write_audio_to_file(
|
197 |
+
filepath_no_ext=out_fp_no_ext, audio_format=params.output_format
|
198 |
+
)
|
199 |
+
tts_audio_fps.append(out_fp)
|
200 |
+
|
201 |
+
# combine alignments
|
202 |
+
alignments = [response.alignment for response in tts_responses]
|
203 |
+
char2time = TTSTimestampsAlignment.combine_alignments(alignments=alignments)
|
204 |
+
# filter alignments
|
205 |
+
char2time = char2time.filter_chars_without_duration()
|
206 |
+
|
207 |
+
return TTSPhrasesGenerationOutput(audio_fps=tts_audio_fps, char2time=char2time)
|
208 |
+
|
209 |
+
def _update_sound_effects_descriptions_with_durations(
|
210 |
+
self,
|
211 |
+
sound_effects_descriptions: list[SoundEffectDescription],
|
212 |
+
char2time: TTSTimestampsAlignment,
|
213 |
+
) -> list[SoundEffectDescription]:
|
214 |
+
for sed in sound_effects_descriptions:
|
215 |
+
ix_start, ix_end = sed.ix_start_orig_text, sed.ix_end_orig_text
|
216 |
+
time_start = char2time.get_start_time_by_char_ix(ix_start, safe=True)
|
217 |
+
time_end = char2time.get_end_time_by_char_ix(ix_end, safe=True)
|
218 |
+
duration = time_end - time_start
|
219 |
+
# apply min effect duration
|
220 |
+
duration = max(self.min_sound_effect_duration_sec, duration)
|
221 |
+
# update inplace
|
222 |
+
sed.start_sec = time_start
|
223 |
+
sed.duration_sec = duration
|
224 |
+
return sound_effects_descriptions
|
225 |
+
|
226 |
+
# def _filter_short_sound_effects(
|
227 |
+
# self,
|
228 |
+
# sound_effects_descriptions: list[SoundEffectDescription],
|
229 |
+
# ) -> list[SoundEffectDescription]:
|
230 |
+
# filtered = [
|
231 |
+
# sed
|
232 |
+
# for sed in sound_effects_descriptions
|
233 |
+
# if sed.duration_sec > self.min_sound_effect_duration_sec
|
234 |
+
# ]
|
235 |
+
|
236 |
+
# len_orig = len(sound_effects_descriptions)
|
237 |
+
# len_new = len(filtered)
|
238 |
+
# logger.info(
|
239 |
+
# f'{len_new} out of {len_orig} original sound effects are kept '
|
240 |
+
# f'after filtering by min duration: {self.min_sound_effect_duration_sec}'
|
241 |
+
# )
|
242 |
+
|
243 |
+
# return filtered
|
244 |
+
|
245 |
+
def _sound_effects_description_2_generation_params(
|
246 |
+
self,
|
247 |
+
sound_effects_descriptions: list[SoundEffectDescription],
|
248 |
+
) -> list[SoundEffectsParams]:
|
249 |
+
params = [
|
250 |
+
SoundEffectsParams(
|
251 |
+
text=sed.prompt,
|
252 |
+
duration_seconds=sed.duration_sec,
|
253 |
+
prompt_influence=self.sound_effects_prompt_influence,
|
254 |
+
)
|
255 |
+
for sed in sound_effects_descriptions
|
256 |
+
]
|
257 |
+
return params
|
258 |
+
|
259 |
+
@staticmethod
|
260 |
+
async def _generate_sound_effects(
|
261 |
+
sound_effects_params: list[SoundEffectsParams],
|
262 |
+
out_dp: str,
|
263 |
+
) -> list[str]:
|
264 |
+
semaphore = asyncio.Semaphore(ELEVENLABS_MAX_PARALLEL)
|
265 |
+
|
266 |
+
async def _se_gen_with_semaphore(params: SoundEffectsParams) -> list[bytes]:
|
267 |
+
async with semaphore:
|
268 |
+
return await tts.sound_generation_consumed(params=params)
|
269 |
+
|
270 |
+
tasks = [_se_gen_with_semaphore(params=params) for params in sound_effects_params]
|
271 |
+
results = await asyncio.gather(*tasks)
|
272 |
+
|
273 |
+
se_fps = []
|
274 |
+
for ix, task_res in enumerate(results, start=1):
|
275 |
+
out_fp = os.path.join(out_dp, f'sound_effect_{ix}.wav')
|
276 |
+
utils.write_chunked_bytes(data=task_res, fp=out_fp)
|
277 |
+
se_fps.append(out_fp)
|
278 |
+
|
279 |
+
return se_fps
|
280 |
+
|
281 |
+
@staticmethod
|
282 |
+
def _save_text_split_debug_data(
|
283 |
+
text_split: SplitTextOutput,
|
284 |
+
out_dp: str,
|
285 |
+
):
|
286 |
+
out_fp = os.path.join(out_dp, 'text_split.json')
|
287 |
+
# NOTE: use `to_dict()` for correct conversion
|
288 |
+
data = text_split.model_dump()
|
289 |
+
utils.write_json(data, fp=out_fp)
|
290 |
+
|
291 |
+
@staticmethod
|
292 |
+
def _save_tts_debug_data(
|
293 |
+
tts_params_list: list[TTSParams],
|
294 |
+
tts_out: TTSPhrasesGenerationOutput,
|
295 |
+
out_dp: str,
|
296 |
+
):
|
297 |
+
out_fp = os.path.join(out_dp, 'tts.json')
|
298 |
+
# NOTE: use `to_dict()` for correct conversion
|
299 |
+
data = [param.to_dict() for param in tts_params_list]
|
300 |
+
utils.write_json(data, fp=out_fp)
|
301 |
+
|
302 |
+
out_dp = os.path.join(out_dp, 'tts_char2time.csv')
|
303 |
+
df_char2time = tts_out.char2time.to_dataframe()
|
304 |
+
df_char2time.to_csv(out_dp, index=True)
|
305 |
+
|
306 |
+
@staticmethod
|
307 |
+
def _save_sound_effects_debug_data(
|
308 |
+
sound_effect_design_output: SoundEffectsDesignOutput,
|
309 |
+
sound_effect_descriptions: list[SoundEffectDescription],
|
310 |
+
out_dp: str,
|
311 |
+
):
|
312 |
+
out_fp = os.path.join(out_dp, 'sound_effects_raw_llm_output.txt')
|
313 |
+
utils.write_txt(sound_effect_design_output.text_annotated, fp=out_fp)
|
314 |
+
|
315 |
+
out_fp = os.path.join(out_dp, 'sound_effects_descriptions.json')
|
316 |
+
data = [sed.model_dump() for sed in sound_effect_descriptions]
|
317 |
+
utils.write_json(data, fp=out_fp)
|
318 |
+
|
319 |
+
@staticmethod
|
320 |
+
def _postprocess_tts_audio(audio_fps: list[str], out_dp: str, target_dBFS: float) -> list[str]:
|
321 |
+
fps = []
|
322 |
+
for in_fp in audio_fps:
|
323 |
+
audio_segment = AudioSegment.from_file(in_fp)
|
324 |
+
normalized_audio = utils.normalize_audio(audio_segment, target_dBFS)
|
325 |
+
|
326 |
+
out_fp = os.path.join(out_dp, f"{Path(in_fp).stem}.normalized.wav")
|
327 |
+
normalized_audio.export(out_fp, format="wav")
|
328 |
+
fps.append(out_fp)
|
329 |
+
|
330 |
+
return fps
|
331 |
+
|
332 |
+
@staticmethod
|
333 |
+
def _postprocess_sound_effects(
|
334 |
+
audio_fps: list[str], out_dp: str, target_dBFS: float, fade_ms: int
|
335 |
+
) -> list[str]:
|
336 |
+
fps = []
|
337 |
+
for in_fp in audio_fps:
|
338 |
+
audio_segment = AudioSegment.from_file(in_fp)
|
339 |
+
|
340 |
+
processed = utils.normalize_audio(audio_segment, target_dBFS)
|
341 |
+
|
342 |
+
processed = processed.fade_in(duration=fade_ms)
|
343 |
+
processed = processed.fade_out(duration=fade_ms)
|
344 |
+
|
345 |
+
out_fp = os.path.join(out_dp, f"{Path(in_fp).stem}.postprocessed.wav")
|
346 |
+
processed.export(out_fp, format="wav")
|
347 |
+
fps.append(out_fp)
|
348 |
+
|
349 |
+
return fps
|
350 |
+
|
351 |
+
@staticmethod
|
352 |
+
def _concatenate_audiofiles(audio_fps: list[str], out_wav_fp: str):
|
353 |
+
concat = AudioSegment.from_file(audio_fps[0])
|
354 |
+
for filename in audio_fps[1:]:
|
355 |
+
next_audio = AudioSegment.from_file(filename)
|
356 |
+
concat += next_audio
|
357 |
+
logger.info(f'saving concatenated audiobook to: "{out_wav_fp}"')
|
358 |
+
concat.export(out_wav_fp, format="wav")
|
359 |
+
|
360 |
+
def _get_text_split_html(
|
361 |
+
self,
|
362 |
+
text_split: SplitTextOutput,
|
363 |
+
sound_effects_descriptions: list[SoundEffectDescription] | None,
|
364 |
+
):
|
365 |
+
# modify copies of original phrases, keep original intact
|
366 |
+
character_phrases = [p.model_copy(deep=True) for p in text_split.phrases]
|
367 |
+
for phrase in character_phrases:
|
368 |
+
phrase.character = prettify_unknown_character_label(phrase.character)
|
369 |
+
|
370 |
+
if not sound_effects_descriptions:
|
371 |
+
inner = generate_text_split_inner_html_no_effect(character_phrases=character_phrases)
|
372 |
+
else:
|
373 |
+
inner = generate_text_split_inner_html_with_effects(
|
374 |
+
character_phrases=character_phrases,
|
375 |
+
sound_effects_descriptions=sound_effects_descriptions,
|
376 |
+
)
|
377 |
+
|
378 |
+
final = self.html_generator.generate_text_split(inner)
|
379 |
+
return final
|
380 |
+
|
381 |
+
def _get_voice_mapping_html(
|
382 |
+
self, use_user_voice: bool, select_voice_chain_out: SelectVoiceChainOutput
|
383 |
+
):
|
384 |
+
if use_user_voice:
|
385 |
+
return ''
|
386 |
+
inner = generate_voice_mapping_inner_html(select_voice_chain_out)
|
387 |
+
final = self.html_generator.generate_voice_assignments(inner)
|
388 |
+
return final
|
389 |
+
|
390 |
+
STAGE_1 = 'Text Analysis'
|
391 |
+
STAGE_2 = 'Voices Selection'
|
392 |
+
STAGE_3 = 'Audio Generation'
|
393 |
+
|
394 |
+
def _get_yield_data_stage_0(self):
|
395 |
+
status = self.html_generator.generate_status("Starting", [("Analyzing Text...", False)])
|
396 |
+
return None, "", status
|
397 |
+
|
398 |
+
def _get_yield_data_stage_1(self, text_split_html: str):
|
399 |
+
status_html = create_status_html(
|
400 |
+
"Text Analysis Complete",
|
401 |
+
[(self.STAGE_1, True), ("Selecting Voices...", False)],
|
402 |
+
)
|
403 |
+
html = status_html + text_split_html
|
404 |
+
return None, "", html
|
405 |
+
|
406 |
+
def _get_yield_data_stage_2(self, text_split_html: str, voice_mapping_html: str):
|
407 |
+
status_html = create_status_html(
|
408 |
+
"Voice Selection Complete",
|
409 |
+
[(self.STAGE_1, True), (self.STAGE_2, True), ("Generating Audio...", False)],
|
410 |
+
)
|
411 |
+
html = status_html + text_split_html + voice_mapping_html + '</div>'
|
412 |
+
return None, "", html
|
413 |
+
|
414 |
+
def _get_yield_data_stage_3(
|
415 |
+
self, final_audio_fp: str, text_split_html: str, voice_mapping_html: str
|
416 |
+
):
|
417 |
+
status_html = create_status_html(
|
418 |
+
"Audiobook is ready β¨",
|
419 |
+
[(self.STAGE_1, True), (self.STAGE_2, True), (self.STAGE_3, True)],
|
420 |
)
|
421 |
+
third_stage_result_html = (
|
422 |
+
status_html
|
423 |
+
+ text_split_html
|
424 |
+
+ voice_mapping_html
|
425 |
+
+ self.html_generator.generate_final_message()
|
426 |
+
+ '</div>'
|
427 |
)
|
428 |
+
return final_audio_fp, "", third_stage_result_html
|
429 |
+
|
430 |
+
async def run(
|
431 |
+
self,
|
432 |
+
text: str,
|
433 |
+
generate_effects: bool,
|
434 |
+
use_user_voice: bool = False,
|
435 |
+
voice_id: str | None = None,
|
436 |
+
):
|
437 |
+
now_str = utils.get_utc_now_str()
|
438 |
+
uuid_trimmed = str(uuid4()).split('-')[0]
|
439 |
+
dir_name = f'{now_str}-{uuid_trimmed}'
|
440 |
+
out_dp_root = os.path.join('data', 'audiobooks', dir_name)
|
441 |
+
os.makedirs(out_dp_root, exist_ok=False)
|
442 |
+
|
443 |
+
debug_dp = os.path.join(out_dp_root, 'debug')
|
444 |
+
os.makedirs(debug_dp)
|
445 |
+
|
446 |
+
# TODO: currently, we are constantly writing and reading audio segments from files.
|
447 |
+
# I think it will be more efficient to keep all audio in memory.
|
448 |
+
|
449 |
+
# zero stage
|
450 |
+
if use_user_voice and not voice_id:
|
451 |
+
yield None, "", self.html_generator.generate_message_without_voice_id()
|
452 |
+
|
453 |
+
else:
|
454 |
+
yield self._get_yield_data_stage_0()
|
455 |
+
|
456 |
+
text_for_tts = await self._prepare_text_for_tts(text=text)
|
457 |
+
|
458 |
+
# TODO: call sound effects chain in parallel with text split chain
|
459 |
+
text_split = await self._split_text(text=text_for_tts)
|
460 |
+
self._save_text_split_debug_data(text_split=text_split, out_dp=debug_dp)
|
461 |
+
# yield stage 1
|
462 |
+
text_split_html = self._get_text_split_html(
|
463 |
+
text_split=text_split, sound_effects_descriptions=None
|
464 |
+
)
|
465 |
+
yield self._get_yield_data_stage_1(text_split_html=text_split_html)
|
466 |
+
|
467 |
+
if generate_effects:
|
468 |
+
se_design_output = await self._design_sound_effects(text=text_for_tts)
|
469 |
+
se_descriptions = se_design_output.sound_effects_descriptions
|
470 |
+
text_split_html = self._get_text_split_html(
|
471 |
+
text_split=text_split, sound_effects_descriptions=se_descriptions
|
472 |
+
)
|
473 |
+
|
474 |
+
# TODO: run voice mapping and tts params selection in parallel
|
475 |
+
if not use_user_voice:
|
476 |
+
select_voice_chain_out = await self._map_characters_to_voices(text_split=text_split)
|
477 |
+
else:
|
478 |
+
if voice_id is None:
|
479 |
+
raise ValueError(f'voice_id is None')
|
480 |
+
select_voice_chain_out = SelectVoiceChainOutput(
|
481 |
+
character2props={
|
482 |
+
char: CharacterPropertiesNullable(gender=None, age_group=None)
|
483 |
+
for char in text_split.characters
|
484 |
+
},
|
485 |
+
character2voice={char: voice_id for char in text_split.characters},
|
486 |
+
)
|
487 |
+
tts_params_list = await self._prepare_params_for_tts(text_split=text_split)
|
488 |
+
|
489 |
+
# yield stage 2
|
490 |
+
voice_mapping_html = self._get_voice_mapping_html(
|
491 |
+
use_user_voice=use_user_voice, select_voice_chain_out=select_voice_chain_out
|
492 |
+
)
|
493 |
+
yield self._get_yield_data_stage_2(
|
494 |
+
text_split_html=text_split_html, voice_mapping_html=voice_mapping_html
|
495 |
+
)
|
496 |
+
|
497 |
+
tts_params_list = self._add_voice_ids_to_tts_params(
|
498 |
+
text_split=text_split,
|
499 |
+
tts_params_list=tts_params_list,
|
500 |
+
character2voice=select_voice_chain_out.character2voice,
|
501 |
+
)
|
502 |
+
|
503 |
+
tts_params_list = self._add_previous_and_next_context_to_tts_params(
|
504 |
+
text_split=text_split,
|
505 |
+
tts_params_list=tts_params_list,
|
506 |
+
)
|
507 |
+
|
508 |
+
tts_dp = os.path.join(out_dp_root, 'tts')
|
509 |
+
os.makedirs(tts_dp)
|
510 |
+
tts_out = await self._generate_tts_audio(tts_params_list=tts_params_list, out_dp=tts_dp)
|
511 |
+
|
512 |
+
self._save_tts_debug_data(
|
513 |
+
tts_params_list=tts_params_list, tts_out=tts_out, out_dp=debug_dp
|
514 |
+
)
|
515 |
+
|
516 |
+
if generate_effects:
|
517 |
+
se_descriptions = self._update_sound_effects_descriptions_with_durations(
|
518 |
+
sound_effects_descriptions=se_descriptions, char2time=tts_out.char2time
|
519 |
+
)
|
520 |
+
|
521 |
+
# no need in filtering, since we ensure the min duration above
|
522 |
+
# se_descriptions = self._filter_short_sound_effects(
|
523 |
+
# sound_effects_descriptions=se_descriptions
|
524 |
+
# )
|
525 |
+
|
526 |
+
se_params = self._sound_effects_description_2_generation_params(
|
527 |
+
sound_effects_descriptions=se_descriptions
|
528 |
+
)
|
529 |
+
|
530 |
+
if len(se_descriptions) != len(se_params):
|
531 |
+
raise ValueError(
|
532 |
+
f'expected {len(se_descriptions)} sound effects params, got: {len(se_params)}'
|
533 |
+
)
|
534 |
+
|
535 |
+
effects_dp = os.path.join(out_dp_root, 'sound_effects')
|
536 |
+
os.makedirs(effects_dp)
|
537 |
+
se_fps = await self._generate_sound_effects(
|
538 |
+
sound_effects_params=se_params, out_dp=effects_dp
|
539 |
+
)
|
540 |
+
|
541 |
+
if len(se_descriptions) != len(se_fps):
|
542 |
+
raise ValueError(
|
543 |
+
f'expected {len(se_descriptions)} generated sound effects, got: {len(se_fps)}'
|
544 |
+
)
|
545 |
+
|
546 |
+
self._save_sound_effects_debug_data(
|
547 |
+
sound_effect_design_output=se_design_output,
|
548 |
+
sound_effect_descriptions=se_descriptions,
|
549 |
+
out_dp=debug_dp,
|
550 |
+
)
|
551 |
+
|
552 |
+
tts_normalized_dp = os.path.join(out_dp_root, 'tts_normalized')
|
553 |
+
os.makedirs(tts_normalized_dp)
|
554 |
+
tts_norm_fps = self._postprocess_tts_audio(
|
555 |
+
audio_fps=tts_out.audio_fps,
|
556 |
+
out_dp=tts_normalized_dp,
|
557 |
+
target_dBFS=-20,
|
558 |
+
)
|
559 |
+
|
560 |
+
if generate_effects:
|
561 |
+
se_normalized_dp = os.path.join(out_dp_root, 'sound_effects_postprocessed')
|
562 |
+
os.makedirs(se_normalized_dp)
|
563 |
+
se_norm_fps = self._postprocess_sound_effects(
|
564 |
+
audio_fps=se_fps,
|
565 |
+
out_dp=se_normalized_dp,
|
566 |
+
target_dBFS=-27,
|
567 |
+
fade_ms=500,
|
568 |
+
)
|
569 |
+
|
570 |
+
tts_concat_fp = os.path.join(out_dp_root, f'audiobook_{now_str}.wav')
|
571 |
+
self._concatenate_audiofiles(audio_fps=tts_norm_fps, out_wav_fp=tts_concat_fp)
|
572 |
+
|
573 |
+
if not generate_effects:
|
574 |
+
final_audio_fp = tts_concat_fp
|
575 |
+
else:
|
576 |
+
tts_concat_with_effects_fp = os.path.join(
|
577 |
+
out_dp_root, f'audiobook_with_effects_{now_str}.wav'
|
578 |
+
)
|
579 |
+
se_starts_sec = [sed.start_sec for sed in se_descriptions]
|
580 |
+
utils.overlay_multiple_audio(
|
581 |
+
main_audio_fp=tts_concat_fp,
|
582 |
+
audios_to_overlay_fps=se_norm_fps,
|
583 |
+
starts_sec=se_starts_sec,
|
584 |
+
out_fp=tts_concat_with_effects_fp,
|
585 |
+
)
|
586 |
+
final_audio_fp = tts_concat_with_effects_fp
|
587 |
+
|
588 |
+
utils.rm_dir_conditional(dp=out_dp_root, to_remove=self.rm_artifacts)
|
589 |
+
|
590 |
+
# yield stage 3
|
591 |
+
yield self._get_yield_data_stage_3(
|
592 |
+
final_audio_fp=final_audio_fp,
|
593 |
+
text_split_html=text_split_html,
|
594 |
+
voice_mapping_html=voice_mapping_html,
|
595 |
+
)
|
596 |
+
|
597 |
+
logger.info(f'end of {self.name}.run()')
|
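_get_left_and_right_contexts_for_each_phrase above gathers neighbouring phrase text on both sides of every phrase, stopping on each side once the accumulated length would reach CONTEXT_CHAR_LEN_FOR_TTS; a TODO in the diff notes the nested loops could become linear with a sliding window over cumulative lengths. Below is a minimal sketch of that optimization, operating on plain strings rather than the repository's phrase objects and assuming the same "stop at the first phrase that no longer fits" semantics.

# Sketch of the sliding-window variant hinted at by the TODO; not part of the diff.
# Each side keeps whole neighbouring phrases while its total length stays below
# `context_length`, matching the quadratic version's behaviour in O(n) time.
def left_right_contexts(texts: list[str], context_length: int = 500) -> list[tuple[str, str]]:
    n = len(texts)
    lengths = [len(t) for t in texts]

    # Left side: longest suffix of texts[:i] with total length < context_length.
    starts = [0] * n
    start, window = 0, 0  # window == sum(lengths[start:i])
    for i in range(n):
        while window >= context_length:
            window -= lengths[start]
            start += 1
        starts[i] = start
        window += lengths[i]

    # Right side: longest prefix of texts[i+1:] with total length < context_length.
    ends = [0] * n
    end, window = 0, 0  # window == sum(lengths[i+1:end])
    for i in range(n):
        if end < i + 1:
            end, window = i + 1, 0
        while end < n and window + lengths[end] < context_length:
            window += lengths[end]
            end += 1
        ends[i] = end
        if end > i + 1:
            window -= lengths[i + 1]  # phrase i+1 leaves the window before the next step

    return [("".join(texts[s:i]), "".join(texts[i + 1:e])) for i, (s, e) in enumerate(zip(starts, ends))]

Both pointers only move forward, so the whole pass stays linear in the number of phrases regardless of `context_length`.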
src/config.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
-
import os
|
2 |
import logging
|
|
|
3 |
|
4 |
logging.basicConfig(
|
5 |
level=logging.INFO,
|
@@ -12,8 +12,11 @@ ELEVENLABS_API_KEY = os.environ["ELEVEN_LABS_API_KEY"]
|
|
12 |
|
13 |
FILE_SIZE_MAX = 0.5 # in mb
|
14 |
|
15 |
-
OPENAI_MAX_PARALLEL =
|
16 |
-
|
|
|
|
|
|
|
17 |
|
18 |
# VOICES_CSV_FP = "data/11labs_available_tts_voices.csv"
|
19 |
VOICES_CSV_FP = "data/11labs_available_tts_voices.reviewed.csv"
|
@@ -29,8 +32,15 @@ All you need to do - is to input the book text or select it from the provided Sa
|
|
29 |
|
30 |
AI will do the rest:
|
31 |
- split text into characters
|
32 |
-
-
|
33 |
- preprocess text to better convey emotions during Text-to-Speech
|
34 |
- (optionally) add sound effects to create immersive atmosphere
|
35 |
- generate audiobook using Text-to-Speech model
|
36 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import logging
|
2 |
+
import os
|
3 |
|
4 |
logging.basicConfig(
|
5 |
level=logging.INFO,
|
|
|
12 |
|
13 |
FILE_SIZE_MAX = 0.5 # in mb
|
14 |
|
15 |
+
OPENAI_MAX_PARALLEL = 10 # empirically set
|
16 |
+
|
17 |
+
# current limitation of available subscription.
|
18 |
+
# see: https://elevenlabs.io/docs/api-reference/text-to-speech#generation-and-concurrency-limits
|
19 |
+
ELEVENLABS_MAX_PARALLEL = 15
|
20 |
|
21 |
# VOICES_CSV_FP = "data/11labs_available_tts_voices.csv"
|
22 |
VOICES_CSV_FP = "data/11labs_available_tts_voices.reviewed.csv"
|
|
|
32 |
|
33 |
AI will do the rest:
|
34 |
- split text into characters
|
35 |
+
- select voice for each character
|
36 |
- preprocess text to better convey emotions during Text-to-Speech
|
37 |
- (optionally) add sound effects to create immersive atmosphere
|
38 |
- generate audiobook using Text-to-Speech model
|
39 |
"""
|
40 |
+
|
41 |
+
DEFAULT_TTS_STABILITY = 0.5
|
42 |
+
DEFAULT_TTS_STABILITY_ACCEPTABLE_RANGE = (0.3, 0.8)
|
43 |
+
DEFAULT_TTS_SIMILARITY_BOOST = 0.5
|
44 |
+
DEFAULT_TTS_STYLE = 0.0
|
45 |
+
|
46 |
+
CONTEXT_CHAR_LEN_FOR_TTS = 500
|
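The new constants pin default ElevenLabs voice settings (stability, similarity boost, style) along with an acceptable stability range and the context length used above. As a purely hypothetical illustration (the actual application lives in the TTS parameter processing code, e.g. src/preprocess_tts_emotions_chain.py, and is not shown in this diff), a suggested stability value could be clamped into the acceptable range like this:

# Hypothetical usage of the new config constants; not code from the repository.
DEFAULT_TTS_STABILITY = 0.5
DEFAULT_TTS_STABILITY_ACCEPTABLE_RANGE = (0.3, 0.8)

def clamp_stability(suggested: float | None) -> float:
    # fall back to the default when nothing was suggested,
    # otherwise keep the value inside the acceptable range
    if suggested is None:
        return DEFAULT_TTS_STABILITY
    lo, hi = DEFAULT_TTS_STABILITY_ACCEPTABLE_RANGE
    return min(max(suggested, lo), hi)

print(clamp_stability(0.95))  # -> 0.8
print(clamp_stability(None))  # -> 0.5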
src/emotions/generation.py
DELETED
@@ -1,208 +0,0 @@
|
|
1 |
-
import json
|
2 |
-
import typing as t
|
3 |
-
from abc import ABC, abstractmethod
|
4 |
-
|
5 |
-
import openai
|
6 |
-
from pydantic import BaseModel
|
7 |
-
from requests import HTTPError
|
8 |
-
|
9 |
-
from src.config import OPENAI_API_KEY, logger
|
10 |
-
from src.utils import auto_retry
|
11 |
-
|
12 |
-
from .prompts import (
|
13 |
-
SOUND_EFFECT_GENERATION,
|
14 |
-
SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION,
|
15 |
-
TEXT_MODIFICATION,
|
16 |
-
TEXT_MODIFICATION_WITH_SSML,
|
17 |
-
)
|
18 |
-
from .utils import get_audio_duration
|
19 |
-
|
20 |
-
|
21 |
-
class TextPreparationForTTSTaskOutput(BaseModel):
|
22 |
-
task: str
|
23 |
-
output: t.Any
|
24 |
-
|
25 |
-
|
26 |
-
class AbstractEffectGenerator(ABC):
|
27 |
-
@abstractmethod
|
28 |
-
async def generate_text_for_sound_effect(self, text) -> dict:
|
29 |
-
pass
|
30 |
-
|
31 |
-
@abstractmethod
|
32 |
-
async def generate_parameters_for_sound_effect(
|
33 |
-
self, text: str, generated_audio_file: str | None
|
34 |
-
) -> TextPreparationForTTSTaskOutput:
|
35 |
-
pass
|
36 |
-
|
37 |
-
@abstractmethod
|
38 |
-
async def add_emotion_to_text(self, text: str) -> TextPreparationForTTSTaskOutput:
|
39 |
-
pass
|
40 |
-
|
41 |
-
|
42 |
-
# class EffectGenerator(AbstractEffectGenerator):
|
43 |
-
# def __init__(self, predict_duration: bool = True, model_type: str = "gpt-4o"):
|
44 |
-
# self.client = openai.OpenAI(api_key=OPENAI_API_KEY)
# self.sound_effect_prompt = (
#     SOUND_EFFECT_GENERATION
#     if predict_duration
#     else SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION
# )
# self.text_modification_prompt = TEXT_MODIFICATION_WITH_SSML
# self.model_type = model_type
# logger.info(
#     f"EffectGenerator initialized with model_type: {model_type}, predict_duration: {predict_duration}"
# )

# @auto_retry
# def generate_text_for_sound_effect(self, text: str) -> dict:
#     """Generate sound effect description and parameters based on input text."""
#     try:
#         completion = self.client.chat.completions.create(
#             model=self.model_type,
#             messages=[
#                 {"role": "system", "content": self.sound_effect_prompt},
#                 {"role": "user", "content": text},
#             ],
#             response_format={"type": "json_object"},
#         )
#         # Extracting the output
#         chatgpt_output = completion.choices[0].message.content

#         # Parse and return JSON response
#         output_dict = json.loads(chatgpt_output)
#         logger.info(
#             "Successfully generated sound effect description: %s", output_dict
#         )
#         return output_dict

#     except json.JSONDecodeError as e:
#         logger.error("Failed to parse the output text as JSON: %s", e)
#         raise RuntimeError(
#             f"Error: Failed to parse the output text as JSON.\nOutput: {chatgpt_output}"
#         )

#     except HTTPError as e:
#         logger.error("HTTP error occurred: %s", e)
#         raise RuntimeError(f"HTTP Error: {e}")

#     except Exception as e:
#         logger.error("Unexpected error occurred: %s", e)
#         raise RuntimeError(f"Unexpected Error: {e}")

# @auto_retry
# def generate_parameters_for_sound_effect(
#     self, text: str, generated_audio_file: str = None
# ) -> dict:
#     llm_output = self.generate_text_for_sound_effect(text)
#     if generated_audio_file is not None:
#         llm_output["duration_seconds"] = get_audio_duration(generated_audio_file)
#         logger.info(
#             "Added duration_seconds to output based on generated audio file: %s",
#             generated_audio_file,
#         )
#     return llm_output

# @auto_retry
# def add_emotion_to_text(self, text: str) -> dict:
#     completion = self.client.chat.completions.create(
#         model=self.model_type,
#         messages=[
#             {"role": "system", "content": self.text_modification_prompt},
#             {"role": "user", "content": text},
#         ],
#         response_format={"type": "json_object"},
#     )
#     chatgpt_output = completion.choices[0].message.content
#     try:
#         output_dict = json.loads(chatgpt_output)
#         logger.info(
#             "Successfully modified text with emotional cues: %s", output_dict
#         )
#         return output_dict
#     except json.JSONDecodeError as e:
#         logger.error("Error in parsing the modified text: %s", e)
#         raise f"error, output_text: {chatgpt_output}"


class EffectGeneratorAsync(AbstractEffectGenerator):
    def __init__(self, predict_duration: bool, model_type: str = "gpt-4o"):
        self.client = openai.AsyncOpenAI(api_key=OPENAI_API_KEY)
        self.sound_effect_prompt = (
            SOUND_EFFECT_GENERATION
            if predict_duration
            else SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION
        )
        self.text_modification_prompt = TEXT_MODIFICATION_WITH_SSML
        self.model_type = model_type

    @auto_retry
    async def generate_text_for_sound_effect(self, text: str) -> dict:
        """Asynchronous version to generate sound effect description."""
        try:
            completion = await self.client.chat.completions.create(
                model=self.model_type,
                messages=[
                    {"role": "system", "content": self.sound_effect_prompt},
                    {"role": "user", "content": text},
                ],
                response_format={"type": "json_object"},
            )
            # Extracting the output
            chatgpt_output = completion.choices[0].message.content

            # Parse and return JSON response
            output_dict = json.loads(chatgpt_output)
            logger.info(
                "Successfully generated sound effect description: %s", output_dict
            )
            return output_dict

        except json.JSONDecodeError as e:
            logger.error("Failed to parse the output text as JSON: %s", e)
            raise RuntimeError(
                f"Error: Failed to parse the output text as JSON.\nOutput: {chatgpt_output}"
            )

        except HTTPError as e:
            logger.error("HTTP error occurred: %s", e)
            raise RuntimeError(f"HTTP Error: {e}")

        except Exception as e:
            logger.error("Unexpected error occurred: %s", e)
            raise RuntimeError(f"Unexpected Error: {e}")

    @auto_retry
    async def generate_parameters_for_sound_effect(
        self, text: str, generated_audio_file: str | None = None
    ) -> TextPreparationForTTSTaskOutput:
        llm_output = await self.generate_text_for_sound_effect(text)
        if generated_audio_file is not None:
            llm_output["duration_seconds"] = get_audio_duration(generated_audio_file)
            logger.info(
                "Added duration_seconds to output based on generated audio file: %s",
                generated_audio_file,
            )
        return TextPreparationForTTSTaskOutput(task="add_effects", output=llm_output)

    @auto_retry
    async def add_emotion_to_text(self, text: str) -> TextPreparationForTTSTaskOutput:
        completion = await self.client.chat.completions.create(
            model=self.model_type,
            messages=[
                {"role": "system", "content": self.text_modification_prompt},
                {"role": "user", "content": text},
            ],
            response_format={"type": "json_object"},
        )
        chatgpt_output = completion.choices[0].message.content
        try:
            output_dict = json.loads(chatgpt_output)
            logger.info(
                "Successfully modified text with emotional cues: %s", output_dict
            )
            return TextPreparationForTTSTaskOutput(
                task="add_emotion", output=output_dict
            )
        except json.JSONDecodeError as e:
            logger.error("Error in parsing the modified text: %s", e)
            raise f"error, output_text: {chatgpt_output}"
src/emotions/prompts.py
DELETED
@@ -1,160 +0,0 @@
PREFIX = """\
You should help me to make an audiobook with realistic emotion sound using TTS.
You are tasked with generating a description of sound effects
that matches the atmosphere, actions, and tone of a given sentence or text from a book.
The description should be tailored to create a sound effect using ElevenLabs'sound generation API.
The generated sound description must evoke the scene
or emotions from the text (e.g., footsteps, wind, tense silence, etc.),
and it should be succinct and fit the mood of the text."""

SOUND_EFFECT_GENERATION = f"""
{PREFIX}

Additionally, you should include the following parameters in your response:

Text: A generated description of the sound that matches the text provided.
Keep the description simple and effective to capture the soundscape.
This text will be converted into a sound effect.
Duration_seconds: The appropriate duration of the sound effect,
which should be calculated based on the length and nature of the scene.
Cap this duration at 22 seconds. But be carefully, for very long text in input make a long sound effect,
for small make a small one. And the duration should be similar to duration of input text
Prompt_influence: A value between 0 and 1, where a higher value makes the sound generation closely
follow the sound description. For general sound effects (e.g., footsteps, background ambiance),
use a value around 0.3. For more specific or detailed sound scenes
(e.g., thunderstorm, battle sounds), use a higher value like 0.5 to 0.7.

Your output should be in the following JSON format:

{{
"text": "A soft breeze rustling through leaves, distant birds chirping.",
"duration_seconds": 4.0,
"prompt_influence": 0.4
}}

NOTES:
- NEVER add any speech or voices in your instructions!
- NEVER add any music in your instructions!
- NEVER add city sounds, car honks in your instructions!
- make your text descriptions VERY SPECIFIC, AVOID vague instructions.
If it's necessary, you can use couple sentences to formulate the instruction.
But remember to use keep instructions simple.
- aim to create specific sounds, like crackling fireplace, footsteps, wind, etc...
"""

SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION = f"""
{PREFIX}

Additionally, you should include the following parameters in your response:

Text: A generated description of the sound that matches the text provided.
Keep the description simple and effective to capture the soundscape.
This text will be converted into a sound effect.
Prompt_influence: A value between 0 and 1, where a higher value makes the sound generation closely
follow the sound description. For general sound effects (e.g., footsteps, background ambiance),
use a value around 0.3. For more specific or detailed sound scenes
(e.g., thunderstorm, battle sounds), use a higher value like 0.5 to 0.7.

Your output should be in the following JSON format:

{{
"text": "A soft breeze rustling through leaves, distant birds chirping.",
"prompt_influence": 0.4
}}"""

TEXT_MODIFICATION = """
You should help me to make an audiobook with realistic emotion-based voice using TTS.
You are tasked with adjusting the emotional tone of a given text
by modifying the text with special characters such as "!", "...", "-", "~",
and uppercase words to add emphasis or convey emotion. For adding more emotion u can
duplicate special characters for example "!!!".
Do not remove or add any different words.
Only alter the presentation of the existing words.

Also you can add pause in the output text if it needed
The most consistent way is programmatically using the syntax <break time="1.5s" />. or any time in second if it fit to the text
This will create an exact and natural pause in the speech.
It is not just added silence between words,
but the AI has an actual understanding of this syntax and will add a natural pause.

After modifying the text, adjust the "stability", "similarity_boost" and "style" parameters
according to the level of emotional intensity in the modified text.
Higher emotional intensity should lower the "stability" and raise the "similarity_boost".
Your output should be in the following JSON format:
{
"modified_text": "Modified text with emotional adjustments.",
"params": {
"stability": 0.7,
"similarity_boost": 0.5,
"style": 0.3
}
}

The "stability" parameter should range from 0 to 1,
with lower values indicating a more expressive, less stable voice.
The "similarity_boost" parameter should also range from 0 to 1,
with higher values indicating more emphasis on the voice similarity.
The "style" parameter should also range from 0 to 1,
where lower values indicate a neutral tone and higher values reflect more stylized or emotional delivery.
Adjust both according to the emotional intensity of the text.

Example of text that could be passed:

Text: "I can't believe this is happening."
"""

TEXT_MODIFICATION_WITH_SSML = """
You should help me to make an audiobook with overabundant emotion-based voice using TTS.
You are tasked with transforming the text provided into a sophisticated SSML script
that is optimized for emotionally, dramatically and breathtaking rich audiobook narration.
Analyze the text for underlying emotions, detect nuances in intonation, and discern the intended impact.
Apply suitable SSML enhancements to ensure that the final TTS output delivers
a powerful, engaging, dramatic and breathtaking listening experience appropriate for an audiobook context
(more effects/emotions are better than less)."

Please, use only provided SSML tags and don't generate any other tags.
Key SSML Tags to Utilize:
<speak>: This is the root element. All SSML content to be synthesized must be enclosed within this tag.
<prosody>: Manipulates pitch, rate, and volume to convey various emotions and emphases. Use this tag to adjust the voice to match the mood and tone of different parts of the narrative.
<break>: Inserts pauses of specified durations. Use this to create natural breaks in speech, aiding in dramatic effect and better comprehension for listeners.
<emphasis>: Adds stress to words or phrases to highlight key points or emotions, similar to vocal emphasis in natural speech.
<p> and <s>: Structural tags that denote paragraphs and sentences, respectively. They help to manage the flow and pacing of the narrative appropriately.

Input Text Example: "He stood there, gazing into the endless horizon. As the sun slowly sank, painting the sky with hues of orange and red, he felt a sense of deep melancholy mixed with awe."

Modified text should be in the XML format. Expected SSML-enriched Output:

<speak>
<p>
<s>
He stood there, <prosody rate="slow" volume="soft">gazing into the endless horizon.</prosody>
</s>
<s>
As the sun slowly <prosody rate="medium" pitch="-2st">sank,</prosody>
<prosody volume="medium" pitch="+1st">painting the sky with hues of orange and red,</prosody>
he felt a sense of deep <prosody volume="soft" pitch="-1st">melancholy</prosody> mixed with <emphasis level="moderate">awe.</emphasis>
</s>
</p>
</speak>

After modifying the text, adjust the "stability", "similarity_boost" and "style" parameters
according to the level of emotional intensity in the modified text.
Higher emotional intensity should lower the "stability" and raise the "similarity_boost".
Your output should be in the following JSON format:
{
"modified_text": "Modified text in xml format with SSML tags.",
"params": {
"stability": 0.7,
"similarity_boost": 0.5,
"style": 0.3
}
}

The "stability" parameter should range from 0 to 1,
with lower values indicating a more expressive, less stable voice.
The "similarity_boost" parameter should also range from 0 to 1,
with higher values indicating more emphasis on the voice similarity.
The "style" parameter should also range from 0 to 1,
where lower values indicate a neutral tone and higher values reflect more stylized or emotional delivery.
Adjust both according to the emotional intensity of the text.
"""
src/emotions/utils.py
DELETED
@@ -1,75 +0,0 @@
from pydub import AudioSegment
from pathlib import Path
from elevenlabs import ElevenLabs, AsyncElevenLabs
from elevenlabs import play, save

from src.config import logger


def get_audio_duration(filepath: str) -> float:
    """
    Returns the duration of the audio file in seconds.

    :param filepath: Path to the audio file.
    :return: Duration of the audio file in seconds.
    """
    audio = AudioSegment.from_file(filepath)
    duration_in_seconds = len(audio) / 1000  # Convert milliseconds to seconds
    return round(duration_in_seconds, 1)


def add_overlay_for_audio(
    main_audio_filename: str,
    sound_effect_filename: str,
    output_filename: str = None,
    cycling_effect: bool = True,
    decrease_effect_volume: int = 0,
) -> str:
    try:
        main_audio = AudioSegment.from_file(main_audio_filename)
        effect_audio = AudioSegment.from_file(sound_effect_filename)
    except Exception as e:
        raise RuntimeError(f"Error loading audio files: {e}")

    if cycling_effect:
        while len(effect_audio) < len(main_audio):
            effect_audio += effect_audio

    effect_audio = effect_audio[: len(main_audio)]

    if decrease_effect_volume > 0:
        effect_audio = effect_audio - decrease_effect_volume
    combined_audio = main_audio.overlay(effect_audio)

    if output_filename is None:
        output_filename = (
            f"{Path(main_audio_filename).stem}_{Path(sound_effect_filename).stem}.wav"
        )
    combined_audio.export(output_filename, format="wav")
    return output_filename


def sound_generation(sound_generation_data: dict, output_file: str):
    client = ElevenLabs(
        api_key="YOUR_API_KEY",
    )
    audio = client.text_to_sound_effects.convert(
        text=sound_generation_data["text"],
        duration_seconds=sound_generation_data["duration_seconds"],
        prompt_influence=sound_generation_data["prompt_influence"],
    )
    save(audio, output_file)
    logger.error("Successfully generated sound effect to file: %s", output_file)


async def sound_generation_async(sound_generation_data: dict, output_file: str):
    client = AsyncElevenLabs(
        api_key="YOUR_API_KEY",
    )
    audio = await client.text_to_sound_effects.convert(
        text=sound_generation_data["text"],
        duration_seconds=sound_generation_data["duration_seconds"],
        prompt_influence=sound_generation_data["prompt_influence"],
    )
    save(audio, output_file)
    logger.error("Successfully generated sound effect to file: %s", output_file)
src/generate_emotional_voice.py
CHANGED
@@ -1,8 +1,9 @@
-from openai import OpenAI
 import json
+
 import requests
+from openai import OpenAI
 
-client = OpenAI(api_key
+client = OpenAI(api_key='')
 PROMT = """
 You should help me to make an audiobook with realistic emotion-based voice using TTS.
 You are tasked with adjusting the emotional tone of a given text
@@ -45,12 +46,12 @@ He sat down on the couch, his hand tracing the empty space beside him where she
 He knew he would never see her smile again, never hear her voice and that was unbearable. Yet, he couldn't reconcile himself with the fact that she was truly gone. 'How do I go on?' — he wondered, but there was no answer.
 """
 
+
 def generate_modified_text(text: str) -> dict:
     completion = client.chat.completions.create(
         model="gpt-4o",
-        messages=[{"role": "system", "content": PROMT},
-
-        response_format={"type": "json_object"}
+        messages=[{"role": "system", "content": PROMT}, {"role": "user", "content": text}],
+        response_format={"type": "json_object"},
     )
     chatgpt_output = completion.choices[0].message.content
     try:
@@ -64,17 +65,9 @@ def generate_audio(text: str, params: dict, output_file: str):
     CHUNK_SIZE = 1024
     url = "https://api.elevenlabs.io/v1/text-to-speech/pMsXgVXv3BLzUgSXRplE"
 
-    headers = {
-        "Accept": "audio/mpeg",
-        "Content-Type": "application/json",
-        "xi-api-key": ""
-    }
+    headers = {"Accept": "audio/mpeg", "Content-Type": "application/json", "xi-api-key": ""}
 
-    data = {
-        "text": text,
-        "model_id": "eleven_monolingual_v1",
-        "voice_settings": params
-    }
+    data = {"text": text, "model_id": "eleven_monolingual_v1", "voice_settings": params}
 
     response = requests.post(url, json=data, headers=headers)
     with open(f'{output_file}.mp3', 'wb') as f:
@@ -82,13 +75,14 @@ def generate_audio(text: str, params: dict, output_file: str):
             if chunk:
                 f.write(chunk)
 
+
 if __name__ == "__main__":
-    default_param = {
-        "stability": 0.5,
-        "similarity_boost": 0.5,
-        "style": 0.5
-    }
+    default_param = {"stability": 0.5, "similarity_boost": 0.5, "style": 0.5}
     generate_audio(text_to_modified, default_param, "text_without_prompt")
     modified_text_with_params = generate_modified_text(text_to_modified)
     print(modified_text_with_params)
-    generate_audio(
+    generate_audio(
+        modified_text_with_params['modified_text'],
+        modified_text_with_params['params'],
+        "text_with_prompt",
+    )
src/lc_callbacks.py
CHANGED
@@ -1,9 +1,9 @@
 import typing as t
 
 from langchain_core.callbacks import AsyncCallbackHandler
+from langchain_core.messages import BaseMessage
 from langchain_core.outputs import ChatGeneration
 from langchain_core.outputs.llm_result import LLMResult
-from langchain_core.messages import BaseMessage
 
 from src.config import logger
 
@@ -45,13 +45,9 @@ class LCMessageLoggerAsync(AsyncCallbackHandler):
         """Run when LLM ends running."""
         generations = response.generations
         if len(generations) != 1:
-            raise ValueError(
-                f'expected "generations" to have len 1, got: {len(generations)}'
-            )
+            raise ValueError(f'expected "generations" to have len 1, got: {len(generations)}')
         if len(generations[0]) != 1:
-            raise ValueError(
-                f'expected "generations[0]" to have len 1, got: {len(generations[0])}'
-            )
+            raise ValueError(f'expected "generations[0]" to have len 1, got: {len(generations[0])}')
 
         if self._log_raw_llm_response is True:
             gen: ChatGeneration = generations[0][0]
src/preprocess_tts_emotions_chain.py
ADDED
@@ -0,0 +1,73 @@
import json

import openai
from elevenlabs import VoiceSettings

from src.config import (
    DEFAULT_TTS_SIMILARITY_BOOST,
    DEFAULT_TTS_STABILITY,
    DEFAULT_TTS_STABILITY_ACCEPTABLE_RANGE,
    DEFAULT_TTS_STYLE,
    OPENAI_API_KEY,
    logger,
)
from src.prompts import EMOTION_STABILITY_MODIFICATION
from src.schemas import TTSParams
from src.utils import GPTModels, auto_retry


class TTSParamProcessor:

    # TODO: refactor to langchain function (?)

    def __init__(self):
        self.client = openai.AsyncOpenAI(api_key=OPENAI_API_KEY)

    @staticmethod
    def _wrap_results(data: dict, default_text: str) -> TTSParams:
        stability = data.get('stability', DEFAULT_TTS_STABILITY)
        stability = max(stability, DEFAULT_TTS_STABILITY_ACCEPTABLE_RANGE[0])
        stability = min(stability, DEFAULT_TTS_STABILITY_ACCEPTABLE_RANGE[1])

        similarity_boost = DEFAULT_TTS_SIMILARITY_BOOST
        style = DEFAULT_TTS_STYLE

        params = TTSParams(
            # NOTE: voice will be set later in the builder pipeline
            voice_id='',
            text=default_text,
            # reference: https://elevenlabs.io/docs/speech-synthesis/voice-settings
            voice_settings=VoiceSettings(
                stability=stability,
                similarity_boost=similarity_boost,
                style=style,
                use_speaker_boost=False,
            ),
        )
        return params

    @auto_retry
    async def run(self, text: str) -> TTSParams:
        text_prepared = text.strip()

        completion = await self.client.chat.completions.create(
            model=GPTModels.GPT_4o,
            messages=[
                {"role": "system", "content": EMOTION_STABILITY_MODIFICATION},
                {"role": "user", "content": text_prepared},
            ],
            response_format={"type": "json_object"},
        )
        chatgpt_output = completion.choices[0].message.content
        if chatgpt_output is None:
            raise ValueError(f'received None as openai response content')

        try:
            output_dict = json.loads(chatgpt_output)
            logger.info(f"TTS text processing succeeded: {output_dict}")
        except json.JSONDecodeError as e:
            logger.exception(f"Error in parsing LLM output: '{chatgpt_output}'")
            raise e

        output_wrapped = self._wrap_results(output_dict, default_text=text_prepared)
        return output_wrapped
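A minimal usage sketch of the new TTSParamProcessor (illustrative only, not part of the commit; it assumes src/ is importable and that src/config picks up a valid OPENAI_API_KEY):

import asyncio

from src.preprocess_tts_emotions_chain import TTSParamProcessor

async def main() -> None:
    processor = TTSParamProcessor()
    # The input is a phrase already modified upstream (uppercased words, "!", "?", "..." added),
    # so the LLM only has to pick a "stability" value within the allowed range.
    params = await processor.run("I CAN'T believe this is happening... Who would expect it??")
    # voice_id is intentionally left empty here; the builder pipeline assigns the voice later.
    print(params.voice_settings)

if __name__ == "__main__":
    asyncio.run(main())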
src/prompts.py
CHANGED
@@ -1,92 +1,4 @@
-class
-    SYSTEM = """\
-You are a helpful assistant proficient in literature and language.
-Imagine you are helping to prepare the provided text for narration to create the audio book.
-We need to understand how many voice actors we need to hire and how to split the text between them.
-
-Your task is to help with this process, namely:
-1. Identify all book characters occuring in the text, including "narrator".
-We will hire individual voice actor for each one of them.
-2. Split the text provided by characters. Let's refer to each split as "part".
-Order of parts MUST be the same as in the original text.
-
-Details:
-- First, analyze the whole text to extract the list of characters.
-Put found characters to corresponding output field.
-- Then, analyze the text top-down and as you proceed fill the "parts" field
-- Each part must be attributed to a single character.
-Character must belong to the "characters" list
-- Use "narrator" character for any descriptive or narrative text,
-such as actions ("He shook his head"), narrative parts ("I thought")
-thoughts, or descriptions that aren't part of spoken dialogue
-- In some books narrator is one of the main characters, having its own name and phrases.
-In this case, use regualar character name instead of "narrator" role
-- If it's impossible to identify character name from the text provided, use codes "c1", "c2", etc,
-where "c" prefix means character and number is used to enumerate unknown characters
-
-Format your answer as a following JSON:
-{{
-"characters": [list of unique character names that are found in the text provided],
-"parts":
-[
-{{
-"character": <character name>, "text": <the part's text>
-}}
-]
-}}
-
-Ensure the order of the parts in the JSON output matches the original order of the text.
-
-Examples of text split by characters, already in the target format.
-
-Example 1.
-{{
-"characters": ["Mr. Gatz", "narrator"],
-"parts":
-[
-{{"character": "Mr. Gatz", "text": "“Gatz is my name.”"}},
-{{"character": "narrator", "text": "“—Mr. Gatz. I thought you might want to take the body West.” He shook his head."}},
-{{"character": "Mr. Gatz", "text": "“Jimmy always liked it better down East. He rose up to his position in the East. Were you a friend of my boy’s, Mr.—?”"}},
-{{"character": "narrator", "text": "“We were close friends.”"}},
-{{"character": "Mr. Gatz", "text": "“He had a big future before him, you know. He was only a young man, but he had a lot of brain power here.”"}},
-{{"character": "narrator", "text": "He touched his head impressively, and I nodded."}},
-{{"character": "Mr. Gatz", "text": "“If he’d of lived, he’d of been a great man. A man like James J. Hill. He’d of helped build up the country.”"}},
-{{"character": "narrator", "text": "“That’s true,” I said, uncomfortably."}},
-{{"character": "Mr. Gatz", "text": "He fumbled at the embroidered coverlet, trying to take it from the bed, and lay down stiffly—was instantly asleep."}},
-]
-}}
-
-Example 2.
-{{
-'characters': [
-'narrator',
-'Mr. Carraway',
-'Daisy',
-'Miss Baker',
-'Tom',
-'Nick'
-],
-'parts': [
-{{'character': 'narrator', 'text': '“If you’ll get up.”'}},
-{{'character': 'Mr. Carraway', 'text': '“I will. Good night, Mr. Carraway. See you anon.”'}},
-{{'character': 'Daisy', 'text': '“Of course you will,” confirmed Daisy. “In fact I think I’ll arrange a marriage. Come over often, Nick, and I’ll sort of—oh—fling you together. You know—lock you up accidentally in linen closets and push you out to sea in a boat, and all that sort of thing—”'}},
-{{'character': 'Miss Baker', 'text': '“Good night,” called Miss Baker from the stairs. “I haven’t heard a word.”'}},
-{{'character': 'Tom', 'text': '“She’s a nice girl,” said Tom after a moment. “They oughtn’t to let her run around the country this way.”'}},
-{{'character': 'Daisy', 'text': '“Who oughtn’t to?” inquired Daisy coldly.'}},
-{{'character': 'narrator', 'text': '“Her family.”'}},
-{{'character': 'narrator', 'text': '“Her family is one aunt about a thousand years old. Besides, Nick’s going to look after her, aren’t you, Nick? She’s going to spend lots of weekends out here this summer. I think the home influence will be very good for her.”'}},
-{{'character': 'narrator', 'text': 'Daisy and Tom looked at each other for a moment in silence.'}}
-]
-}}
-"""
-
-    USER = """\
-Here is the book sample:
----
-{text}"""
-
-
-class SplitTextPromptV2:
+class SplitTextPrompt:
     SYSTEM = """\
 you are provided with the book sample.
 please rewrite it and insert xml tags indicating character to whom current phrase belongs.
@@ -111,6 +23,36 @@ Here is the book sample:
 {text}"""
 
 
+class ModifyTextPrompt:
+    SYSTEM = """\
+You are provided with the book sample.
+You should help me to make an audiobook with exaggerated emotion-based voice using Text-to-Speech models.
+Your task is to adjust the emotional tone of a given text by modifying the text in the following ways:
+- add special characters: "!" (adds emphasis), "?" (enhances question intonation), "..." (adds pause)
+- write words in uppercase - to add emphasis or convey emotion
+
+For example:
+Text: "I can't believe this is happening. Who would expect it?"
+Output text: "I CAN'T believe this is happening... Who would expect it??"
+
+Notes:
+- Do not remove or add any words!
+- You are allowed ONLY to add "!", "?", "..." symbols and re-write existing words in uppercase!
+- To add more emotions, you can duplicate exclamation or question marks, for example: "!!!" or "???"
+- DO NOT place "!" or "?" INSIDE existing sentences, since it breaks the sentence in parts
+- Be generous on pauses between sentences or between the obviously different parts of the same sentence.
+Reason is TTS model tends to dub with fast speed.
+- But don't add too many pauses within one sentence! Add them only where needed.
+- Remember: sentences must sound naturally, in the way a profession voice actor would read it!
+- DO NOT add pauses in the very end of the given text!
+"""
+
+    USER = """\
+Here is the book sample:
+---
+{text}"""
+
+
 class CharacterVoicePropertiesPrompt:
     SYSTEM = """\
 You are a helpful assistant proficient in literature and psychology.
@@ -156,3 +98,208 @@ NOTES:
 {characters}
 </characters>
 """
+
+
+class SoundEffectsPrompt:
+    SYSTEM = """\
+You are an expert in directing audiobooks creation.
+Your task is to design sound effects (by writing their text description) layed over the voice actors narration.
+Sound effects descriptions are going to be passed to text-to-sound-effect AI model.
+Sound effects must enhance storytelling and evoke immersive experience in listeners.
+
+You are provided with the audiobook text chunk -
+you must insert XML tags containing prompts for AI model describing sound effects.
+
+XML effect tags must have following structure:
+<effect prompt="prompt to be passed to text-to-sound-effect AI model">original line from the text</effect>
+
+WRITE PROMPTS TO BE VERY RICH IN DETAILS, precisely describing the effect!
+Your prompts MUST BE SPECIFIC, AVOID ABSTRACT sounds like "sound of a cozy room".
+
+Generated sound effect will be overlayed over the text between the opening and the closing effect XML tag.
+Use tags position to control start time of the effect and its duration.
+
+Additional requirements:
+- In the very beginning, analyze the whole text chunk provided in order to understand events and atmosphere.
+- Aim for episodical sound effects, highlighting atmosphere and characters' actions.
+For example, cracking of stairs, wind blowing, car honks, sound of a falling book, ticking clock
+- NEVER generate background music
+- NEVER generate ambient sounds, for example people's voices, sound of the crowd
+- NEVER generate sounds for gestures, for example for a hand raised in the air.
+- NEVER generate effects for sounds people may produce: laughing, giggling, sobbing, crying, talking, singing, screaming.
+- NEVER generate silence, since it's a too abstract effect
+- The text-to-sound-effects model is able to generate only short audio files, up to 5 seconds long
+- Aim to position sound effects at the most intuitive points for a natural, immersive experience.
+For example, instead of placing the sound effect only on a single word or object (like "stairs"),
+tag a broader phrase making the effect feel part of the action or dialogue.
+- It's allowed to add no sound effects
+
+Examples of bad prompts:
+1. "brief silence, creating a moment of tension" - it's too short, not specific and is an ambient sound.
+2. "brief, tense silence, filled with unspoken words and a slight undercurrent of tension" - very abstract, and breaks the rule of not generating silence
+3. "sudden burst of bright light filling a room, creating a warm and inviting atmosphere" - abstract
+4. "sudden, commanding gesture of a hand being lifted, creating a brief pause in conversation" - abstract
+5. "exaggerated, humorous emphasis on the age, suggesting an ancient, creaky presence"
+
+Examples of good prompts:
+1. "soft rustling of paper as a page is turned, delicate and gentle"
+2. "light thud of a magazine landing on a wooden table, slightly echoing in the quiet room"
+3. "Old wooden staircase creaking under slow footsteps, each step producing uneven crackles, groans, and occasional sharp snaps, emphasizing age and fragility in a quiet, echoing space" - it's specific and rich in details
+
+Response with the original text with selected phrases wrapped inside emotion XML tags.
+Do not modify original text!
+Do not include anythin else in your answer.
+"""
+
+    USER = """\
+{text}
+"""
+
+
+# TODO: this prompt is not used
+PREFIX = """\
+You should help me to make an audiobook with realistic emotion sound using TTS.
+You are tasked with generating a description of sound effects
+that matches the atmosphere, actions, and tone of a given sentence or text from a book.
+The description should be tailored to create a sound effect using ElevenLabs'sound generation API.
+The generated sound description must evoke the scene
+or emotions from the text (e.g., footsteps, wind, tense silence, etc.),
+and it should be succinct and fit the mood of the text."""
+
+# TODO: this prompt is not used
+SOUND_EFFECT_GENERATION = f"""
+{PREFIX}
+
+Additionally, you should include the following parameters in your response:
+
+Text: A generated description of the sound that matches the text provided.
+Keep the description simple and effective to capture the soundscape.
+This text will be converted into a sound effect.
+Duration_seconds: The appropriate duration of the sound effect,
+which should be calculated based on the length and nature of the scene.
+Cap this duration at 22 seconds. But be carefully, for very long text in input make a long sound effect,
+for small make a small one. And the duration should be similar to duration of input text
+Prompt_influence: A value between 0 and 1, where a higher value makes the sound generation closely
+follow the sound description. For general sound effects (e.g., footsteps, background ambiance),
+use a value around 0.3. For more specific or detailed sound scenes
+(e.g., thunderstorm, battle sounds), use a higher value like 0.5 to 0.7.
+
+Your output should be in the following JSON format:
+
+{{
+"text": "A soft breeze rustling through leaves, distant birds chirping.",
+"duration_seconds": 4.0,
+"prompt_influence": 0.4
+}}
+
+NOTES:
+- NEVER add any speech or voices in your instructions!
+- NEVER add any music in your instructions!
+- NEVER add city sounds, car honks in your instructions!
+- make your text descriptions VERY SPECIFIC, AVOID vague instructions.
+If it's necessary, you can use couple sentences to formulate the instruction.
+But remember to use keep instructions simple.
+- aim to create specific sounds, like crackling fireplace, footsteps, wind, etc...
+"""
+
+# TODO: this prompt is not used
+SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION = f"""
+{PREFIX}
+
+Additionally, you should include the following parameters in your response:
+
+Text: A generated description of the sound that matches the text provided.
+Keep the description simple and effective to capture the soundscape.
+This text will be converted into a sound effect.
+Prompt_influence: A value between 0 and 1, where a higher value makes the sound generation closely
+follow the sound description. For general sound effects (e.g., footsteps, background ambiance),
+use a value around 0.3. For more specific or detailed sound scenes
+(e.g., thunderstorm, battle sounds), use a higher value like 0.5 to 0.7.
+
+Your output should be in the following JSON format:
+
+{{
+"text": "A soft breeze rustling through leaves, distant birds chirping.",
+"prompt_influence": 0.4
+}}"""
+
+
+EMOTION_STABILITY_MODIFICATION = """
+You should help me to make an audiobook with exaggerated emotion-based voice using Text-to-Speech.
+Your single task it to select "stability" TTS parameter value,
+based on the emotional intensity level in the provided text chunk.
+
+Provided text was previously modified by uppercasing some words and adding "!", "?", "..." symbols.
+The more there are uppercase words or "!", "?", "..." symbols, the higher emotional intensity level is.
+Higher emotional intensity must be associated with lower values of "stability" parameter,
+and lower emotional intensity must be associated with higher "stability" values.
+Low "stability" makes TTS to generate more expressive, less stable speech - better suited to convey emotional range.
+
+Available range for "stability" values is [0.3; 0.8].
+
+You MUST answer with the following JSON,
+containing a SINGLE "stability" parameter with selected value:
+{"stability": float}
+DO NOT INCLUDE ANYTHING ELSE in your response.
+
+Example:
+Input: "I CAN'T believe this is happening... Who would expect it??"
+Expected output: {"stability": 0.4}
+"""
+
+# TODO: this prompt is not used
+TEXT_MODIFICATION_WITH_SSML = """
+You should help me to make an audiobook with overabundant emotion-based voice using TTS.
+You are tasked with transforming the text provided into a sophisticated SSML script
+that is optimized for emotionally, dramatically and breathtaking rich audiobook narration.
+Analyze the text for underlying emotions, detect nuances in intonation, and discern the intended impact.
+Apply suitable SSML enhancements to ensure that the final TTS output delivers
+a powerful, engaging, dramatic and breathtaking listening experience appropriate for an audiobook context
+(more effects/emotions are better than less)."
+
+Please, use only provided SSML tags and don't generate any other tags.
+Key SSML Tags to Utilize:
+<speak>: This is the root element. All SSML content to be synthesized must be enclosed within this tag.
+<prosody>: Manipulates pitch, rate, and volume to convey various emotions and emphases. Use this tag to adjust the voice to match the mood and tone of different parts of the narrative.
+<break>: Inserts pauses of specified durations. Use this to create natural breaks in speech, aiding in dramatic effect and better comprehension for listeners.
+<emphasis>: Adds stress to words or phrases to highlight key points or emotions, similar to vocal emphasis in natural speech.
+<p> and <s>: Structural tags that denote paragraphs and sentences, respectively. They help to manage the flow and pacing of the narrative appropriately.
+
+Input Text Example: "He stood there, gazing into the endless horizon. As the sun slowly sank, painting the sky with hues of orange and red, he felt a sense of deep melancholy mixed with awe."
+
+Modified text should be in the XML format. Expected SSML-enriched Output:
+
+<speak>
+<p>
+<s>
+He stood there, <prosody rate="slow" volume="soft">gazing into the endless horizon.</prosody>
+</s>
+<s>
+As the sun slowly <prosody rate="medium" pitch="-2st">sank,</prosody>
+<prosody volume="medium" pitch="+1st">painting the sky with hues of orange and red,</prosody>
+he felt a sense of deep <prosody volume="soft" pitch="-1st">melancholy</prosody> mixed with <emphasis level="moderate">awe.</emphasis>
+</s>
+</p>
+</speak>
+
+After modifying the text, adjust the "stability", "similarity_boost" and "style" parameters
+according to the level of emotional intensity in the modified text.
+Higher emotional intensity should lower the "stability" and raise the "similarity_boost".
+Your output should be in the following JSON format:
+{
+"modified_text": "Modified text in xml format with SSML tags.",
+"params": {
+"stability": 0.7,
+"similarity_boost": 0.5,
+"style": 0.3
+}
+}
+
+The "stability" parameter should range from 0 to 1,
+with lower values indicating a more expressive, less stable voice.
+The "similarity_boost" parameter should also range from 0 to 1,
+with higher values indicating more emphasis on the voice similarity.
+The "style" parameter should also range from 0 to 1,
+where lower values indicate a neutral tone and higher values reflect more stylized or emotional delivery.
+Adjust both according to the emotional intensity of the text.
+"""
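For orientation, the effect-tag format that SoundEffectsPrompt asks the model to emit would look roughly like the following (an invented illustration, not output taken from the repository):

He climbed the old staircase. <effect prompt="Old wooden staircase creaking under slow footsteps, uneven crackles and groans echoing in a quiet hallway">The steps groaned under his weight as he went up</effect> and the house fell silent again.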
src/schemas.py
ADDED
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
|
3 |
+
import base64
|
4 |
+
import typing as t
|
5 |
+
from enum import StrEnum
|
6 |
+
|
7 |
+
import pandas as pd
|
8 |
+
from elevenlabs import VoiceSettings
|
9 |
+
from pydantic import BaseModel, ConfigDict, Field
|
10 |
+
|
11 |
+
from src import utils
|
12 |
+
|
13 |
+
|
14 |
+
class AudioOutputFormat(StrEnum):
|
15 |
+
MP3_22050_32 = "mp3_22050_32"
|
16 |
+
MP3_44100_32 = "mp3_44100_32"
|
17 |
+
MP3_44100_64 = "mp3_44100_64"
|
18 |
+
MP3_44100_96 = "mp3_44100_96"
|
19 |
+
MP3_44100_128 = "mp3_44100_128"
|
20 |
+
MP3_44100_192 = "mp3_44100_192"
|
21 |
+
PCM_16000 = "pcm_16000"
|
22 |
+
PCM_22050 = "pcm_22050"
|
23 |
+
PCM_24000 = "pcm_24000"
|
24 |
+
PCM_44100 = "pcm_44100"
|
25 |
+
ULAW_8000 = "ulaw_8000"
|
26 |
+
|
27 |
+
|
28 |
+
class ExtraForbidModel(BaseModel):
|
29 |
+
model_config = ConfigDict(extra="forbid")
|
30 |
+
|
31 |
+
|
32 |
+
# use Ellipsis to mark omitted function parameter.
|
33 |
+
# cast it to Any type to avoid warnings from type checkers
|
34 |
+
# exact same approach is used in elevenlabs client.
|
35 |
+
OMIT = t.cast(t.Any, ...)
|
36 |
+
|
37 |
+
|
38 |
+
class TTSParams(ExtraForbidModel):
|
39 |
+
# NOTE: pydantic treats Ellipsis as a mark of a required field.
|
40 |
+
# in order to set Ellipsis as actual field default value, we need to use workaround
|
41 |
+
# and use Field's default_factory
|
42 |
+
|
43 |
+
voice_id: str
|
44 |
+
text: str
|
45 |
+
# enable_logging: typing.Optional[bool] = None
|
46 |
+
|
47 |
+
# NOTE: we opt for quality over speed - thus don't use this param
|
48 |
+
# optimize_streaming_latency: typing.Optional[OptimizeStreamingLatency] = None
|
49 |
+
|
50 |
+
# NOTE: here we set default different from 11labs API
|
51 |
+
# output_format: AudioOutputFormat = AudioOutputFormat.MP3_44100_128
|
52 |
+
output_format: AudioOutputFormat = AudioOutputFormat.MP3_44100_192
|
53 |
+
|
54 |
+
# NOTE: pydantic has protected "model_" namespace.
|
55 |
+
# here we use workaround to pass "model_id" param to 11labs client
|
56 |
+
# via serialization_alias
|
57 |
+
audio_model_id: t.Optional[str] = Field(
|
58 |
+
default_factory=lambda: OMIT, serialization_alias="model_id"
|
59 |
+
)
|
60 |
+
|
61 |
+
language_code: t.Optional[str] = Field(default_factory=lambda: OMIT)
|
62 |
+
|
63 |
+
# reference: https://elevenlabs.io/docs/speech-synthesis/voice-settings
|
64 |
+
voice_settings: t.Optional[VoiceSettings] = Field(default_factory=lambda: OMIT)
|
65 |
+
|
66 |
+
# pronunciation_dictionary_locators: t.Optional[
|
67 |
+
# t.Sequence[PronunciationDictionaryVersionLocator]
|
68 |
+
# ] = Field(default_factory=lambda: OMIT)
|
69 |
+
seed: t.Optional[int] = Field(default_factory=lambda: OMIT)
|
70 |
+
previous_text: t.Optional[str] = Field(default_factory=lambda: OMIT)
|
71 |
+
next_text: t.Optional[str] = Field(default_factory=lambda: OMIT)
|
72 |
+
previous_request_ids: t.Optional[t.Sequence[str]] = Field(default_factory=lambda: OMIT)
|
73 |
+
next_request_ids: t.Optional[t.Sequence[str]] = Field(default_factory=lambda: OMIT)
|
74 |
+
# request_options: t.Optional[RequestOptions] = None
|
75 |
+
|
76 |
+
def to_dict(self):
|
77 |
+
"""
|
78 |
+
dump the pydantic model in the format required by 11labs api.
|
79 |
+
|
80 |
+
NOTE: we need to use `by_alias=True` in order to correctly handle
|
81 |
+
alias for `audio_model_id` field,
|
82 |
+
since model_id belongs to pydantic protected namespace.
|
83 |
+
|
84 |
+
NOTE: we also ignore all fields with default Ellipsis value,
|
85 |
+
since 11labs will assign Ellipses itself,
|
86 |
+
and we won't get any warning in logs.
|
87 |
+
"""
|
88 |
+
ellipsis_fields = {field for field, value in self if value is ...}
|
89 |
+
res = self.model_dump(by_alias=True, exclude=ellipsis_fields)
|
90 |
+
return res
|
91 |
+
|
92 |
+
|
93 |
+
class TTSTimestampsAlignment(ExtraForbidModel):
|
94 |
+
characters: list[str]
|
95 |
+
character_start_times_seconds: list[float]
|
96 |
+
character_end_times_seconds: list[float]
|
97 |
+
_text_joined: str
|
98 |
+
|
99 |
+
def __init__(self, **data):
|
100 |
+
super().__init__(**data)
|
101 |
+
self._text_joined = "".join(self.characters)
|
102 |
+
|
103 |
+
@property
|
104 |
+
def text_joined(self):
|
105 |
+
return self._text_joined
|
106 |
+
|
107 |
+
def to_dataframe(self):
|
108 |
+
return pd.DataFrame(
|
109 |
+
{
|
110 |
+
"char": self.characters,
|
111 |
+
"start": self.character_start_times_seconds,
|
112 |
+
"end": self.character_end_times_seconds,
|
113 |
+
}
|
114 |
+
)
|
115 |
+
|
116 |
+
@classmethod
|
117 |
+
def combine_alignments(
|
118 |
```python
        cls,
        alignments: list[TTSTimestampsAlignment],
        add_placeholders: bool = False,
        pause_bw_chunks_s: float = 0.2,
    ) -> TTSTimestampsAlignment:
        """
        Combine alignments created for different TTS phrases into a single alignment
        for the whole text.

        NOTE: while splitting the original text into character phrases,
        we ignore separators between phrases.
        They may be different: single or multiple spaces, newlines, etc.
        To account for them we insert a fixed pause and placeholder characters
        between phrases in the final alignment.
        This gives us an approximation of the real timestamp mapping
        for voicing the whole original text.

        NOTE: The quality of such approximation seems appropriate,
        considering the amount of time required to implement a more accurate mapping.
        """

        chars = []
        starts = []
        ends = []
        prev_chunk_end_time = 0.0
        n_alignments = len(alignments)

        for ix, a in enumerate(alignments):
            cur_starts_absolute = [prev_chunk_end_time + s for s in a.character_start_times_seconds]
            cur_ends_absolute = [prev_chunk_end_time + e for e in a.character_end_times_seconds]

            chars.extend(a.characters)
            starts.extend(cur_starts_absolute)
            ends.extend(cur_ends_absolute)

            if ix < n_alignments - 1 and add_placeholders:
                chars.append('#')
                placeholder_start = cur_ends_absolute[-1]
                starts.append(placeholder_start)
                ends.append(placeholder_start + pause_bw_chunks_s)

            prev_chunk_end_time = ends[-1]

        return cls(
            characters=chars,
            character_start_times_seconds=starts,
            character_end_times_seconds=ends,
        )

    def filter_chars_without_duration(self):
        """
        Create a new class instance with zero-duration characters removed.
        Needed to provide correct alignment when overlaying sound effects.
        """
        df = self.to_dataframe()
        mask = (df['start'] - df['end']).abs() > 1e-5
        df = df[mask]

        res = TTSTimestampsAlignment(
            characters=df['char'].to_list(),
            character_start_times_seconds=df['start'].to_list(),
            character_end_times_seconds=df['end'].to_list(),
        )

        return res

    def get_start_time_by_char_ix(self, char_ix: int, safe=True):
        if safe:
            char_ix = utils.get_collection_safe_index(
                ix=char_ix,
                collection=self.character_start_times_seconds,
            )
        return self.character_start_times_seconds[char_ix]

    def get_end_time_by_char_ix(self, char_ix: int, safe=True):
        if safe:
            char_ix = utils.get_collection_safe_index(
                ix=char_ix,
                collection=self.character_end_times_seconds,
            )
        return self.character_end_times_seconds[char_ix]


class TTSTimestampsResponse(ExtraForbidModel):
    audio_base64: str
    alignment: TTSTimestampsAlignment
    normalized_alignment: TTSTimestampsAlignment

    @property
    def audio_bytes(self):
        return base64.b64decode(self.audio_base64)

    def write_audio_to_file(self, filepath_no_ext: str, audio_format: AudioOutputFormat) -> str:
        if audio_format.startswith("pcm_"):
            sr = int(audio_format.removeprefix("pcm_"))
            fp = f"{filepath_no_ext}.wav"
            utils.write_raw_pcm_to_file(
                data=self.audio_bytes,
                fp=fp,
                n_channels=1,  # seems like it's always 1 channel
                bytes_depth=2,  # seems like it's always 2 bytes
                sampling_rate=sr,
            )
            return fp
        elif audio_format.startswith("mp3_"):
            fp = f"{filepath_no_ext}.mp3"
            # received mp3 seems to already contain all required metadata,
            # such as sampling rate and sample width
            utils.write_bytes(data=self.audio_bytes, fp=fp)
            return fp
        else:
            raise ValueError(f"don't know how to write audio format: {audio_format}")


class SoundEffectsParams(ExtraForbidModel):
    text: str
    duration_seconds: float | None
    prompt_influence: float | None
```
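The placeholder step in the alignment merge above is easier to follow in isolation. Below is a minimal, self-contained sketch of the same rule, shifting each phrase's timestamps by the running end time and inserting a `#` placeholder with a fixed pause between phrases; the plain-dict structure and helper name are illustrative only, not the project API.

```python
# Minimal sketch of the alignment-merging rule above (illustrative; not the project API).
PAUSE_BW_CHUNKS_S = 0.2


def merge_alignments(per_phrase: list[dict]) -> dict:
    """Each dict holds 'chars', 'starts', 'ends' relative to that phrase's own audio."""
    chars, starts, ends = [], [], []
    prev_end = 0.0
    for i, a in enumerate(per_phrase):
        starts.extend(prev_end + s for s in a["starts"])
        ends.extend(prev_end + e for e in a["ends"])
        chars.extend(a["chars"])
        if i < len(per_phrase) - 1:
            # '#' stands in for the separator lost when the text was split into phrases
            chars.append("#")
            starts.append(ends[-1])
            ends.append(ends[-1] + PAUSE_BW_CHUNKS_S)
        prev_end = ends[-1]
    return {"chars": chars, "starts": starts, "ends": ends}


print(merge_alignments([
    {"chars": list("Hi"), "starts": [0.0, 0.1], "ends": [0.1, 0.2]},
    {"chars": list("Bob"), "starts": [0.0, 0.2, 0.4], "ends": [0.2, 0.4, 0.6]},
]))
```

In this toy run the second phrase ends up offset by 0.4 s (0.2 s of speech plus the 0.2 s placeholder pause), which is what keeps character positions roughly aligned with the original, un-split text.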
src/select_voice_chain.py
CHANGED
```diff
@@ -10,10 +10,9 @@ from langchain_core.prompts import (
 from langchain_core.runnables import RunnablePassthrough
 from pydantic import BaseModel
 
-from src.config import logger
+from src.config import VOICES_CSV_FP, logger
 from src.prompts import CharacterVoicePropertiesPrompt
 from src.utils import GPTModels, get_chat_llm
-from src.config import VOICES_CSV_FP
 
 
 class Property(StrEnum):
@@ -121,9 +120,7 @@ class VoiceSelector:
 
         return character2voice
 
-    def _remove_hallucinations_single_character(
-        self, character_props: CharacterProperties
-    ):
+    def _remove_hallucinations_single_character(self, character_props: CharacterProperties):
         def _process_prop(prop: Property, value: str):
             if value not in self.PROPERTY_VALUES[prop]:
                 logger.warning(
@@ -134,9 +131,7 @@ class VoiceSelector:
 
         return CharacterPropertiesNullable(
             gender=_process_prop(prop=Property.gender, value=character_props.gender),
-            age_group=_process_prop(
-                prop=Property.age_group, value=character_props.age_group
-            ),
+            age_group=_process_prop(prop=Property.age_group, value=character_props.age_group),
         )
 
     def remove_hallucinations(
@@ -167,28 +162,20 @@ class VoiceSelector:
 
         prompt = ChatPromptTemplate.from_messages(
             [
-                SystemMessagePromptTemplate.from_template(
-                    CharacterVoicePropertiesPrompt.SYSTEM
-                ),
-                HumanMessagePromptTemplate.from_template(
-                    CharacterVoicePropertiesPrompt.USER
-                ),
+                SystemMessagePromptTemplate.from_template(CharacterVoicePropertiesPrompt.SYSTEM),
+                HumanMessagePromptTemplate.from_template(CharacterVoicePropertiesPrompt.USER),
             ]
         )
         prompt = prompt.partial(
             **{
                 "available_genders": self.get_available_properties_str(Property.gender),
-                "available_age_groups": self.get_available_properties_str(
-                    Property.age_group
-                ),
+                "available_age_groups": self.get_available_properties_str(Property.age_group),
                 "format_instructions": format_instructions,
             }
         )
 
         chain = (
-            RunnablePassthrough.assign(
-                charater_props=prompt | llm | self.remove_hallucinations
-            )
+            RunnablePassthrough.assign(charater_props=prompt | llm | self.remove_hallucinations)
             | RunnablePassthrough.assign(character2voice=self.get_voices)
             | self.pack_results
         )
```
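The `_remove_hallucinations_single_character` change above is a pure reflow; the guard it wraps simply maps any LLM-returned property value outside the allowed set to `None` and logs a warning. A toy version of that behaviour, with made-up allowed values, looks like this:

```python
# Toy illustration of the hallucination guard (allowed values are invented for the example).
ALLOWED = {"gender": {"male", "female"}, "age_group": {"young", "middle_aged", "old"}}


def process_prop(prop: str, value: str) -> str | None:
    if value not in ALLOWED[prop]:
        print(f"warning: unexpected {prop}={value!r} returned by the LLM, dropping it")
        return None
    return value


print(process_prop("age_group", "teen"))  # None, with a warning
print(process_prop("gender", "female"))   # 'female'
```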
src/sound_effects_design.py
ADDED
@@ -0,0 +1,99 @@
```python
import re

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)
from langchain_core.runnables import RunnablePassthrough
from pydantic import BaseModel

from src import prompts
from src.utils import GPTModels, get_chat_llm


class SoundEffectDescription(BaseModel):
    prompt: str
    text_between_tags: str
    # indices relative to LLM response
    ix_start_llm_response: int
    ix_end_llm_response: int
    # indices relative to original text passed to LLM
    ix_start_orig_text: int
    ix_end_orig_text: int
    # NOTE: start_sec and duration_sec fields
    # are going to be filled once TTS audio is generated
    start_sec: float = -1.0
    duration_sec: float = -1.0


class SoundEffectsDesignOutput(BaseModel):
    text_raw: str
    text_annotated: str
    _sound_effects_descriptions: list[SoundEffectDescription]

    @staticmethod
    def _parse_effects_xml_tags(text) -> list[SoundEffectDescription]:
        """
        We rely on the LLM to format the response correctly
        and currently don't try to fix possible errors.
        """
        # TODO: allow open-close tags
        # <effect prompt=\"(.*?)\" duration=\"(.*)\"/>

        pattern = re.compile(r"<effect prompt=(?:\"|')(.*?)(?:\"|')>(.*?)</effect>")
        all_matches = list(pattern.finditer(text))

        sound_effects_descriptions = []

        rm_chars_running_total = 0
        for m in all_matches:
            mstart, mend = m.span()
            prompt = m.group(1)
            text_between_tags = m.group(2)

            ix_start_orig = mstart - rm_chars_running_total
            ix_end_orig = ix_start_orig + len(text_between_tags)

            sound_effects_descriptions.append(
                SoundEffectDescription(
                    prompt=prompt,
                    text_between_tags=text_between_tags,
                    ix_start_llm_response=mstart,
                    ix_end_llm_response=mend,
                    ix_start_orig_text=ix_start_orig,
                    ix_end_orig_text=ix_end_orig,
                )
            )

            mlen = mend - mstart
            rm_chars_running_total += mlen - len(text_between_tags)

        return sound_effects_descriptions

    def __init__(self, **data):
        super().__init__(**data)
        self._sound_effects_descriptions = self._parse_effects_xml_tags(self.text_annotated)

    @property
    def sound_effects_descriptions(self) -> list[SoundEffectDescription]:
        return self._sound_effects_descriptions


def create_sound_effects_design_chain(llm_model: GPTModels):
    llm = get_chat_llm(llm_model=llm_model, temperature=0.0)

    prompt = ChatPromptTemplate.from_messages(
        [
            SystemMessagePromptTemplate.from_template(prompts.SoundEffectsPrompt.SYSTEM),
            HumanMessagePromptTemplate.from_template(prompts.SoundEffectsPrompt.USER),
        ]
    )

    chain = RunnablePassthrough.assign(text_annotated=prompt | llm | StrOutputParser()) | (
        lambda inputs: SoundEffectsDesignOutput(
            text_raw=inputs["text"], text_annotated=inputs["text_annotated"]
        )
    )
    return chain
```
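The index bookkeeping in `_parse_effects_xml_tags` maps each effect back to the untagged text by subtracting the number of tag characters consumed so far. A standalone illustration of the same arithmetic (the sample text and variable names are made up):

```python
import re

# Same regex as in _parse_effects_xml_tags above; the annotated sample is illustrative.
pattern = re.compile(r"<effect prompt=(?:\"|')(.*?)(?:\"|')>(.*?)</effect>")

annotated = 'He opened the door. <effect prompt="creaking door">The hinges screamed.</effect> Silence.'
removed = 0
for m in pattern.finditer(annotated):
    start, end = m.span()
    inner = m.group(2)
    orig_start = start - removed            # position in the text with tags stripped
    orig_end = orig_start + len(inner)
    removed += (end - start) - len(inner)   # tag characters removed so far
    print(m.group(1), orig_start, orig_end)  # creaking door 20 40
```

Stripping the tags from `annotated` gives "He opened the door. The hinges screamed. Silence.", where the effect text indeed occupies positions 20-40.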
src/text_modification_chain.py
ADDED
@@ -0,0 +1,34 @@
```python
from langchain.prompts import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from pydantic import BaseModel

from src.prompts import ModifyTextPrompt
from src.utils import GPTModels, get_chat_llm


class ModifiedTextOutput(BaseModel):
    text_raw: str
    text_modified: str


def modify_text_chain(llm_model: GPTModels):
    llm = get_chat_llm(llm_model=llm_model, temperature=0.0)

    prompt = ChatPromptTemplate.from_messages(
        [
            SystemMessagePromptTemplate.from_template(ModifyTextPrompt.SYSTEM),
            HumanMessagePromptTemplate.from_template(ModifyTextPrompt.USER),
        ]
    )

    chain = RunnablePassthrough.assign(text_modified=prompt | llm | StrOutputParser()) | (
        lambda inputs: ModifiedTextOutput(
            text_raw=inputs["text"], text_modified=inputs["text_modified"]
        )
    )
    return chain
```
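A hypothetical invocation of the new chain; the exact `GPTModels` member and any extra prompt inputs beyond `"text"` are assumptions here, since `ModifyTextPrompt` lives in `src/prompts.py` and is not shown in this diff.

```python
# Hypothetical usage sketch (enum member name and required input keys are assumed).
from src.text_modification_chain import modify_text_chain
from src.utils import GPTModels

chain = modify_text_chain(llm_model=GPTModels.GPT_4o)  # assumed member name
out = chain.invoke({"text": "Who's there? she whispered."})
print(out.text_raw)
print(out.text_modified)
```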
src/text_split_chain.py
CHANGED
```diff
@@ -9,7 +9,7 @@ from langchain_core.prompts import (
 from langchain_core.runnables import RunnablePassthrough
 from pydantic import BaseModel
 
-from src.prompts import
+from src.prompts import SplitTextPrompt
 from src.utils import GPTModels, get_chat_llm
 
 
@@ -63,66 +63,14 @@ def create_split_text_chain(llm_model: GPTModels):
 
     prompt = ChatPromptTemplate.from_messages(
         [
-            SystemMessagePromptTemplate.from_template(
-            HumanMessagePromptTemplate.from_template(
+            SystemMessagePromptTemplate.from_template(SplitTextPrompt.SYSTEM),
+            HumanMessagePromptTemplate.from_template(SplitTextPrompt.USER),
         ]
     )
 
-    chain = RunnablePassthrough.assign(
-        text_annotated=prompt | llm | StrOutputParser()
-    ) | (
+    chain = RunnablePassthrough.assign(text_annotated=prompt | llm | StrOutputParser()) | (
         lambda inputs: SplitTextOutput(
             text_raw=inputs["text"], text_annotated=inputs["text_annotated"]
         )
     )
     return chain
-
-
-###### old code ######
-
-
-class CharacterAnnotatedText(BaseModel):
-    phrases: list[CharacterPhrase]
-    _characters: list[str]
-
-    def __init__(self, **data):
-        super().__init__(**data)
-        self._characters = list(set(phrase.character for phrase in self.phrases))
-
-    @property
-    def characters(self):
-        return self._characters
-
-    def to_pretty_text(self):
-        lines = []
-        lines.append(f"characters: {self.characters}")
-        lines.append("-" * 20)
-        lines.extend(f"[{phrase.character}] {phrase.text}" for phrase in self.phrases)
-        res = "\n".join(lines)
-        return res
-
-
-class SplitTextOutputOld(BaseModel):
-    characters: list[str]
-    parts: list[CharacterPhrase]
-
-    def to_character_annotated_text(self):
-        return CharacterAnnotatedText(phrases=self.parts)
-
-
-def create_split_text_chain_old(llm_model: GPTModels):
-    llm = get_chat_llm(llm_model=llm_model, temperature=0.0)
-    llm = llm.with_structured_output(SplitTextOutputOld, method="json_mode")
-
-    prompt = ChatPromptTemplate.from_messages(
-        [
-            SystemMessagePromptTemplate.from_template(SplitTextPromptV1.SYSTEM),
-            HumanMessagePromptTemplate.from_template(SplitTextPromptV1.USER),
-        ]
-    )
-
-    chain = prompt | llm
-    return chain
-
-
-## end of old code ##
```
src/tts.py
CHANGED
```diff
@@ -1,32 +1,19 @@
 import typing as t
+from copy import deepcopy
 
 from dotenv import load_dotenv
-from elevenlabs.client import AsyncElevenLabs, ElevenLabs
 from elevenlabs import VoiceSettings
+from elevenlabs.client import AsyncElevenLabs
 
 load_dotenv()
 
-from src.config import
+from src.config import ELEVENLABS_API_KEY, logger
+from src.schemas import SoundEffectsParams, TTSParams, TTSTimestampsResponse
 from src.utils import auto_retry
 
-ELEVEN_CLIENT = ElevenLabs(api_key=ELEVENLABS_API_KEY)
-
 ELEVEN_CLIENT_ASYNC = AsyncElevenLabs(api_key=ELEVENLABS_API_KEY)
 
 
-def tts_stream(voice_id: str, text: str) -> t.Iterator[bytes]:
-    async_iter = ELEVEN_CLIENT.text_to_speech.convert(voice_id=voice_id, text=text)
-    for chunk in async_iter:
-        if chunk:
-            yield chunk
-
-
-def tts(voice_id: str, text: str):
-    tts_iter = tts_stream(voice_id=voice_id, text=text)
-    combined = b"".join(tts_iter)
-    return combined
-
-
 async def tts_astream(
     voice_id: str, text: str, params: dict | None = None
 ) -> t.AsyncIterator[bytes]:
@@ -50,26 +37,47 @@ async def tts_astream(
 
 
 @auto_retry
-async def tts_astream_consumed(
-    voice_id: str, text: str, params: dict | None = None
-) -> list[bytes]:
+async def tts_astream_consumed(voice_id: str, text: str, params: dict | None = None) -> list[bytes]:
     aiterator = tts_astream(voice_id=voice_id, text=text, params=params)
     return [x async for x in aiterator]
 
 
+@auto_retry
+async def tts_w_timestamps(params: TTSParams) -> TTSTimestampsResponse:
+    async def _tts_w_timestamps(params: TTSParams) -> TTSTimestampsResponse:
+        # NOTE: we need to use the special `to_dict()` method to ensure the pydantic model
+        # is converted to a dict with proper aliases
+        params_dict = params.to_dict()
+
+        params_no_text = deepcopy(params_dict)
+        text = params_no_text.pop('text')
+        logger.info(
+            f"request to 11labs TTS endpoint with params {params_no_text} "
+            f'for the following text: "{text}"'
+        )
+
+        response_raw = await ELEVEN_CLIENT_ASYNC.text_to_speech.convert_with_timestamps(
+            **params_dict
+        )
+
+        response_parsed = TTSTimestampsResponse.model_validate(response_raw)
+        return response_parsed
+
+    res = await _tts_w_timestamps(params=params)
+    return res
+
+
-async def sound_generation_astream(
-) ->
+async def sound_generation_astream(params: SoundEffectsParams) -> t.AsyncIterator[bytes]:
+    params_no_text = params.model_dump(exclude={"text"})
     logger.info(
-        f"request to 11labs sound effect generation with params {
-        f'for the following text: "{text}"'
+        f"request to 11labs sound effect generation with params {params_no_text} "
+        f'for the following text: "{params.text}"'
     )
 
     async_iter = ELEVEN_CLIENT_ASYNC.text_to_sound_effects.convert(
-        text=text,
-        duration_seconds=
-        prompt_influence=
+        text=params.text,
+        duration_seconds=params.duration_seconds,
+        prompt_influence=params.prompt_influence,
     )
     async for chunk in async_iter:
         if chunk:
@@ -77,6 +85,6 @@ async def sound_generation_astream(
 
 
 @auto_retry
-async def sound_generation_consumed(
-    aiterator = sound_generation_astream(
+async def sound_generation_consumed(params: SoundEffectsParams):
+    aiterator = sound_generation_astream(params=params)
     return [x async for x in aiterator]
```
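A sketch of driving the new async sound-effect helpers end to end. The `SoundEffectsParams` fields match the diff above; the output filename and the assumption that the streamed chunks are MP3 data are mine.

```python
import asyncio

from src.schemas import SoundEffectsParams
from src.tts import sound_generation_consumed


async def main():
    params = SoundEffectsParams(
        text="rain on a tin roof, distant thunder",
        duration_seconds=3.0,
        prompt_influence=0.5,
    )
    chunks = await sound_generation_consumed(params=params)
    # concatenate streamed chunks and write them out (output format assumed to be mp3)
    with open("rain_effect.mp3", "wb") as f:
        f.write(b"".join(chunks))


asyncio.run(main())
```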
src/utils.py
CHANGED
```diff
@@ -1,12 +1,20 @@
+import datetime
+import json
+import re
+import shutil
+import typing as t
+import wave
+from collections.abc import Sized
 from enum import StrEnum
+from pathlib import Path
 
+import pandas as pd
 from httpx import Timeout
 from langchain_openai import ChatOpenAI
-from tenacity import (
-    retry,
-    stop_after_attempt,
-    wait_random_exponential,
-)
+from pydub import AudioSegment
+from tenacity import retry, stop_after_attempt, wait_random_exponential
+
+from src.config import logger, VOICES_CSV_FP
 
 
 class GPTModels(StrEnum):
@@ -17,18 +25,148 @@ class GPTModels(StrEnum):
 
 def get_chat_llm(llm_model: GPTModels, temperature=0.0):
     llm = ChatOpenAI(
-        model=llm_model,
+        model=llm_model,
+        temperature=temperature,
+        timeout=Timeout(60, connect=4),
     )
     return llm
 
 
+def get_collection_safe_index(ix: int, collection: Sized):
+    res = min(ix, len(collection) - 1)
+    res = max(0, res)
+    return res
+
+
+def write_txt(txt: str, fp: str):
+    with open(fp, 'w', encoding='utf-8') as fout:
+        fout.write(txt)
+
+
+def write_json(data, fp: str, indent=2):
+    with open(fp, 'w', encoding='utf-8') as fout:
+        json.dump(data, fout, indent=indent, ensure_ascii=False)
+
+
+def rm_dir_conditional(dp: str, to_remove=True):
+    if not to_remove:
+        return
+    logger.info(f'removing dir: "{dp}"')
+    try:
+        shutil.rmtree(dp)
+    except Exception:
+        logger.exception(f'failed to remove dir')
+
+
+def get_utc_now_str():
+    now = datetime.datetime.now(tz=datetime.UTC)
+    now_str = now.strftime('%Y%m%d-%H%M%S')
+    return now_str
+
+
 async def consume_aiter(aiterator):
     return [x async for x in aiterator]
 
 
 def auto_retry(f):
     decorator = retry(
-        wait=wait_random_exponential(min=
-        stop=stop_after_attempt(
+        wait=wait_random_exponential(min=3, max=10),
+        stop=stop_after_attempt(20),
     )
     return decorator(f)
+
+
+def write_bytes(data: bytes, fp: str):
+    logger.info(f'saving to: "{fp}"')
+    with open(fp, "wb") as fout:
+        fout.write(data)
+
+
+def write_chunked_bytes(data: t.Iterable[bytes], fp: str):
+    logger.info(f'saving to: "{fp}"')
+    with open(fp, "wb") as fout:
+        for chunk in data:
+            if chunk:
+                fout.write(chunk)
+
+
+def write_raw_pcm_to_file(data: bytes, fp: str, n_channels: int, bytes_depth: int, sampling_rate):
+    logger.info(f'saving to: "{fp}"')
+    with wave.open(fp, "wb") as f:
+        f.setnchannels(n_channels)
+        f.setsampwidth(bytes_depth)
+        f.setframerate(sampling_rate)
+        f.writeframes(data)
+
+
+def get_audio_duration(filepath: str) -> float:
+    """
+    Returns the duration of the audio file in seconds.
+
+    :param filepath: Path to the audio file.
+    :return: Duration of the audio file in seconds.
+    """
+    audio = AudioSegment.from_file(filepath)
+    # Convert milliseconds to seconds
+    duration_in_seconds = len(audio) / 1000
+    return round(duration_in_seconds, 1)
+
+
+def normalize_audio(audio_segment: AudioSegment, target_dBFS: float = -20.0) -> AudioSegment:
+    """Normalize an audio segment to the target dBFS level."""
+    delta = target_dBFS - audio_segment.dBFS
+    res = audio_segment.apply_gain(delta)
+    return res
+
+
+def overlay_multiple_audio(
+    main_audio_fp: str,
+    audios_to_overlay_fps: list[str],
+    starts_sec: list[float],  # list of start positions, in seconds
+    out_fp: str,
+):
+    main_audio = AudioSegment.from_file(main_audio_fp)
+    for fp, cur_start_sec in zip(audios_to_overlay_fps, starts_sec):
+        audio_to_overlay = AudioSegment.from_file(fp)
+        # NOTE: quote from the documentation:
+        # "The result is always the same length as this AudioSegment"
+        # reference: https://github.com/jiaaro/pydub/blob/master/API.markdown#audiosegmentoverlay
+        # NOTE: `position` param is the offset time in milliseconds
+        start_ms = int(cur_start_sec * 1000)
+        main_audio = main_audio.overlay(audio_to_overlay, position=start_ms)
+
+    logger.info(f'saving overlayed audio to: "{out_fp}"')
+    main_audio.export(out_fp, format='wav')
+
+
+def get_audio_from_voice_id(voice_id: str) -> str:
+    voices_df = pd.read_csv(VOICES_CSV_FP)
+    data = voices_df[voices_df["voice_id"] == voice_id]["preview_url"].values[0]
+    return data
+
+
+def get_character_color(character: str) -> str:
+    if not character or character == "Unassigned":
+        return "#808080"
+    colors = [
+        "#FF6B6B",  # pale red
+        "#ed1262",  # magenta-red
+        "#ed2bac",  # magenta
+        "#892ed5",  # purple
+        "#4562f7",  # blue
+        "#11ab99",  # cyan
+        "#58f23a",  # green
+        # "#96CEB4",  # light green
+        # "#D4A5A5",  # light red
+    ]
+    hash_val = sum(ord(c) for c in character)
+    return colors[hash_val % len(colors)]
+
+
+def prettify_unknown_character_label(text):
+    return re.sub(r'\bc(\d+)\b', r'Character\1', text)
+
+
+def hex_to_rgb(hex_color):
+    hex_color = hex_color.lstrip('#')
+    return f"{int(hex_color[0:2], 16)},{int(hex_color[2:4], 16)},{int(hex_color[4:6], 16)}"
```
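For reference, the new clamping helper behaves like this; it is what keeps the approximate sound-effect indices from running past the end of the timestamp collections:

```python
from src.utils import get_collection_safe_index

timestamps = [0.0, 0.4, 0.9]
print(get_collection_safe_index(ix=5, collection=timestamps))   # 2 (clamped to the last valid index)
print(get_collection_safe_index(ix=-3, collection=timestamps))  # 0 (clamped to the first index)
```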
src/web/constructor.py
ADDED
@@ -0,0 +1,45 @@
```python
from src.web.utils import create_status_html


class HTMLGenerator:
    @staticmethod
    def generate_error(text: str) -> str:
        return create_status_html("Error", [], error_text=text)

    @staticmethod
    def generate_status(stage_title: str, steps: list[tuple[str, bool]]) -> str:
        return create_status_html(stage_title, steps) + "</div>"

    @staticmethod
    def generate_text_split(text_split_html: str) -> str:
        return f'''
        <div class="section" style="background-color: #31395294; padding: 1rem; border-radius: 8px; margin-top: 1rem; color: #e0e0e0;">
            <h3 style="color: rgb(224, 224, 224); font-size: 1.5em; margin-bottom: 1rem;">Text Split by Character:</h3>
            {text_split_html}
        </div>
        '''

    @staticmethod
    def generate_voice_assignments(voice_assignments_html: str) -> str:
        return f'''
        <div class="section" style="background-color: #31395294; padding: 1rem; border-radius: 8px; margin-top: 1rem; color: #e0e0e0;">
            <h3 style="color: rgb(224, 224, 224); font-size: 1.5em; margin-bottom: 1rem;">Voice Assignments:</h3>
            {voice_assignments_html}
        </div>
        '''

    @staticmethod
    def generate_message_without_voice_id() -> str:
        return '''
        <div class="audiobook-ready" style="background-color: #31395294; padding: 1rem; border-radius: 8px; margin-top: 1rem; text-align: center;">
            <h3 style="color: rgb(224, 224, 224); font-size: 1.5em; margin-bottom: 1rem;">🫤 At first you should add your voice</h3>
        </div>
        '''

    @staticmethod
    def generate_final_message() -> str:
        return '''
        <div class="audiobook-ready" style="background-color: #31395294; padding: 1rem; border-radius: 8px; margin-top: 1rem; text-align: center;">
            <h3 style="color: rgb(224, 224, 224); font-size: 1.5em; margin-bottom: 1rem;">🎉 Your audiobook is ready!</h3>
        </div>
        '''
```
src/web/utils.py
ADDED
@@ -0,0 +1,345 @@
```python
from src.sound_effects_design import SoundEffectDescription
from src.text_split_chain import CharacterPhrase
from src.utils import (
    get_audio_from_voice_id,
    get_character_color,
    get_collection_safe_index,
    hex_to_rgb,
    prettify_unknown_character_label,
)
from src.web.variables import EFFECT_CSS


def create_status_html(status: str, steps: list[tuple[str, bool]], error_text: str = '') -> str:
    # CSS for the spinner animation
    spinner_css = """
    @keyframes spin {
        0% { transform: rotate(0deg); }
        100% { transform: rotate(360deg); }
    }
    .spinner {
        width: 20px;
        height: 20px;
        border: 3px solid #e0e0e0;
        border-top: 3px solid #3498db;
        border-radius: 50%;
        animation: spin 1s linear infinite;
        display: inline-block;
    }
    """

    steps_html = "\n".join(
        [
            f'<div class="step-item" style="display: flex; align-items: center; padding: 0.8rem; margin-bottom: 0.5rem; background-color: #31395294; border-radius: 6px; font-weight: 600;">'
            f'<span class="step-icon" style="margin-right: 1rem; font-size: 1.3rem;">'
            f'{"✅" if completed else "<div class='spinner'></div>"}'
            f'</span>'
            f'<span class="step-text" style="font-size: 1.1rem; color: #e0e0e0;">{step}</span>'
            f'</div>'
            for step, completed in steps
        ]
    )

    # status_description = '<p class="status-description" style="margin: 0.5rem 0 0 0; color: #c0c0c0; font-size: 1rem; font-weight: 400;">Processing steps below.</p>'
    status_description = ''

    if error_text:
        error_html = f'<div class="error-message" style="color: #e53e3e; font-size: 1.2em;">{error_text}</div></div>'
    else:
        error_html = ''

    return f'''
    <div class="status-container" style="font-family: system-ui; max-width: 1472px; margin: 0 auto; background-color: #31395294; padding: 1rem; border-radius: 8px; color: #f0f0f0;">
        <style>{spinner_css}</style>
        <div class="status-header" style="background: #31395294; padding: 1rem; border-radius: 8px; font-weight: bold;">
            <h3 class="status-title" style="margin: 0; color: rgb(224, 224, 224); font-size: 1.5rem; font-weight: 700;">Status: {status}</h3>
            {status_description}
            {error_html}
        </div>
        <div class="steps" style="margin-top: 1rem;">
            {steps_html}
        </div>
    </div>
    '''


def create_effect_span_prefix_postfix(effect_description: str):
    """Create an HTML span with effect tooltip."""
    # NOTE: it's important not to use a multiline python string in order not to add whitespaces
    prefix = (
        '<span class="character-segment">'
        '<span class="effect-container">'
        '<span class="effect-text">'
    )

    postfix = (
        '</span>'
        f'<span class="effect-tooltip">Effect: {effect_description}</span>'
        '</span>'
        '</span>'
    )

    return prefix, postfix


def create_effect_span(text: str, effect_description: str) -> str:
    prefix, postfix = create_effect_span_prefix_postfix(effect_description=effect_description)
    res = f"{prefix}{text}{postfix}"
    return res


def create_regular_span(text: str, bg_color: str) -> str:
    """Create a regular HTML span with background color."""
    return f'<span class="character-segment" style="background-color: {bg_color}">{text}</span>'


def _generate_legend_for_text_split_html(
    character_phrases: list[CharacterPhrase], add_effect_legend: bool = False
) -> str:
    html = (
        "<div style='margin-bottom: 1rem;'>"
        "<div style='font-size: 1.35em; font-weight: bold;'>Legend:</div>"
    )

    unique_characters = set(phrase.character or 'Unassigned' for phrase in character_phrases)
    characters_sorted = sorted(unique_characters, key=lambda c: c.lower())

    for character in characters_sorted:
        color = get_character_color(character)
        html += f"<div style='color: {color}; font-size: 1.1em; margin-bottom: 0.25rem;'>{character}</div>"

    if add_effect_legend:
        html += (
            '<div style="font-size: 1.1em; margin-bottom: 0.25rem;">'
            '<span class="effect-text">🎵 #1</span>'
            ' - sound effect start position (hover to see the prompt)'
            '</div>'
        )

    html += "</div>"
    return html


def _generate_text_split_html(
    character_phrases: list[CharacterPhrase],
) -> tuple[str, dict[int, int]]:
    html_items = ["<div style='font-size: 1.2em; line-height: 1.6;'>"]

    index_mapping = {}  # Mapping from original index to HTML index
    orig_index = 0  # Index in the original text
    html_index = len(html_items[0])  # Index in the HTML output

    for phrase in character_phrases:
        character = phrase.character or 'Unassigned'
        text = phrase.text
        color = get_character_color(character)
        rgba_color = f"rgba({hex_to_rgb(color)}, 0.5)"

        prefix = f"<span style='background-color: {rgba_color}; border-radius: 0.2em;'>"
        suffix = '</span>'

        # Append the HTML for this phrase
        html_items.append(f"{prefix}{text}{suffix}")

        # Map each character index from the original text to the HTML text
        html_index += len(prefix)
        for i in range(len(text)):
            index_mapping[orig_index + i] = html_index + i
        # Update indices
        orig_index += len(text)
        html_index += len(text) + len(suffix)

    html_items.append("</div>")

    html = ''.join(html_items)
    return html, index_mapping


def generate_text_split_inner_html_no_effect(character_phrases: list[CharacterPhrase]) -> str:
    legend_html = _generate_legend_for_text_split_html(
        character_phrases=character_phrases, add_effect_legend=False
    )
    text_split_html, char_ix_orig_2_html = _generate_text_split_html(
        character_phrases=character_phrases
    )
    return legend_html + text_split_html


def generate_text_split_inner_html_with_effects(
    character_phrases: list[CharacterPhrase],
    sound_effects_descriptions: list[SoundEffectDescription],
) -> str:
    legend_html = _generate_legend_for_text_split_html(
        character_phrases=character_phrases, add_effect_legend=True
    )
    text_split_html, char_ix_orig_2_html = _generate_text_split_html(
        character_phrases=character_phrases
    )

    if not sound_effects_descriptions:
        return legend_html + text_split_html

    prev_end = 0
    content_html_parts = []
    for ix, sed in enumerate(sound_effects_descriptions, start=1):
        # NOTE: 'sed' contains approximate indices from the original text.
        # that's why we use safe conversion before accessing char mapping
        ix_start = get_collection_safe_index(
            ix=sed.ix_start_orig_text, collection=char_ix_orig_2_html
        )
        # ix_end = get_collection_safe_index(ix=sed.ix_end_orig_text, collection=char_ix_orig_2_html)

        html_start_ix = char_ix_orig_2_html[ix_start]
        # html_end_ix = char_ix_orig_2_html[ix_end]  # NOTE: this is incorrect
        # BUG: here we take exact same number of characters as in text between sound effect tags.
        # This introduces the bug: HTML text could be included in 'text_under_effect',
        # due to inaccuracies in 'sed' indices.
        # html_end_ix = html_start_ix + ix_end - ix_start  # NOTE: this is correct
        # NOTE: reason is that html may exist between original text characters

        prefix = text_split_html[prev_end:html_start_ix]
        if prefix:
            content_html_parts.append(prefix)

        # text_under_effect = text_split_html[html_start_ix:html_end_ix]
        text_under_effect = f'🎵 #{ix}'
        if text_under_effect:
            effect_prefix, effect_postfix = create_effect_span_prefix_postfix(
                effect_description=sed.prompt
            )
            text_under_effect_wrapped = f'{effect_prefix}{text_under_effect}{effect_postfix}'
            content_html_parts.append(text_under_effect_wrapped)

        # prev_end = html_end_ix
        prev_end = html_start_ix

    last = text_split_html[prev_end:]
    if last:
        content_html_parts.append(last)

    content_html = ''.join(content_html_parts)
    content_html = f'{EFFECT_CSS}<div class="text-effect-container">{content_html}</div>'
    html = legend_html + content_html
    return html


def generate_voice_mapping_inner_html(select_voice_chain_out):
    character2props = {}
    html = AUDIO_PLAYER_CSS

    for key in set(select_voice_chain_out.character2props) | set(
        select_voice_chain_out.character2voice
    ):
        character_props = select_voice_chain_out.character2props.get(key, []).model_dump()
        character_props["voice_id"] = select_voice_chain_out.character2voice.get(key, [])
        character_props["sample_audio_url"] = get_audio_from_voice_id(character_props["voice_id"])

        character2props[prettify_unknown_character_label(key)] = character_props

    for character, voice_properties in sorted(character2props.items(), key=lambda x: x[0].lower()):
        color = get_character_color(character)
        audio_url = voice_properties.get('sample_audio_url', '')

        html += f'''
        <div class="voice-assignment">
            <div class="voice-details">
                <span class="character-name" style="color: {color};">{character}</span>
                <span>→</span>
                <span class="voice-props">
                    Gender: {voice_properties.get('gender', 'N/A')},
                    Age: {voice_properties.get('age_group', 'N/A')},
                    Voice ID: {voice_properties.get('voice_id', 'N/A')}
                </span>
            </div>
            <div class="custom-audio-player">
                <audio controls preload="none">
                    <source src="{audio_url}" type="audio/mpeg">
                    Your browser does not support the audio element.
                </audio>
            </div>
        </div>
        '''

    return html


AUDIO_PLAYER_CSS = """\
<style>
.custom-audio-player {
    display: inline-block;
    width: 250px;
    --bg-color: #ff79c6;
    --highlight-color: #4299e100;
    --text-color: #e0e0e0;
    --border-radius: 0px;
}

.custom-audio-player audio {
    width: 100%;
    height: 36px;
    border-radius: var(--border-radius);
    background-color: #3f2a2a00;
    outline: none;
}

.custom-audio-player audio::-webkit-media-controls-panel {
    background-color: var(--bg-color);
}

.custom-audio-player audio::-webkit-media-controls-current-time-display,
.custom-audio-player audio::-webkit-media-controls-time-remaining-display {
    color: var(--text-color);
}

.custom-audio-player audio::-webkit-media-controls-play-button {
    background-color: var(--highlight-color);
    border-radius: 50%;
    height: 30px;
    width: 30px;
}

.custom-audio-player audio::-webkit-media-controls-timeline {
    background-color: var(--bg-color);
    height: 6px;
    border-radius: 3px;
}

/* Container styles for voice assignment display */
.voice-assignment {
    background-color: rgba(49, 57, 82, 0.8);
    padding: 1rem;
    padding-left: 1rem;
    padding-right: 1rem;
    padding-top: 0.2rem;
    padding-bottom: 0.2rem;
    border-radius: var(--border-radius);
    margin-top: 0.5rem;
    color: var(--text-color);
    display: flex;
    align-items: center;
    justify-content: space-between;
    flex-wrap: wrap;
    gap: 1rem;
    border-radius: 7px;
}

.voice-assignment span {
    font-weight: 600;
}

.voice-details {
    display: flex;
    align-items: center;
    gap: 0.5rem;
}

.character-name {
    color: var(--highlight-color);
    font-weight: bold;
}

.voice-props {
    color: #4a5568;
}
</style>
"""
```
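The span helpers above compose plain strings, so they are easy to try in isolation; a quick sketch of what they emit (sample arguments are made up):

```python
from src.web.utils import create_effect_span, create_regular_span

# A wrapped effect marker with its tooltip, and a plain character-colored segment.
print(create_effect_span("🎵 #1", "creaking door"))
print(create_regular_span("He opened the door.", "rgba(255, 107, 107, 0.5)"))
```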
src/web/variables.py
ADDED
@@ -0,0 +1,517 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from src.config import ELEVENLABS_API_KEY
|
2 |
+
|
3 |
+
DESCRIPTION_JS = """function createGradioAnimation() {
|
4 |
+
// Create main container
|
5 |
+
var container = document.createElement('div');
|
6 |
+
container.id = 'gradio-animation';
|
7 |
+
container.style.padding = '2rem';
|
8 |
+
container.style.background = 'transparent';
|
9 |
+
container.style.borderRadius = '12px';
|
10 |
+
container.style.margin = '0 0 2rem 0';
|
11 |
+
container.style.maxWidth = '100%';
|
12 |
+
container.style.transition = 'all 0.3s ease';
|
13 |
+
|
14 |
+
// Create header section
|
15 |
+
var header = document.createElement('div');
|
16 |
+
header.style.textAlign = 'center';
|
17 |
+
header.style.marginBottom = '2rem';
|
18 |
+
container.appendChild(header);
|
19 |
+
|
20 |
+
// Title with spaces
|
21 |
+
var titleText = 'AI Audio Books ππ¨βπ»π§';
|
22 |
+
var title = document.createElement('h1');
|
23 |
+
title.style.fontSize = '2.5rem';
|
24 |
+
title.style.fontWeight = '700';
|
25 |
+
title.style.color = '#f1f1f1';
|
26 |
+
title.style.marginBottom = '1.5rem';
|
27 |
+
title.style.opacity = '0'; // Start with opacity 0
|
28 |
+
title.style.transition = 'opacity 0.5s ease'; // Add transition
|
29 |
+
title.innerText = titleText;
|
30 |
+
header.appendChild(title);
|
31 |
+
|
32 |
+
// Add description
|
33 |
+
var description = document.createElement('p');
|
34 |
+
description.innerHTML = `
|
35 |
+
<div style="font-size: 1.1rem; color: #c0c0c0; margin-bottom: 2rem; line-height: 1.6;">
|
36 |
+
Create an audiobook from the input text automatically, using Gen-AI!<br>
|
37 |
+
All you need to do - is to input the book text or select it from the provided Sample Inputs.
|
38 |
+
</div>
|
39 |
+
`;
|
40 |
+
description.style.opacity = '0';
|
41 |
+
description.style.transition = 'opacity 0.5s ease';
|
42 |
+
header.appendChild(description);
|
43 |
+
|
44 |
+
// Create process section
|
45 |
+
var processSection = document.createElement('div');
|
46 |
+
processSection.style.backgroundColor = 'rgba(255, 255, 255, 0.05)';
|
47 |
+
processSection.style.padding = '1.5rem';
|
48 |
+
processSection.style.borderRadius = '8px';
|
49 |
+
processSection.style.marginTop = '1rem';
|
50 |
+
container.appendChild(processSection);
|
51 |
+
|
52 |
+
// Add "AI will do the rest:" header
|
53 |
+
var processHeader = document.createElement('div');
|
54 |
+
processHeader.style.fontSize = '1.2rem';
|
55 |
+
processHeader.style.fontWeight = '600';
|
56 |
+
processHeader.style.color = '#e0e0e0';
|
57 |
+
processHeader.style.marginBottom = '1rem';
|
58 |
+
processHeader.innerHTML = 'AI will do the rest:';
|
59 |
+
processHeader.style.opacity = '0';
|
60 |
+
processHeader.style.transition = 'opacity 0.5s ease';
|
61 |
+
processSection.appendChild(processHeader);
|
62 |
+
|
63 |
+
// Define steps with icons
|
64 |
+
var steps = [
|
65 |
+
{ text: 'Split text into characters', icon: 'π' },
|
66 |
+
{ text: 'Select voice for each character', icon: 'π' },
|
67 |
+
{ text: 'Enhance text to convey emotions and intonations during Text-to-Speech', icon: 'π' },
|
68 |
+
{ text: 'Generate audiobook using Text-to-Speech model', icon: 'π§' },
|
69 |
+
{ text: 'Generate sound effects to create immersive atmosphere (optional)', icon: 'π΅' },
|
70 |
+
{ text: 'Clone your voice to generate the audiobook (optional)', icon: 'π₯' },
|
71 |
+
];
|
72 |
+
|
73 |
+
// Create steps list
|
74 |
+
var stepsList = document.createElement('div');
|
75 |
+
stepsList.style.opacity = '0';
|
76 |
+
stepsList.style.transition = 'opacity 0.5s ease';
|
77 |
+
processSection.appendChild(stepsList);
|
78 |
+
|
79 |
+
steps.forEach(function(step, index) {
|
80 |
+
var stepElement = document.createElement('div');
|
81 |
+
stepElement.style.display = 'flex';
|
82 |
+
stepElement.style.alignItems = 'center';
|
83 |
+
stepElement.style.padding = '0.8rem';
|
84 |
+
stepElement.style.marginBottom = '0.5rem';
|
85 |
+
stepElement.style.backgroundColor = 'rgba(255, 255, 255, 0.03)';
|
86 |
+
stepElement.style.borderRadius = '6px';
|
87 |
+
stepElement.style.transform = 'translateX(-20px)';
|
88 |
+
stepElement.style.opacity = '0';
|
89 |
+
stepElement.style.transition = 'all 0.3s ease';
|
90 |
+
|
91 |
+
// Add hover effect
|
92 |
+
stepElement.onmouseover = function() {
|
93 |
+
this.style.backgroundColor = 'rgba(255, 255, 255, 0.07)';
|
94 |
+
};
|
95 |
+
stepElement.onmouseout = function() {
|
96 |
+
this.style.backgroundColor = 'rgba(255, 255, 255, 0.03)';
|
97 |
+
};
|
98 |
+
|
99 |
+
var icon = document.createElement('span');
|
100 |
+
icon.style.marginRight = '1rem';
|
101 |
+
icon.style.fontSize = '1.2rem';
|
102 |
+
icon.innerText = step.icon;
|
103 |
+
stepElement.appendChild(icon);
|
104 |
+
|
105 |
+
var text = document.createElement('span');
|
106 |
+
text.style.color = '#c0c0c0';
|
107 |
+
text.style.fontSize = '1rem';
|
108 |
+
text.innerText = step.text;
|
109 |
+
stepElement.appendChild(text);
|
110 |
+
|
111 |
+
stepsList.appendChild(stepElement);
|
112 |
+
});
|
113 |
+
|
114 |
+
// Insert into Gradio container
|
115 |
+
var gradioContainer = document.querySelector('.gradio-container');
|
116 |
+
gradioContainer.insertBefore(container, gradioContainer.firstChild);
|
117 |
+
|
118 |
+
// New timing for animations
|
119 |
+
setTimeout(function() {
|
120 |
+
title.style.opacity = '1';
|
121 |
+
}, 250);
|
122 |
+
|
123 |
+
// Show description after 1 second
|
124 |
+
setTimeout(function() {
|
125 |
+
description.style.opacity = '1';
|
126 |
+
processHeader.style.opacity = '1';
|
127 |
+
}, 700);
|
128 |
+
|
129 |
+
// Show steps after 2 seconds
|
130 |
+
setTimeout(function() {
|
131 |
+
stepsList.style.opacity = '1';
|
132 |
+
stepsList.querySelectorAll('div').forEach(function(step, index) {
|
133 |
+
setTimeout(function() {
|
134 |
+
step.style.transform = 'translateX(0)';
|
135 |
+
step.style.opacity = '1';
|
136 |
+
}, index * 100);
|
137 |
+
});
|
138 |
+
}, 1100);
|
139 |
+
|
140 |
+
async function playAudio(url) {
|
141 |
+
try {
|
142 |
+
const audio = new Audio(url);
|
143 |
+
await audio.play();
|
144 |
+
} catch (error) {
|
145 |
+
console.error('Error playing audio:', error);
|
146 |
+
}
|
147 |
+
}
|
148 |
+
|
149 |
+
// Add click handler to all audio links
|
150 |
+
document.addEventListener('click', function(e) {
|
151 |
+
if (e.target.classList.contains('audio-link')) {
|
152 |
+
e.preventDefault();
|
153 |
+
playAudio(e.target.getAttribute('data-audio-url'));
|
154 |
+
}
|
155 |
+
});
|
156 |
+
|
157 |
+
return 'Animation created';
|
158 |
+
}"""
|
159 |
+
|
160 |
+
STATUS_DISPLAY_HTML = '''
|
161 |
+
<style>
|
162 |
+
.status-container {
|
163 |
+
font-family: system-ui;
|
164 |
+
max-width: 1472;
|
165 |
+
margin: 0 auto;
|
166 |
+
background-color: #31395294; /* Darker background color */
|
167 |
+
padding: 1rem;
|
168 |
+
border-radius: 8px;
|
169 |
+
color: #f0f0f0; /* Light text color */
|
170 |
+
}
|
171 |
+
.status-header {
|
172 |
+
background: #31395294; /* Slightly lighter background */
|
173 |
+
padding: 1rem;
|
174 |
+
border-radius: 8px;
|
175 |
+
font-weight: bold; /* Emphasize header */
|
176 |
+
}
|
177 |
+
.status-title {
|
178 |
+
margin: 0;
|
179 |
+
color: rgb(224, 224, 224); /* White color for title */
|
180 |
+
font-size: 1.5rem; /* Larger title font */
|
181 |
+
font-weight: 700; /* Bold title */
|
182 |
+
}
|
183 |
+
.status-description {
|
184 |
+
margin: 0.5rem 0 0 0;
|
185 |
+
color: #c0c0c0;
|
186 |
+
font-size: 1rem;
|
187 |
+
font-weight: 400; /* Regular weight for description */
|
188 |
+
}
|
189 |
+
.steps {
|
190 |
+
margin-top: 1rem;
|
191 |
+
}
|
192 |
+
.step-item {
|
193 |
+
display: flex;
|
194 |
+
align-items: center;
|
195 |
+
padding: 0.8rem;
|
196 |
+
margin-bottom: 0.5rem;
|
197 |
+
background-color: #31395294; /* Matching background color */
|
198 |
+
border-radius: 6px;
|
199 |
+
color: #f0f0f0; /* Light text color */
|
200 |
+
font-weight: 600; /* Medium weight for steps */
|
201 |
+
}
|
202 |
+
.step-item:hover {
|
203 |
+
background-color: rgba(255, 255, 255, 0.07);
|
204 |
+
}
|
205 |
+
.step-icon {
|
206 |
+
margin-right: 1rem;
|
207 |
+
font-size: 1.3rem; /* Slightly larger icon size */
|
208 |
+
}
|
209 |
+
.step-text {
|
210 |
+
font-size: 1.1rem; /* Larger text for step description */
|
211 |
+
color: #e0e0e0; /* Lighter text for better readability */
|
212 |
+
}
|
213 |
+
</style>
|
214 |
+
|
215 |
+
<div class="status-container">
|
216 |
+
<div class="status-header">
|
217 |
+
<h2 class="status-title">Status: Waiting to Start</h2>
|
218 |
+
<p class="status-description">Enter text or upload a file to begin.</p>
|
219 |
+
</div>
|
220 |
+
</div>
|
221 |
+
'''
|
222 |
+
GRADIO_THEME = "freddyaboulton/dracula_revamped"

VOICE_UPLOAD_JS = f"""
async function createVoiceUploadPopup() {{
    try {{
        let savedVoiceId = null;
        const result = await new Promise((resolve, reject) => {{
            // Create overlay with soft animation
            const overlay = document.createElement('div');
            Object.assign(overlay.style, {{
                position: 'fixed',
                top: '0',
                left: '0',
                width: '100%',
                height: '100%',
                backgroundColor: 'rgba(0, 0, 0, 0.8)',
                display: 'flex',
                justifyContent: 'center',
                alignItems: 'center',
                zIndex: '1000',
                opacity: '0',
                transition: 'opacity 0.3s ease-in-out'
            }});

            overlay.offsetHeight; // Trigger reflow for transition
            overlay.style.opacity = '1';

            // Create popup container with modern design
            const popup = document.createElement('div');
            Object.assign(popup.style, {{
                backgroundColor: '#3b4c63',
                padding: '30px',
                borderRadius: '12px',
                width: '450px',
                maxWidth: '95%',
                position: 'relative',
                boxShadow: '0 10px 25px rgba(0, 0, 0, 0.3)',
                transform: 'scale(0.9)',
                transition: 'transform 0.3s ease-out',
                display: 'flex',
                flexDirection: 'column',
                alignItems: 'center'
            }});

            popup.offsetHeight; // Trigger reflow
            popup.style.transform = 'scale(1)';

            // Create close button
            const closeBtn = document.createElement('button');
            Object.assign(closeBtn.style, {{
                position: 'absolute',
                right: '15px',
                top: '15px',
                border: 'none',
                background: 'none',
                fontSize: '24px',
                cursor: 'pointer',
                color: '#d3d3d3',
                transition: 'color 0.2s ease'
            }});
            closeBtn.innerHTML = '×';
            closeBtn.onmouseover = () => closeBtn.style.color = '#ffffff';
            closeBtn.onmouseout = () => closeBtn.style.color = '#d3d3d3';

            // Create content
            const content = document.createElement('div');
            content.innerHTML = `
                <div style="text-align: center; margin-bottom: 25px;">
                    <h2 style="color: #ffffff; margin: 0; font-size: 22px;">Upload Voice Sample</h2>
                    <p style="color: #b0b0b0; margin-top: 10px; font-size: 14px;">
                        Select an audio file to create an audiobook with your unique voice.
                    </p>
                </div>
                <div style="margin-bottom: 20px; display: flex; flex-direction: column; align-items: center; width: 100%;">
                    <label for="voiceFile" style="
                        display: block;
                        margin-bottom: 10px;
                        color: #c0c0c0;
                        font-weight: 600;
                        text-align: center;">
                        Choose Audio File (MP3, WAV, OGG):
                    </label>
                    <input type="file" id="voiceFile" accept="audio/*"
                        style="
                            width: 100%;
                            padding: 12px;
                            border: 2px dashed #4a6f91;
                            border-radius: 8px;
                            background-color: #2a3a50;
                            color: #ffffff;
                            text-align: center;
                            transition: border-color 0.3s ease;
                        ">
                </div>
                <div id="uploadStatus" style="
                    margin-bottom: 15px;
                    text-align: center;
                    min-height: 25px;
                    color: #d3d3d3;">
                </div>
                <button id="uploadBtn" style="
                    background-color: #4a6f91;
                    color: #ffffff;
                    padding: 12px 20px;
                    border: none;
                    border-radius: 8px;
                    cursor: pointer;
                    width: 100%;
                    font-weight: 600;
                    transition: background-color 0.3s ease, transform 0.1s ease;
                    ">
                    Upload Voice
                </button>
            `;

            // Add elements to DOM
            popup.appendChild(closeBtn);
            popup.appendChild(content);
            overlay.appendChild(popup);
            document.body.appendChild(overlay);

            // Button effects
            const uploadBtn = popup.querySelector('#uploadBtn');
            uploadBtn.onmouseover = () => uploadBtn.style.backgroundColor = '#3b5c77';
            uploadBtn.onmouseout = () => uploadBtn.style.backgroundColor = '#4a6f91';
            uploadBtn.onmousedown = () => uploadBtn.style.transform = 'scale(0.98)';
            uploadBtn.onmouseup = () => uploadBtn.style.transform = 'scale(1)';

            // Handle close
            const handleClose = () => {{
                overlay.style.opacity = '0';
                setTimeout(() => {{
                    overlay.remove();
                    resolve(savedVoiceId);
                }}, 300);
            }};

            closeBtn.onclick = handleClose;
            overlay.onclick = (e) => {{
                if (e.target === overlay) {{
                    handleClose();
                }}
            }};

            // Handle file upload
            const statusDiv = popup.querySelector('#uploadStatus');
            const fileInput = popup.querySelector('#voiceFile');

            uploadBtn.onclick = async () => {{
                const file = fileInput.files[0];
                if (!file) {{
                    statusDiv.textContent = 'Please select a file first.';
                    statusDiv.style.color = '#e74c3c';
                    return;
                }}

                const API_KEY = "{ELEVENLABS_API_KEY}";

                statusDiv.textContent = 'Uploading...';
                statusDiv.style.color = '#4a6f91';
                uploadBtn.disabled = true;
                uploadBtn.style.backgroundColor = '#6c8091';

                const formData = new FormData();
                formData.append('files', file);
                formData.append('name', `voice_${{Date.now()}}`);

                try {{
                    const response = await fetch('https://api.elevenlabs.io/v1/voices/add', {{
                        method: 'POST',
                        headers: {{
                            'Accept': 'application/json',
                            'xi-api-key': API_KEY
                        }},
                        body: formData
                    }});

                    const result = await response.json();

                    if (response.ok) {{
                        savedVoiceId = result.voice_id;
                        statusDiv.innerHTML = `
                            <div style="
                                background-color: #2e3e50;
                                color: #00b894;
                                padding: 10px;
                                border-radius: 6px;
                                font-weight: 600;
                                ">
                                Voice uploaded successfully!
                                <br>Your Voice ID: <span style="color: #0984e3;">${{result.voice_id}}</span>
                            </div>
                        `;

                        // Update the visible HTML panel
                        const voiceIdPanel = document.querySelector('#voice_id_panel');
                        if (voiceIdPanel) {{
                            voiceIdPanel.innerHTML = `<strong>Your voice_id from uploaded audio is </strong> <span style="color: #0984e3;">${{result.voice_id}}</span>`;
                        }}

                        setTimeout(() => {{
                            overlay.style.opacity = '0';
                            setTimeout(() => {{
                                overlay.remove();
                                resolve(result.voice_id); // Resolve with the voice ID
                            }}, 300);
                        }}, 3000);
                    }} else {{
                        throw new Error(result.detail?.message || 'Upload failed');
                    }}
                }} catch (error) {{
                    statusDiv.innerHTML = `
                        <div style="
                            background-color: #3b4c63;
                            color: #d63031;
                            padding: 10px;
                            border-radius: 6px;
                            font-weight: 600;
                            ">
                            Error: ${{error.message}}
                        </div>
                    `;
                    uploadBtn.disabled = false;
                    uploadBtn.style.backgroundColor = '#4a6f91';
                }}
            }};
        }});

        return result; // Return the voice ID from the Promise
    }} catch (error) {{
        console.error('Error in createVoiceUploadPopup:', error);
        return null;
    }}
}}
"""

EFFECT_CSS = """\
<style>
.text-effect-container {
    line-height: 1.6;
}

.character-segment {
    border-radius: 0.2em;
}

.effect-container {
    position: relative;
    display: inline-block;
}

.effect-text {
    border-radius: 13px;
    border: 2px solid rgba(251, 224, 5, 0.91);
    cursor: help;
    color: rgba(53, 53, 53, 0.97) !important;
    background-color: #ffffffd9;
    font-size: 0.9em;
    padding-left: 0.3em !important;
    padding-right: 0.3em !important;
}

.effect-tooltip {
    visibility: hidden;
    background-color: #333;
    color: white;
    text-align: center;
    padding: 5px 10px;
    border-radius: 6px;
    position: absolute;
    z-index: 1;
    bottom: 125%;
    left: 50%;
    transform: translateX(-50%);
    white-space: nowrap;
    opacity: 0;
    transition: opacity 0.3s;
}

.effect-tooltip::after {
    content: "";
    position: absolute;
    top: 100%;
    left: 50%;
    margin-left: -5px;
    border-width: 5px;
    border-style: solid;
    border-color: #333 transparent transparent transparent;
}

.effect-container:hover .effect-tooltip {
    visibility: visible;
    opacity: 1;
}
</style>
"""
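# Usage sketch (an assumption): EFFECT_CSS styles HTML that is built elsewhere in the
# app; the markup below only illustrates the class nesting these rules expect — a
# sound-effect span with a hover tooltip inside a highlighted character segment.
#
#     import gradio as gr
#
#     from src.web.variables import EFFECT_CSS
#
#     demo_html = EFFECT_CSS + """
#     <div class="text-effect-container">
#         <span class="character-segment" style="background-color: #cfe8ff;">
#             The old door
#             <span class="effect-container">
#                 <span class="effect-text">creaked open</span>
#                 <span class="effect-tooltip">sound effect: creaking wooden door</span>
#             </span>
#             and the hall fell silent.
#         </span>
#     </div>
#     """
#
#     with gr.Blocks() as demo:
#         gr.HTML(demo_html)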