gabrielchua committed
Commit f136260
Parent: 9f31d5a

remove Parler-TTS Mini

Files changed (2):
  1. app.py +9 -25
  2. utils.py +16 -46
app.py CHANGED
@@ -25,7 +25,7 @@ from utils import generate_script, generate_audio, parse_url
 class DialogueItem(BaseModel):
     """A single dialogue item."""
 
-    speaker: Literal["Host (Jenna)", "Guest"]
+    speaker: Literal["Host (Jane)", "Guest"]
     text: str
 
 
@@ -41,7 +41,6 @@ def generate_podcast(
     files: List[str],
     url: Optional[str],
     tone: Optional[str],
-    voice: Optional[str],
     length: Optional[str],
     language: str
 ) -> Tuple[str, str]:
@@ -58,12 +57,6 @@ def generate_podcast(
         "Korean": "KR",
     }
 
-    # Change voice to the appropriate code
-    voice_mapping = {
-        "Male": "Gary",
-        "Female": "Laura",
-    }
-
     # Check if at least one input is provided
     if not files and not url:
         raise gr.Error("Please provide at least one PDF file or a URL.")
@@ -116,16 +109,16 @@ def generate_podcast(
     total_characters = 0
 
     for line in llm_output.dialogue:
-        logger.info(f"Generating audio for {line.speaker}, {language} and {voice}: {line.text}")
-        if line.speaker == "Host (Jenna)":
-            speaker = f"**Jenna**: {line.text}"
+        logger.info(f"Generating audio for {line.speaker}: {line.text}")
+        if line.speaker == "Host (Jane)":
+            speaker = f"**Jane**: {line.text}"
         else:
             speaker = f"**{llm_output.name_of_guest}**: {line.text}"
         transcript += speaker + "\n\n"
         total_characters += len(line.text)
 
         # Get audio file path
-        audio_file_path = generate_audio(line.text, line.speaker, language_mapping[language], voice_mapping[voice])
+        audio_file_path = generate_audio(line.text, line.speaker, language_mapping[language])
         # Read the audio file into an AudioSegment
         audio_segment = AudioSegment.from_file(audio_file_path)
         audio_segments.append(audio_segment)
@@ -173,20 +166,15 @@ demo = gr.Interface(
             label="3. 🎭 Choose the tone",
             value="Fun"
         ),
-        gr.Radio(
-            choices=["Male", "Female"],
-            label="4. 🎭 Choose the guest's voice",
-            value="Female"
-        ),
         gr.Radio(
             choices=["Short (1-2 min)", "Medium (3-5 min)"],
-            label="5. ⏱️ Choose the length",
+            label="4. ⏱️ Choose the length",
             value="Medium (3-5 min)"
         ),
         gr.Dropdown(
             choices=["English", "Spanish", "French", "Chinese", "Japanese", "Korean"],
             value="English",
-            label="6. 🌐 Choose the language (Highly experimental, English is recommended)",
+            label="5. 🌐 Choose the language (Highly experimental, English is recommended)",
         ),
     ],
     outputs=[
@@ -202,15 +190,13 @@ demo = gr.Interface(
            [str(Path("examples/1310.4546v1.pdf"))],
             "",
             "Fun",
-            "Male",
-            "Medium (3-5 min)",
+            "Short (1-2 min)",
             "English"
         ],
         [
             [],
             "https://en.wikipedia.org/wiki/Hugging_Face",
             "Fun",
-            "Male",
             "Short (1-2 min)",
             "English"
         ],
@@ -218,14 +204,12 @@ demo = gr.Interface(
             [],
             "https://simple.wikipedia.org/wiki/Taylor_Swift",
             "Fun",
-            "Female",
             "Short (1-2 min)",
             "English"
         ],
     ],
     cache_examples=True,
-    examples_cache_dir="examples_cached"
 )
 
 if __name__ == "__main__":
-    demo.launch(show_api=True)
+    demo.launch(show_api=True)
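
With the voice picker gone, generate_podcast takes five inputs (files, url, tone, length, language), and each cached example row matches that order. A minimal sketch of calling the updated function directly, assuming generate_podcast is importable from app.py and that its Tuple[str, str] return value is the audio file path and the Markdown transcript (the two values the loop above assembles; the diff shows only the annotation, not the return statement):

    # Hypothetical direct call mirroring the first cached example.
    # The (audio_path, transcript) unpacking order is an assumption.
    from app import generate_podcast

    audio_path, transcript = generate_podcast(
        files=["examples/1310.4546v1.pdf"],
        url=None,
        tone="Fun",
        length="Short (1-2 min)",
        language="English",
    )
    print(audio_path)
    print(transcript)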
utils.py CHANGED
@@ -7,20 +7,12 @@ Functions:
 - get_audio: Get the audio from the TTS model from HF Spaces.
 """
 
 import os
 import requests
-import tempfile
-
 
-import soundfile as sf
-import spaces
-import torch
 from gradio_client import Client
 from openai import OpenAI
-from parler_tts import ParlerTTSForConditionalGeneration
 from pydantic import ValidationError
-from transformers import AutoTokenizer
-
 
 MODEL_ID = "accounts/fireworks/models/llama-v3p1-405b-instruct"
 JINA_URL = "https://r.jina.ai/"
@@ -32,10 +24,6 @@ client = OpenAI(
 
 hf_client = Client("mrfakename/MeloTTS")
 
-# Initialize the model and tokenizer (do this outside the function for efficiency)
-device = "cuda:0" if torch.cuda.is_available() else "cpu"
-model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-mini-v1").to(device)
-tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-v1")
 
 def generate_script(system_prompt: str, input_text: str, output_model):
     """Get the dialogue from the LLM."""
@@ -79,38 +67,20 @@ def parse_url(url: str) -> str:
     response = requests.get(full_url, timeout=60)
     return response.text
 
-def generate_audio(text: str, speaker: str, language: str, voice: str) -> str:
-    """Generate audio using the local Parler TTS model or HuggingFace client."""
-
-    if language == "EN":
-        # Adjust the description based on speaker and language
-        if speaker == "Guest":
-            description = f"{voice} has a slightly expressive and animated speech, speaking at a moderate speed with natural pitch variations. The voice is clear and close-up, as if recorded in a professional studio."
-        else: # host
-            description = f"{voice} has a professional and engaging tone, speaking at a moderate to slightly faster pace. The voice is clear, warm, and sounds like a seasoned podcast host."
-
-        # Prepare inputs
-        input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
-        prompt_input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)
 
-        # Generate audio
-        generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
-        audio_arr = generation.cpu().numpy().squeeze()
-
-        # Save to temporary file
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
-            sf.write(temp_file.name, audio_arr, model.config.sampling_rate, format='mp3')
-
-        return temp_file.name
+def generate_audio(text: str, speaker: str, language: str) -> bytes:
+    """Get the audio from the TTS model from HF Spaces and adjust pitch if necessary."""
+    if speaker == "Guest":
+        accent = "EN-US" if language == "EN" else language
+        speed = 0.9
+    else: # host
+        accent = "EN-Default" if language == "EN" else language
+        speed = 1
+    if language != "EN" and speaker != "Guest":
+        speed = 1.1
 
-    else:
-        accent = language
-        if speaker == "Guest":
-            speed = 0.9
-        else: # host
-            speed = 1.1
-        # Generate audio
-        result = hf_client.predict(
-            text=text, language=language, speaker=accent, speed=speed, api_name="/synthesize"
-        )
-        return result
+    # Generate audio
+    result = hf_client.predict(
+        text=text, language=language, speaker=accent, speed=speed, api_name="/synthesize"
+    )
+    return result
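
For reference, a minimal usage sketch of the slimmed-down helper, mirroring the per-line loop in app.py. It assumes the MeloTTS Space's /synthesize endpoint hands back a local file path that pydub can open, which is how app.py consumes the result despite the bytes annotation:

    # Minimal sketch, assuming generate_audio returns a local file path
    # (app.py passes the result straight to AudioSegment.from_file).
    from pydub import AudioSegment

    from utils import generate_audio

    lines = [
        ("Host (Jane)", "Welcome to the show!"),
        ("Guest", "Thanks for having me."),
    ]

    segments = []
    for speaker, text in lines:
        audio_file_path = generate_audio(text, speaker, "EN")
        segments.append(AudioSegment.from_file(audio_file_path))

    # Stitch the per-line segments into one track, as app.py does.
    combined = sum(segments[1:], segments[0])
    combined.export("podcast.mp3", format="mp3")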