gabrielchua committed on
Commit
f17c34f
1 Parent(s): 6141c41

update app

Browse files
Files changed (3) hide show
  1. app.py +64 -27
  2. prompts.py +43 -31
  3. utils.py +19 -10
app.py CHANGED
@@ -13,7 +13,7 @@ from typing import List, Literal, Tuple, Optional
13
  # Third-party imports
14
  import gradio as gr
15
  from loguru import logger
16
- from pydantic import BaseModel
17
  from pypdf import PdfReader
18
  from pydub import AudioSegment
19
 
@@ -29,20 +29,29 @@ class DialogueItem(BaseModel):
29
  text: str
30
 
31
 
32
- class Dialogue(BaseModel):
33
  """The dialogue between the host and guest."""
34
 
35
  scratchpad: str
36
  name_of_guest: str
37
- dialogue: List[DialogueItem]
 
 
 
 
 
 
 
 
38
 
39
 
40
  def generate_podcast(
41
  files: List[str],
42
  url: Optional[str],
 
43
  tone: Optional[str],
44
  length: Optional[str],
45
- language: str
46
  ) -> Tuple[str, str]:
47
  """Generate the audio and transcript from the PDFs and/or URL."""
48
  text = ""
@@ -64,8 +73,10 @@ def generate_podcast(
64
  # Process PDFs if any
65
  if files:
66
  for file in files:
67
- if not file.lower().endswith('.pdf'):
68
- raise gr.Error(f"File {file} is not a PDF. Please upload only PDF files.")
 
 
69
 
70
  try:
71
  with Path(file).open("rb") as f:
@@ -84,10 +95,14 @@ def generate_podcast(
84
 
85
  # Check total character count
86
  if len(text) > 100000:
87
- raise gr.Error("The total content is too long. Please ensure the combined text from PDFs and URL is fewer than ~100,000 characters.")
88
-
89
- # Modify the system prompt based on the chosen tone and length
 
 
90
  modified_system_prompt = SYSTEM_PROMPT
 
 
91
  if tone:
92
  modified_system_prompt += f"\n\nTONE: The tone of the podcast should be {tone}."
93
  if length:
@@ -97,10 +112,15 @@ def generate_podcast(
97
  }
98
  modified_system_prompt += f"\n\nLENGTH: {length_instructions[length]}"
99
  if language:
100
- modified_system_prompt += f"\n\nOUTPUT LANGUAGE <IMPORTANT>: The the podcast should be {language}."
 
 
101
 
102
  # Call the LLM
103
- llm_output = generate_script(modified_system_prompt, text, Dialogue)
 
 
 
104
  logger.info(f"Generated dialogue: {llm_output}")
105
 
106
  # Process the dialogue
@@ -118,7 +138,9 @@ def generate_podcast(
118
  total_characters += len(line.text)
119
 
120
  # Get audio file path
121
- audio_file_path = generate_audio(line.text, line.speaker, language_mapping[language])
 
 
122
  # Read the audio file into an AudioSegment
123
  audio_segment = AudioSegment.from_file(audio_file_path)
124
  audio_segments.append(audio_segment)
@@ -149,36 +171,48 @@ def generate_podcast(
149
 
150
  demo = gr.Interface(
151
  title="Open NotebookLM",
152
- description="Convert your PDFs into podcasts with open-source AI models (Llama 3.1 405B and MeloTTS). \n \n Note: Only the text content of the PDFs will be processed. Images and tables are not included. The total content should be no more than 100,000 characters due to the context length of Llama 3.1 405B.",
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  fn=generate_podcast,
154
  inputs=[
155
  gr.File(
156
- label="1. 📄 Upload your PDF(s)",
157
- file_types=[".pdf"],
158
- file_count="multiple"
159
  ),
160
  gr.Textbox(
161
  label="2. 🔗 Paste a URL (optional)",
162
- placeholder="Enter a URL to include its content"
163
  ),
164
- gr.Radio(
 
165
  choices=["Fun", "Formal"],
166
- label="3. 🎭 Choose the tone",
167
  value="Fun"
168
  ),
169
- gr.Radio(
170
  choices=["Short (1-2 min)", "Medium (3-5 min)"],
171
- label="4. ⏱️ Choose the length",
172
  value="Medium (3-5 min)"
173
  ),
174
  gr.Dropdown(
175
  choices=["English", "Spanish", "French", "Chinese", "Japanese", "Korean"],
176
  value="English",
177
- label="5. 🌐 Choose the language (Highly experimental, English is recommended)",
178
  ),
179
  ],
180
  outputs=[
181
- gr.Audio(label="Audio", format="mp3"),
182
  gr.Markdown(label="Transcript"),
183
  ],
184
  allow_flagging="never",
@@ -189,27 +223,30 @@ demo = gr.Interface(
189
  [
190
  [str(Path("examples/1310.4546v1.pdf"))],
191
  "",
 
192
  "Fun",
193
  "Short (1-2 min)",
194
- "English"
195
  ],
196
  [
197
  [],
198
  "https://en.wikipedia.org/wiki/Hugging_Face",
 
199
  "Fun",
200
  "Short (1-2 min)",
201
- "English"
202
  ],
203
  [
204
  [],
205
  "https://simple.wikipedia.org/wiki/Taylor_Swift",
 
206
  "Fun",
207
  "Short (1-2 min)",
208
- "English"
209
  ],
210
  ],
211
  cache_examples=True,
212
  )
213
 
214
  if __name__ == "__main__":
215
- demo.launch(show_api=True)
 
13
  # Third-party imports
14
  import gradio as gr
15
  from loguru import logger
16
+ from pydantic import BaseModel, Field
17
  from pypdf import PdfReader
18
  from pydub import AudioSegment
19
 
 
29
  text: str
30
 
31
 
32
+ class ShortDialogue(BaseModel):
33
  """The dialogue between the host and guest."""
34
 
35
  scratchpad: str
36
  name_of_guest: str
37
+ dialogue: List[DialogueItem] = Field(..., description="A list of dialogue items, typically between 5 to 9 items")
38
+
39
+
40
+ class MediumDialogue(BaseModel):
41
+ """The dialogue between the host and guest."""
42
+
43
+ scratchpad: str
44
+ name_of_guest: str
45
+ dialogue: List[DialogueItem] = Field(..., description="A list of dialogue items, typically between 8 to 13 items")
46
 
47
 
48
  def generate_podcast(
49
  files: List[str],
50
  url: Optional[str],
51
+ question: Optional[str],
52
  tone: Optional[str],
53
  length: Optional[str],
54
+ language: str,
55
  ) -> Tuple[str, str]:
56
  """Generate the audio and transcript from the PDFs and/or URL."""
57
  text = ""
 
73
  # Process PDFs if any
74
  if files:
75
  for file in files:
76
+ if not file.lower().endswith(".pdf"):
77
+ raise gr.Error(
78
+ f"File {file} is not a PDF. Please upload only PDF files."
79
+ )
80
 
81
  try:
82
  with Path(file).open("rb") as f:
 
95
 
96
  # Check total character count
97
  if len(text) > 100000:
98
+ raise gr.Error(
99
+ "The total content is too long. Please ensure the combined text from PDFs and URL is fewer than ~100,000 characters."
100
+ )
101
+
102
+ # Modify the system prompt based on the user input
103
  modified_system_prompt = SYSTEM_PROMPT
104
+ if question:
105
+ modified_system_prompt += f"\n\nPLEASE ANSWER THE FOLLOWING QN: {question}"
106
  if tone:
107
  modified_system_prompt += f"\n\nTONE: The tone of the podcast should be {tone}."
108
  if length:
 
112
  }
113
  modified_system_prompt += f"\n\nLENGTH: {length_instructions[length]}"
114
  if language:
115
+ modified_system_prompt += (
116
+ f"\n\nOUTPUT LANGUAGE <IMPORTANT>: The podcast should be in {language}."
117
+ )
118
 
119
  # Call the LLM
120
+ if length == "Short (1-2 min)":
121
+ llm_output = generate_script(modified_system_prompt, text, ShortDialogue)
122
+ else:
123
+ llm_output = generate_script(modified_system_prompt, text, MediumDialogue)
124
  logger.info(f"Generated dialogue: {llm_output}")
125
 
126
  # Process the dialogue
 
138
  total_characters += len(line.text)
139
 
140
  # Get audio file path
141
+ audio_file_path = generate_audio(
142
+ line.text, line.speaker, language_mapping[language]
143
+ )
144
  # Read the audio file into an AudioSegment
145
  audio_segment = AudioSegment.from_file(audio_file_path)
146
  audio_segments.append(audio_segment)
 
171
 
172
  demo = gr.Interface(
173
  title="Open NotebookLM",
174
+ description="""
175
+
176
+ <table style="border-collapse: collapse; border: none; padding: 20px;">
177
+ <tr style="border: none;">
178
+ <td style="border: none; vertical-align: top; padding-right: 30px; padding-left: 30px;">
179
+ <img src="https://raw.githubusercontent.com/gabrielchua/open-notebooklm/main/icon.png" alt="Open NotebookLM" width="120" style="margin-bottom: 10px;">
180
+ </td>
181
+ <td style="border: none; vertical-align: top; padding: 10px;">
182
+ <p style="margin-bottom: 15px;"><strong>Convert</strong> your PDFs into podcasts with open-source AI models (Llama 3.1 405B and MeloTTS).</p>
183
+ <p style="margin-top: 15px;">Note: Only the text content of the PDFs will be processed. Images and tables are not included. The total content should be no more than 100,000 characters due to the context length of Llama 3.1 405B.</p>
184
+ </td>
185
+ </tr>
186
+ </table>
187
+ """,
188
  fn=generate_podcast,
189
  inputs=[
190
  gr.File(
191
+ label="1. 📄 Upload your PDF(s)", file_types=[".pdf"], file_count="multiple"
 
 
192
  ),
193
  gr.Textbox(
194
  label="2. 🔗 Paste a URL (optional)",
195
+ placeholder="Enter a URL to include its content",
196
  ),
197
+ gr.Textbox(label="3. 🤔 Do you have a specific question or topic in mind?"),
198
+ gr.Dropdown(
199
  choices=["Fun", "Formal"],
200
+ label="4. 🎭 Choose the tone",
201
  value="Fun"
202
  ),
203
+ gr.Dropdown(
204
  choices=["Short (1-2 min)", "Medium (3-5 min)"],
205
+ label="5. ⏱️ Choose the length",
206
  value="Medium (3-5 min)"
207
  ),
208
  gr.Dropdown(
209
  choices=["English", "Spanish", "French", "Chinese", "Japanese", "Korean"],
210
  value="English",
211
+ label="6. 🌐 Choose the language"
212
  ),
213
  ],
214
  outputs=[
215
+ gr.Audio(label="Podcast", format="mp3"),
216
  gr.Markdown(label="Transcript"),
217
  ],
218
  allow_flagging="never",
 
223
  [
224
  [str(Path("examples/1310.4546v1.pdf"))],
225
  "",
226
+ "Explain this paper to me like I'm 5 years old",
227
  "Fun",
228
  "Short (1-2 min)",
229
+ "English",
230
  ],
231
  [
232
  [],
233
  "https://en.wikipedia.org/wiki/Hugging_Face",
234
+ "How did Hugging Face become so successful?",
235
  "Fun",
236
  "Short (1-2 min)",
237
+ "English",
238
  ],
239
  [
240
  [],
241
  "https://simple.wikipedia.org/wiki/Taylor_Swift",
242
+ "Why is Taylor Swift so popular?",
243
  "Fun",
244
  "Short (1-2 min)",
245
+ "English",
246
  ],
247
  ],
248
  cache_examples=True,
249
  )
250
 
251
  if __name__ == "__main__":
252
+ demo.launch(show_api=True)
prompts.py CHANGED
@@ -3,41 +3,53 @@ prompts.py
3
  """
4
 
5
  SYSTEM_PROMPT = """
6
- You are a world-class podcast producer.
7
- Your task is to transform the provided input text into an engaging and informative podcast script.
8
- You will receive as input a text that may be unstructured or messy, sourced from places like PDFs or web pages. Ignore irrelevant information or formatting issues. Y
9
- Your focus is on extracting the most interesting and insightful content for a podcast discussion.
10
 
11
  # Steps to Follow:
12
 
13
  1. **Analyze the Input:**
14
- Carefully read the input text. Identify the key topics, points, and any interesting facts or anecdotes that could drive a compelling podcast conversation.
15
 
16
  2. **Brainstorm Ideas:**
17
- In the `<scratchpad>`, brainstorm creative ways to present the key points in an engaging manner. Think of analogies, storytelling techniques, or hypothetical scenarios to make the content relatable and entertaining for listeners.
18
-
19
- - Keep the discussion accessible to a general audience. Avoid jargon and briefly explain complex concepts in simple terms.
20
- - Use imagination to fill in any gaps or create thought-provoking questions to explore during the podcast.
21
- - Your aim is to create an entertaining and informative podcast, so feel free to be creative with your approach.
22
-
23
- 3. **Write the Dialogue:**
24
- Now, develop the podcast dialogue. Aim for a natural, conversational flow between the host (named Jane) and the guest speaker (the author of the input text, if mentioned).
25
-
26
- - Use the best ideas from your brainstorming session.
27
- - Ensure complex topics are explained clearly and simply.
28
- - Focus on maintaining an engaging and lively tone that would captivate listeners.
29
- - Rules:
30
- > The host ALWAYS goes first and is interviewing the guest. The guest is the one who explains the topic.
31
- > The host should ask the guest questions.
32
- > The host should summarize the key insights at the end.
33
- > Include common verbal fillers like "uhms" and "errs" in the host and guests response. This is so the script is realistic.
34
- > The host and guest can interrupt each other.
35
- > The guest must NOT include marketing or self-promotional content.
36
- > The guest must NOT include any material NOT substantiated within the input text.
37
- > This is to be a PG conversation.
38
-
39
- 4. **Wrap it Up:**
40
- At the end of the dialogue, the host and guest should naturally summarize the key insights. This should feel like a casual conversation, rather than a formal recap, reinforcing the main points one last time before signing off.
41
-
42
- ALWAYS REPLY IN VALID JSON, AND NO CODE BLOCKS. BEGIN DIRECTLY WITH THE JSON OUTPUT.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  """
 
3
  """
4
 
5
  SYSTEM_PROMPT = """
6
+ You are a world-class podcast producer tasked with transforming the provided input text into an engaging and informative podcast script. The input may be unstructured or messy, sourced from PDFs or web pages. Your goal is to extract the most interesting and insightful content for a compelling podcast discussion.
 
 
 
7
 
8
  # Steps to Follow:
9
 
10
  1. **Analyze the Input:**
11
+ Carefully examine the text, identifying key topics, points, and interesting facts or anecdotes that could drive an engaging podcast conversation. Disregard irrelevant information or formatting issues.
12
 
13
  2. **Brainstorm Ideas:**
14
+ In the `<scratchpad>`, creatively brainstorm ways to present the key points engagingly. Consider:
15
+ - Analogies, storytelling techniques, or hypothetical scenarios to make content relatable
16
+ - Ways to make complex topics accessible to a general audience
17
+ - Thought-provoking questions to explore during the podcast
18
+ - Creative approaches to fill any gaps in the information
19
+
20
+ 3. **Craft the Dialogue:**
21
+ Develop a natural, conversational flow between the host (Jane) and the guest speaker (the author or an expert on the topic). Incorporate:
22
+ - The best ideas from your brainstorming session
23
+ - Clear explanations of complex topics
24
+ - An engaging and lively tone to captivate listeners
25
+ - A balance of information and entertainment
26
+
27
+ Rules for the dialogue:
28
+ - The host (Jane) always initiates the conversation and interviews the guest
29
+ - Include thoughtful questions from the host to guide the discussion
30
+ - Incorporate natural speech patterns, including occasional verbal fillers (e.g., "um," "well," "you know")
31
+ - Allow for natural interruptions and back-and-forth between host and guest
32
+ - Ensure the guest's responses are substantiated by the input text, avoiding unsupported claims
33
+ - Maintain a PG-rated conversation appropriate for all audiences
34
+ - Avoid any marketing or self-promotional content from the guest
35
+ - The host concludes the conversation
36
+
37
+ 4. **Summarize Key Insights:**
38
+ Naturally weave a summary of key points into the closing part of the dialogue. This should feel like a casual conversation rather than a formal recap, reinforcing the main takeaways before signing off.
39
+
40
+ 5. **Maintain Authenticity:**
41
+ Throughout the script, strive for authenticity in the conversation. Include:
42
+ - Moments of genuine curiosity or surprise from the host
43
+ - Instances where the guest might briefly struggle to articulate a complex idea
44
+ - Light-hearted moments or humor when appropriate
45
+ - Brief personal anecdotes or examples that relate to the topic (within the bounds of the input text)
46
+
47
+ 6. **Consider Pacing and Structure:**
48
+ Ensure the dialogue has a natural ebb and flow:
49
+ - Start with a strong hook to grab the listener's attention
50
+ - Gradually build complexity as the conversation progresses
51
+ - Include brief "breather" moments for listeners to absorb complex information
52
+ - End on a high note, perhaps with a thought-provoking question or a call-to-action for listeners
53
+
54
+ Remember: Always reply in valid JSON format, without code blocks. Begin directly with the JSON output.
55
  """
utils.py CHANGED
@@ -30,17 +30,22 @@ def generate_script(system_prompt: str, input_text: str, output_model):
30
  # Load as python object
31
  try:
32
  response = call_llm(system_prompt, input_text, output_model)
33
- dialogue = output_model.model_validate_json(
34
- response.choices[0].message.content
35
- )
36
  except ValidationError as e:
37
  error_message = f"Failed to parse dialogue JSON: {e}"
38
  system_prompt_with_error = f"{system_prompt}\n\nPlease return a VALID JSON object. This was the earlier error: {error_message}"
39
  response = call_llm(system_prompt_with_error, input_text, output_model)
40
- dialogue = output_model.model_validate_json(
41
- response.choices[0].message.content
42
- )
43
- return dialogue
 
 
 
 
 
 
 
44
 
45
 
46
  def call_llm(system_prompt: str, text: str, dialogue_format):
@@ -78,9 +83,13 @@ def generate_audio(text: str, speaker: str, language: str) -> bytes:
78
  speed = 1
79
  if language != "EN" and speaker != "Guest":
80
  speed = 1.1
81
-
82
  # Generate audio
83
  result = hf_client.predict(
84
- text=text, language=language, speaker=accent, speed=speed, api_name="/synthesize"
 
 
 
 
85
  )
86
- return result
 
30
  # Load as python object
31
  try:
32
  response = call_llm(system_prompt, input_text, output_model)
33
+ dialogue = output_model.model_validate_json(response.choices[0].message.content)
 
 
34
  except ValidationError as e:
35
  error_message = f"Failed to parse dialogue JSON: {e}"
36
  system_prompt_with_error = f"{system_prompt}\n\nPlease return a VALID JSON object. This was the earlier error: {error_message}"
37
  response = call_llm(system_prompt_with_error, input_text, output_model)
38
+ dialogue = output_model.model_validate_json(response.choices[0].message.content)
39
+
40
+ # Call the LLM again to improve the dialogue
41
+ system_prompt_with_dialogue = f"{system_prompt}\n\nHere is the first draft of the dialogue you provided:\n\n{dialogue}."
42
+ response = call_llm(
43
+ system_prompt_with_dialogue, "Please improve the dialogue.", output_model
44
+ )
45
+ improved_dialogue = output_model.model_validate_json(
46
+ response.choices[0].message.content
47
+ )
48
+ return improved_dialogue
49
 
50
 
51
  def call_llm(system_prompt: str, text: str, dialogue_format):
 
83
  speed = 1
84
  if language != "EN" and speaker != "Guest":
85
  speed = 1.1
86
+
87
  # Generate audio
88
  result = hf_client.predict(
89
+ text=text,
90
+ language=language,
91
+ speaker=accent,
92
+ speed=speed,
93
+ api_name="/synthesize",
94
  )
95
+ return result