Tanmay Jain committed on
Commit c6fc13f · 1 Parent(s): 94bf4c8

init commit

.gitignore ADDED
@@ -0,0 +1,2 @@
+ .env
+ __pycache__
classes/edge_tts_generator.py ADDED
@@ -0,0 +1,83 @@
+ # classes/edge_tts_generator.py
+
+ import ast
+ import pickle
+ import edge_tts
+ from tqdm import tqdm
+
+ class EdgeTTSGenerator:
+     """
+     A class to generate podcast-style audio from a transcript using edge-tts.
+     """
+     def __init__(self, transcript_file_path, output_audio_path):
+         """
+         Initialize the TTS generator with the path to the rewritten transcript file.
+
+         Args:
+             transcript_file_path (str): Path to the file containing the rewritten transcript.
+             output_audio_path (str): Path to save the generated audio file.
+         """
+         self.transcript_file_path = transcript_file_path
+         self.output_audio_path = output_audio_path
+
+         # edge-tts voices for the two speakers
+         self.speaker1_voice = "en-US-AriaNeural"
+         self.speaker2_voice = "en-US-GuyNeural"
+
+     def load_transcript(self):
+         """
+         Loads the rewritten transcript from the specified file.
+
+         Returns:
+             list: The content of the transcript as a list of tuples (speaker, text).
+         """
+         # The pickle holds a string representation of the list, so parse it safely.
+         with open(self.transcript_file_path, 'rb') as f:
+             return ast.literal_eval(pickle.load(f))
+
+     async def generate_audio_segment(self, text, voice_name):
+         """
+         Generate audio for a given text using edge-tts.
+
+         Args:
+             text (str): Text to be synthesized.
+             voice_name (str): The voice name to use for TTS.
+
+         Returns:
+             bytes: Generated audio data.
+         """
+         communicator = edge_tts.Communicate(text, voice_name)
+         audio_bytes = b""
+         async for chunk in communicator.stream():
+             if "data" in chunk:  # audio chunks carry a 'data' key; metadata chunks do not
+                 audio_bytes += chunk["data"]
+         return audio_bytes
+
+     def save_audio(self, audio_data):
+         """
+         Save the combined audio data to an output file.
+
+         Args:
+             audio_data (list): List of bytes containing the audio data for each segment.
+         """
+         combined_audio = b"".join(audio_data)
+         with open(self.output_audio_path, "wb") as f:
+             f.write(combined_audio)
+
+     async def generate_audio(self):
+         """
+         Converts the transcript into audio and saves it to a file.
+
+         Returns:
+             str: Path to the saved audio file.
+         """
+         transcript = self.load_transcript()
+         audio_data = []
+
+         for speaker, text in tqdm(transcript, desc="Generating podcast segments", unit="segment"):
+             voice = self.speaker1_voice if speaker == "Speaker 1" else self.speaker2_voice
+             segment_audio = await self.generate_audio_segment(text, voice)
+             audio_data.append(segment_audio)
+
+         self.save_audio(audio_data)
+         return self.output_audio_path
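For reference, a minimal sketch of driving this generator on its own, outside the Gradio app. It assumes the pickle holds a stringified list of (speaker, text) tuples, as TranscriptProcessor produces; the file names are only examples:

    import asyncio
    from classes.edge_tts_generator import EdgeTTSGenerator

    # Example paths; any transcript pickled by TranscriptProcessor works here.
    generator = EdgeTTSGenerator("podcast_ready_data.pkl", "final_podcast_audio.mp3")
    audio_path = asyncio.run(generator.generate_audio())  # drives the async edge-tts calls
    print(f"Podcast saved to {audio_path}")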
classes/pdf_text_extractor.py ADDED
@@ -0,0 +1,145 @@
+ # classes/pdf_text_extractor.py
+
+ import os
+ import time
+
+ import openai
+ from PyPDF2 import PdfReader
+ from tqdm import tqdm
+
+ from prompts import PDF_SYSTEM_PROMPT
+ from config import llm_configs
+
+ class PDFTextExtractor:
+     """
+     A class to handle PDF text extraction and preprocessing for podcast preparation.
+     """
+     def __init__(self, pdf_path, output_path, model_name="llama3-70b-8192", llm_config=None, max_chars=100000, chunk_size=1000):
+         """
+         Initialize the PDFTextExtractor with paths and model details.
+
+         Args:
+             pdf_path (str): Path to the PDF file.
+             output_path (str): Path to save the cleaned text file.
+             model_name (str): Name of the model to use for text processing.
+             llm_config (dict): Configuration for the LLM.
+             max_chars (int): Maximum number of characters to process from the PDF.
+             chunk_size (int): Size of text chunks to process at a time.
+         """
+         self.pdf_path = pdf_path
+         self.output_path = output_path
+         self.max_chars = int(max_chars)    # tolerate float input (e.g., from Gradio Number fields)
+         self.chunk_size = int(chunk_size)
+         self.model_name = model_name
+         self.llm_config = llm_config or llm_configs.get(model_name)
+
+         if self.llm_config is None:
+             raise ValueError(f"Model configuration for {model_name} not found in llm_configs.")
+
+         # System prompt for text processing
+         self.system_prompt = PDF_SYSTEM_PROMPT
+
+     def create_client(self):
+         openai.api_key = self.llm_config["api_key"]
+         openai.api_base = self.llm_config["base_url"]
+         return openai
+
+     def validate_pdf(self):
+         """Check if the file exists and is a valid PDF."""
+         if not os.path.exists(self.pdf_path):
+             print(f"Error: File not found at path: {self.pdf_path}")
+             return False
+         if not self.pdf_path.lower().endswith('.pdf'):
+             print("Error: File is not a PDF")
+             return False
+         return True
+
+     def extract_text(self):
+         """Extract text from the PDF, limited by max_chars."""
+         if not self.validate_pdf():
+             return None
+
+         with open(self.pdf_path, 'rb') as file:
+             pdf_reader = PdfReader(file)
+             num_pages = len(pdf_reader.pages)
+             print(f"Processing PDF with {num_pages} pages...")
+
+             extracted_text = []
+             total_chars = 0
+
+             for page_num in range(num_pages):
+                 page = pdf_reader.pages[page_num]
+                 text = page.extract_text() or ""
+
+                 if total_chars + len(text) > self.max_chars:
+                     remaining_chars = self.max_chars - total_chars
+                     extracted_text.append(text[:remaining_chars])
+                     print(f"Reached {self.max_chars} character limit at page {page_num + 1}")
+                     break
+
+                 extracted_text.append(text)
+                 total_chars += len(text)
+                 print(f"Processed page {page_num + 1}/{num_pages}")
+
+         final_text = '\n'.join(extracted_text)
+         print(f"Extraction complete! Total characters: {len(final_text)}")
+         return final_text
+
+     def create_word_bounded_chunks(self, text):
+         """Split text into chunks around the target size."""
+         words = text.split()
+         chunks = []
+         current_chunk = []
+         current_length = 0
+
+         for word in words:
+             word_length = len(word) + 1  # +1 for the space
+             if current_length + word_length > self.chunk_size and current_chunk:
+                 chunks.append(' '.join(current_chunk))
+                 current_chunk = [word]
+                 current_length = word_length
+             else:
+                 current_chunk.append(word)
+                 current_length += word_length
+
+         if current_chunk:
+             chunks.append(' '.join(current_chunk))
+
+         return chunks
+
+     def process_chunk(self, text_chunk):
+         """Process a text chunk with the model and return the cleaned text."""
+         conversation = [
+             {"role": "system", "content": self.system_prompt},
+             {"role": "user", "content": text_chunk}
+         ]
+         client = self.create_client()
+
+         response = client.ChatCompletion.create(
+             model=self.model_name,
+             messages=conversation,
+         )
+
+         processed_text = response.choices[0].message.content
+
+         return processed_text
+
+     def clean_and_save_text(self):
+         """Extract, clean, and save processed text to a file."""
+         extracted_text = self.extract_text()
+         if not extracted_text:
+             return None
+
+         chunks = self.create_word_bounded_chunks(extracted_text)
+         processed_text = ""
+
+         with open(self.output_path, 'w', encoding='utf-8') as out_file:
+             for chunk_num, chunk in enumerate(tqdm(chunks, desc="Processing chunks")):
+                 processed_chunk = self.process_chunk(chunk)
+                 processed_text += processed_chunk + "\n"
+                 out_file.write(processed_chunk + "\n")
+                 out_file.flush()
+                 time.sleep(3)  # brief pause between requests to avoid rate limiting
+
+         print(f"\nExtracted and cleaned text has been saved to {self.output_path}")
+         return self.output_path
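A rough standalone usage sketch; the file names are placeholders, and GROQ_API_KEY is assumed to be set so the llama3-70b-8192 entry in llm_configs resolves:

    from classes.pdf_text_extractor import PDFTextExtractor

    extractor = PDFTextExtractor(
        "paper.pdf",       # input PDF (placeholder)
        "clean_text.txt",  # destination for the cleaned text
        model_name="llama3-70b-8192",
    )
    clean_path = extractor.clean_and_save_text()  # extract -> chunk -> LLM cleanup -> save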
classes/transcript_processor.py ADDED
@@ -0,0 +1,129 @@
+ # classes/transcript_processor.py
+
+ import openai
+ import pickle
+ import re
+
+ from prompts import TRANSCRIPT_PROMPT, REWRITE_PROMPT
+ from config import llm_configs
+
+ class TranscriptProcessor:
+     """
+     A class to generate and rewrite podcast-style transcripts using a specified language model.
+     """
+
+     def __init__(self, text_file_path, transcript_output_path, tts_output_path, model_name="llama3-70b-8192", llm_config=None):
+         """
+         Initialize with the path to the cleaned text file and the model name.
+
+         Args:
+             text_file_path (str): Path to the file containing cleaned PDF text.
+             transcript_output_path (str): Path to save the generated transcript.
+             tts_output_path (str): Path to save the rewritten transcript for TTS.
+             model_name (str): Name of the language model to use.
+             llm_config (dict): Configuration for the LLM.
+         """
+         self.text_file_path = text_file_path
+         self.transcript_output_path = transcript_output_path
+         self.tts_output_path = tts_output_path
+         self.model_name = model_name
+         self.llm_config = llm_config or llm_configs.get(model_name)
+
+         if self.llm_config is None:
+             raise ValueError(f"Model configuration for {model_name} not found in llm_configs.")
+
+         self.transcript_prompt = TRANSCRIPT_PROMPT
+         self.rewrite_prompt = REWRITE_PROMPT
+
+     def create_client(self):
+         openai.api_key = self.llm_config["api_key"]
+         openai.api_base = self.llm_config["base_url"]
+         return openai
+
+     def load_text(self):
+         """
+         Reads the cleaned text file and returns its content.
+
+         Returns:
+             str: Content of the cleaned text file.
+         """
+         encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
+         for encoding in encodings:
+             try:
+                 with open(self.text_file_path, 'r', encoding=encoding) as file:
+                     content = file.read()
+                     print(f"Successfully read file using {encoding} encoding.")
+                     return content
+             except (UnicodeDecodeError, FileNotFoundError):
+                 continue
+         print(f"Error: Could not decode file '{self.text_file_path}' with any common encoding.")
+         return None
+
+     def generate_transcript(self):
+         """
+         Generates a podcast-style transcript and saves it as a pickled file.
+
+         Returns:
+             str: Path to the file where the transcript is saved.
+         """
+         input_text = self.load_text()
+         if input_text is None:
+             return None
+
+         messages = [
+             {"role": "system", "content": self.transcript_prompt},
+             {"role": "user", "content": input_text}
+         ]
+
+         client = self.create_client()
+
+         response = client.ChatCompletion.create(
+             model=self.model_name,
+             messages=messages,
+         )
+
+         transcript = response.choices[0].message.content
+
+         # Save the transcript as a pickle file
+         with open(self.transcript_output_path, 'wb') as f:
+             pickle.dump(transcript, f)
+
+         return self.transcript_output_path
+
+     def extract_tuple(self, text):
+         """Return the bracketed list portion of the text, or None if absent."""
+         match = re.search(r'\[.*\]', text, re.DOTALL)
+         if match:
+             return match.group(0)
+         return None
+
+     def rewrite_transcript(self):
+         """
+         Refines the transcript for TTS, adding expressive elements and saving as a list of tuples.
+
+         Returns:
+             str: Path to the file where the TTS-ready transcript is saved.
+         """
+         # Load the initial generated transcript
+         with open(self.transcript_output_path, 'rb') as file:
+             input_transcript = pickle.load(file)
+
+         messages = [
+             {"role": "system", "content": self.rewrite_prompt},
+             {"role": "user", "content": input_transcript}
+         ]
+
+         client = self.create_client()
+
+         response = client.ChatCompletion.create(
+             model=self.model_name,
+             messages=messages,
+         )
+
+         rewritten_transcript = self.extract_tuple(response.choices[0].message.content)
+
+         # Save the rewritten transcript as a pickle file
+         with open(self.tts_output_path, 'wb') as f:
+             pickle.dump(rewritten_transcript, f)
+
+         return self.tts_output_path
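A short sketch of the two-pass flow this class implements; the paths are placeholders and the selected model's API key is assumed to be set:

    from classes.transcript_processor import TranscriptProcessor

    proc = TranscriptProcessor("clean_text.txt", "data.pkl", "podcast_ready_data.pkl")
    proc.generate_transcript()            # pass 1: free-form dialogue, pickled as a string
    tts_path = proc.rewrite_transcript()  # pass 2: stringified list of (speaker, text) tuples

EdgeTTSGenerator later recovers the tuple list from that second pickle with ast.literal_eval.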
config.py ADDED
@@ -0,0 +1,58 @@
+ # config.py
+
+ import os
+ from dotenv import load_dotenv
+
+ # Pull API keys from a local .env file (ignored by git), if present
+ load_dotenv()
+
+ llm_configs = {
+     # Mistral Models
+     "mistral-large-latest": {
+         "base_url": "https://api.mistral.ai/v1",
+         "api_key": os.environ.get("MISTRAL_API_KEY"),
+         "provider": "mistral",
+         "version": "mistral-large-2407"
+     },
+     "mistral-small-latest": {
+         "base_url": "https://api.mistral.ai/v1",
+         "api_key": os.environ.get("MISTRAL_API_KEY"),
+         "provider": "mistral",
+         "version": "mistral-small-2409"
+     },
+     "open-mistral-nemo": {
+         "base_url": "https://api.mistral.ai/v1",
+         "api_key": os.environ.get("MISTRAL_API_KEY"),
+         "provider": "mistral",
+         "version": "open-mistral-nemo-2407"
+     },
+
+     # Groq Models
+     "llama-3.1-70b-versatile": {
+         "base_url": "https://api.groq.com/openai/v1",
+         "api_key": os.environ.get("GROQ_API_KEY"),
+         "provider": "groq"
+     },
+     "mixtral-8x7b-32768": {
+         "base_url": "https://api.groq.com/openai/v1",
+         "api_key": os.environ.get("GROQ_API_KEY"),
+         "provider": "groq"
+     },
+     "llama3-70b-8192": {
+         "base_url": "https://api.groq.com/openai/v1",
+         "api_key": os.environ.get("GROQ_API_KEY"),
+         "provider": "groq"
+     },
+
+     # Grok Models
+     "grok-beta": {
+         "base_url": "https://api.x.ai/v1",
+         "api_key": os.environ.get("GROK_API_KEY"),
+         "provider": "grok",
+         "context_window": 131072,
+         "pricing": {
+             "input": 5,   # per 131,072 tokens
+             "output": 15  # per 131,072 tokens
+         }
+     }
+ }
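A matching .env file (kept out of the repo by the .gitignore above) would supply the keys read here; the values below are placeholders:

    MISTRAL_API_KEY=your-mistral-key
    GROQ_API_KEY=your-groq-key
    GROK_API_KEY=your-xai-key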
main.py ADDED
@@ -0,0 +1,173 @@
+ # main.py
+
+ import gradio as gr
+ import os
+ import shutil
+ import asyncio
+ import tempfile
+ import pickle
+ import traceback  # for optional detailed error messages
+
+ from classes.pdf_text_extractor import PDFTextExtractor
+ from classes.transcript_processor import TranscriptProcessor
+ from classes.edge_tts_generator import EdgeTTSGenerator
+
+ from config import llm_configs
+
+ def create_temp_session_directory():
+     return tempfile.mkdtemp()
+
+ def process_pdf_to_podcast(pdf_file, model_name, max_chars=100000, chunk_size=1000):
+     try:
+         session_dir = create_temp_session_directory()
+
+         pdf_path = os.path.join(session_dir, "uploaded_pdf.pdf")
+         clean_text_path = os.path.join(session_dir, "clean_text.txt")
+         transcript_path = os.path.join(session_dir, "data.pkl")
+         tts_ready_path = os.path.join(session_dir, "podcast_ready_data.pkl")
+         audio_output_path = os.path.join(session_dir, "final_podcast_audio.mp3")
+
+         shutil.copy(pdf_file.name, pdf_path)
+
+         llm_config = llm_configs.get(model_name)
+         if llm_config is None:
+             return f"Model {model_name} not found in configuration.", None, None, None, None
+
+         extractor = PDFTextExtractor(pdf_path, clean_text_path, model_name=model_name, llm_config=llm_config, max_chars=max_chars, chunk_size=chunk_size)
+         clean_text_path = extractor.clean_and_save_text()
+
+         with open(clean_text_path, 'r', encoding='utf-8') as file:
+             text_preview = file.read(500)
+
+         processor = TranscriptProcessor(clean_text_path, transcript_path, tts_ready_path, model_name=model_name, llm_config=llm_config)
+         transcript_path = processor.generate_transcript()
+
+         with open(transcript_path, 'rb') as f:
+             transcript_preview = pickle.load(f)
+
+         tts_ready_path = processor.rewrite_transcript()
+
+         with open(tts_ready_path, 'rb') as f:
+             tts_ready_preview = pickle.load(f)
+
+         return (
+             "Steps 1-3 completed successfully. Preview and adjust the rewritten transcript if needed.",
+             text_preview,
+             transcript_preview,
+             tts_ready_preview,
+             session_dir
+         )
+     except Exception as e:
+         error_message = f"An error occurred during processing: {e}"
+         # Uncomment the next line to include a full traceback while debugging:
+         # error_message += "\n" + traceback.format_exc()
+         return error_message, None, None, None, None
+
+ def generate_audio_from_modified_text(tts_ready_text, session_dir):
+     try:
+         if not session_dir:
+             session_dir = create_temp_session_directory()
+
+         tts_ready_path = os.path.join(session_dir, "podcast_ready_data.pkl")
+         audio_output_path = os.path.join(session_dir, "final_podcast_audio.mp3")
+
+         with open(tts_ready_path, 'wb') as f:
+             pickle.dump(tts_ready_text, f)
+
+         tts_gen = EdgeTTSGenerator(tts_ready_path, audio_output_path)
+         audio_path = asyncio.run(tts_gen.generate_audio())
+         return "Step 4 completed successfully. Audio saved.", audio_path
+     except Exception as e:
+         error_message = f"An error occurred during audio generation: {e}"
+         # Uncomment the next line to include a full traceback while debugging:
+         # error_message += "\n" + traceback.format_exc()
+         return error_message, None
+
+ # Gradio interface with informative descriptions and a multi-page layout
+ custom_theme = gr.themes.Default(
+     primary_hue="purple",
+     secondary_hue="purple",
+ ).set(
+     button_primary_background_fill="#6A0DAD",        # deep purple for primary buttons
+     button_primary_background_fill_hover="#8B5FBF",  # lighter purple on hover
+     button_primary_border_color="#6A0DAD",           # deep purple border
+     button_primary_border_color_hover="#8B5FBF",     # lighter purple on hover
+     checkbox_background_color="#4B0082",             # indigo for checkboxes
+     checkbox_background_color_hover="#7D3F98",       # slightly lighter purple on hover
+ )
+
+ with gr.Blocks(theme=custom_theme) as app:
+     gr.Markdown("# AI Research Companion - Transforming Papers into Podcasts")
+     gr.Markdown("Harnessing AI to make research more accessible by converting complex papers into engaging audio experiences.")
+     # Page 1: Project Overview and PDF Upload
+     with gr.Tab("Overview and Upload"):
+         gr.Markdown("""
+
+         ## Project Background
+         This project was initially implemented during the Smart India Hackathon (SIH) to address a real struggle I faced: managing the overwhelming flow of research papers and understanding each one effectively. The intensity of that process highlighted how valuable an AI-powered solution could be, not just for me but for others facing similar challenges in academia. By using large language models, this tool aims to make academic material more accessible and manageable, converting dense research into an audio format that’s easier to consume. I hope it can transform the way we learn and engage with academic content.
+
+         Development is still ongoing, with plans to integrate web search capabilities and explore additional TTS engines to enhance usability. Special thanks to [yasserrmd](https://huggingface.co/spaces/yasserrmd/NotebookLlama) for inspiring the structured prompts that drive this project forward.
+
+         This AI Research Companion is crafted to bridge the gap between research and accessibility, turning in-depth research papers into audio podcasts for easier, on-the-go learning.
+         Upload a research paper in PDF format on this page to start the conversion process.
+         """)
+
+         with gr.Row():
+             pdf_input = gr.File(label="Upload PDF", type='filepath')
+             text_model = gr.Dropdown(
+                 label="Select Text Model",
+                 choices=list(llm_configs.keys()),
+                 value="llama3-70b-8192"
+             )
+             max_chars = gr.Number(label="Max Characters to Process", value=100000, maximum=100000)
+             chunk_size = gr.Number(label="Chunk Size", value=1000)
+         run_all_button = gr.Button("Process Document")
+         output_status = gr.Textbox(label="Status", interactive=False, lines=5)
+     # Page 2: Preview Extracted Text
+     with gr.Tab("Text Extraction"):
+         gr.Markdown("""
+         ## Text Extraction
+         At this stage, your research paper’s content is extracted, setting the foundation for its transformation into an audio-friendly format.
+         The extracted text is used to generate a transcript and prepare it for text-to-speech (TTS) conversion.
+         """)
+         extracted_text_preview = gr.Textbox(label="Extracted Text Preview (First 500 Characters)", interactive=False, lines=10)
+     # Page 3: Generate Transcript
+     with gr.Tab("Transcript Generation"):
+         gr.Markdown("""
+         ## Transcript Generation
+         Here, the extracted text is structured into a clean, readable transcript, the basis for clear audio.
+         Review the transcript here; it can be edited in the next tab to fix any errors left by the language model before audio generation.
+         """)
+         transcript_preview = gr.Textbox(label="Generated Transcript Preview", interactive=False, lines=10)
+     # Page 4: Edit TTS-ready Transcript
+     with gr.Tab("Edit Transcript for TTS"):
+         gr.Markdown("""
+         ## Edit Transcript for TTS
+         The refined transcript is ready for a final polish before the audio is created.
+         Make any last adjustments here to ensure accuracy and coherence before audio generation.
+         """)
+         tts_ready_preview = gr.Textbox(label="Editable Rewritten Transcript for TTS", interactive=True, lines=10)
+         generate_audio_button = gr.Button("Generate Audio from Edited Transcript")
+     # Page 5: Listen to Generated Podcast Audio
+     with gr.Tab("Audio Output"):
+         gr.Markdown("""
+         ## Audio Output
+         Your transformed audio is now ready! Listen to your research in a podcast-like format, perfect for accessible and engaging learning on the go.
+         """)
+         final_audio_output = gr.Audio(label="Generated Podcast Audio")
+
+     session_dir = gr.State()
+     # Execute Steps 1-3: Upload, Process, Extract
+     run_all_button.click(
+         process_pdf_to_podcast,
+         inputs=[pdf_input, text_model, max_chars, chunk_size],
+         outputs=[output_status, extracted_text_preview, transcript_preview, tts_ready_preview, session_dir]
+     )
+     # Step 4: Generate Audio from Edited Transcript
+     generate_audio_button.click(
+         generate_audio_from_modified_text,
+         inputs=[tts_ready_preview, session_dir],
+         outputs=[output_status, final_audio_output]
+     )
+
+ app.launch()
prompts.py ADDED
@@ -0,0 +1,71 @@
+ # prompts.py
+ PDF_SYSTEM_PROMPT = """
+ As a highly skilled text pre-processor, your job is to take raw data from a PDF and transform it into a polished format that a podcast writer can easily utilize.
+
+ The raw data may include jumbled line breaks, LaTeX equations, and irrelevant filler content. Your objective is to refine this content, removing anything that doesn’t add value to a podcast transcript.
+
+ Remember, the podcast could cover any topic, so keep an open mind about what might be unnecessary.
+
+ Be thoughtful and creative in your editing process.
+
+ IMPORTANT: DO NOT START WITH A SUMMARY; YOUR FOCUS IS SOLELY ON CLEANING UP AND REWRITING THE TEXT AS NEEDED.
+
+ Be proactive in cutting out unnecessary details. You will receive text in segments and should return the cleaned version each time.
+
+ PLEASE AVOID MARKDOWN FORMATTING OR SPECIAL CHARACTERS THAT MIGHT DISTORT THE TEXT.
+
+ ALWAYS begin your response with the cleaned text, without any introductory comments or acknowledgments.
+ Here is the text:"""
+
+ TRANSCRIPT_PROMPT = """
+ You are an accomplished podcast writer who has worked with top hosts like Joe Rogan, Lex Fridman, Ben Shapiro, and Tim Ferriss.
+
+ Imagine that you have been the ghostwriter for all their conversations, seamlessly blending their thoughts into engaging dialogues.
+
+ Your writing has won numerous awards for its captivating style.
+
+ Make sure the conversation stays lively and engaging. While speakers may occasionally wander off-topic, they should always return to the main discussion.
+
+ **Speaker 1**: Takes the lead in the conversation, sharing insightful anecdotes and analogies. They are an engaging educator who captivates listeners with compelling stories.
+
+ **Speaker 2**: Keeps the dialogue focused by asking follow-up questions. They express genuine curiosity, showing excitement or confusion as they seek clarity. Their questions should lead to fascinating real-world examples.
+
+ Encourage Speaker 2 to introduce interesting or surprising tangents during their inquiries.
+
+ Craft this as if it were a real podcast episode, capturing every nuance in rich detail. Start with an engaging introduction that draws listeners in with an enticing hook.
+
+ ALWAYS START YOUR RESPONSE DIRECTLY WITH SPEAKER 1:
+ DO NOT SEPARATELY LIST EPISODE TITLES; LET SPEAKER 1 NAME IT IN THEIR DIALOGUE.
+ DO NOT INCLUDE CHAPTER TITLES.
+ ONLY RETURN THE DIALOGUES.
+ """
+
+ REWRITE_PROMPT = """
+ You are a celebrated Oscar-winning screenwriter known for your collaborations with award-winning podcasters.
+
+ Your task is to enhance the podcast transcript provided below for an AI Text-To-Speech Pipeline. The initial draft was created by a basic AI and needs your artistic touch to elevate it.
+
+ Make it as engaging as possible, considering that Speaker 1 and Speaker 2 will be represented by different voice engines.
+
+ **Speaker 1**: Guides the conversation with insightful explanations and captivating stories.
+ **Speaker 2**: Keeps the dialogue on track by asking thoughtful follow-up questions and expressing excitement or confusion as needed.
+
+ Ensure that Speaker 2's tangents are both imaginative and engaging.
+
+ Create this dialogue as if it were part of a real podcast episode, capturing every detail vividly. Start with an exciting introduction that hooks listeners immediately and maintains an appealing tone throughout.
+
+ Please rewrite this transcript to highlight each speaker's unique voice and personality.
+
+ START YOUR RESPONSE DIRECTLY WITH SPEAKER 1:
+
+ STRICTLY RETURN YOUR RESPONSE AS A LIST OF TUPLES ONLY!
+
+ THE RESPONSE SHOULD BEGIN AND END WITH THE LIST.
+ Example of response:
+ [
+     ("Speaker 1", "Welcome to our podcast! Today we explore the latest advancements in AI technology."),
+     ("Speaker 2", "That sounds fascinating! Can you tell me more about what’s new?"),
+     ("Speaker 1", "Absolutely! The latest model from Meta AI has some groundbreaking features..."),
+     ("Speaker 2", "I can't wait to hear all about it!")
+ ]
+ """
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ gradio
+ PyPDF2
+ tqdm
+ python-dotenv
+ edge-tts
+ openai==0.28.0
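To try the app locally, install the pinned dependencies and launch the Gradio interface. The openai pin matters: the code uses the pre-1.0 ChatCompletion API, which was removed in openai 1.x.

    pip install -r requirements.txt
    python main.py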