Commit c6fc13f · Tanmay Jain committed
Parent(s): 94bf4c8

init commit
Browse files
- .gitignore (+2 -0)
- classes/edge_tts_generator.py (+84 -0)
- classes/pdf_text_extractor.py (+145 -0)
- classes/transcript_processor.py (+129 -0)
- config.py (+54 -0)
- main.py (+173 -0)
- prompts.py (+71 -0)
- requirements.txt (+6 -0)
.gitignore
ADDED
@@ -0,0 +1,2 @@
.env
__pycache__
classes/edge_tts_generator.py
ADDED
@@ -0,0 +1,84 @@
# classes/edge_tts_generator.py

import asyncio
import pickle
import re
from tqdm import tqdm
import ast
import edge_tts

class EdgeTTSGenerator:
    """
    A class to generate podcast-style audio from a transcript using edge-tts.
    """
    def __init__(self, transcript_file_path, output_audio_path):
        """
        Initialize the TTS generator with the path to the rewritten transcript file.

        Args:
            transcript_file_path (str): Path to the file containing the rewritten transcript.
            output_audio_path (str): Path to save the generated audio file.
        """
        self.transcript_file_path = transcript_file_path
        self.output_audio_path = output_audio_path

        # Speaker descriptions for edge-tts voices
        self.speaker1_voice = "en-US-AriaNeural"
        self.speaker2_voice = "en-US-GuyNeural"

    def load_transcript(self):
        """
        Loads the rewritten transcript from the specified file.

        Returns:
            list: The content of the transcript as a list of tuples (speaker, text).
        """
        # The pickle stores the transcript as the *string* form of a Python
        # list of tuples (see TranscriptProcessor.rewrite_transcript), so it
        # is parsed with ast.literal_eval after unpickling.
        with open(self.transcript_file_path, 'rb') as f:
            return ast.literal_eval(pickle.load(f))

    async def generate_audio_segment(self, text, voice_name):
        """
        Generate audio for a given text using edge-tts.

        Args:
            text (str): Text to be synthesized.
            voice_name (str): The voice name to use for TTS.

        Returns:
            bytes: Generated audio data.
        """
        communicator = edge_tts.Communicate(text, voice_name)
        audio_bytes = b""
        async for chunk in communicator.stream():
            if "data" in chunk:  # Only audio chunks carry a 'data' payload
                audio_bytes += chunk["data"]  # Concatenate only the audio data
        return audio_bytes

    def save_audio(self, audio_data):
        """
        Save the combined audio data to an output file.

        Args:
            audio_data (list): List of bytes containing the audio data for each segment.
        """
        combined_audio = b"".join(audio_data)
        with open(self.output_audio_path, "wb") as f:
            f.write(combined_audio)

    async def generate_audio(self):
        """
        Converts the transcript into audio and saves it to a file.

        Returns:
            str: Path to the saved audio file.
        """
        transcript = self.load_transcript()
        audio_data = []

        for speaker, text in tqdm(transcript, desc="Generating podcast segments", unit="segment"):
            voice = self.speaker1_voice if speaker == "Speaker 1" else self.speaker2_voice
            segment_audio = await self.generate_audio_segment(text, voice)
            audio_data.append(segment_audio)

        self.save_audio(audio_data)
        return self.output_audio_path
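For context when reading the class above, here is a minimal, hypothetical usage sketch; the file names and transcript content are invented for illustration and are not part of this commit:

# Usage sketch for EdgeTTSGenerator; paths and transcript are hypothetical.
import asyncio
import pickle

from classes.edge_tts_generator import EdgeTTSGenerator

# The generator expects a pickled *string* that parses as a list of
# (speaker, text) tuples, matching what TranscriptProcessor saves.
transcript = '[("Speaker 1", "Welcome to the show!"), ("Speaker 2", "Glad to be here.")]'
with open("podcast_ready_data.pkl", "wb") as f:
    pickle.dump(transcript, f)

tts = EdgeTTSGenerator("podcast_ready_data.pkl", "demo_audio.mp3")
print(asyncio.run(tts.generate_audio()))  # prints the output audio path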
classes/pdf_text_extractor.py
ADDED
@@ -0,0 +1,145 @@
# classes/pdf_text_extractor.py

import os
import openai
from PyPDF2 import PdfReader
import re
from tqdm import tqdm

from prompts import PDF_SYSTEM_PROMPT
from config import llm_configs
import time

class PDFTextExtractor:
    """
    A class to handle PDF text extraction and preprocessing for podcast preparation.
    """
    def __init__(self, pdf_path, output_path, model_name="llama3-8b-8192", llm_config=None, max_chars=100000, chunk_size=1000):
        """
        Initialize the PDFTextExtractor with paths and model details.

        Args:
            pdf_path (str): Path to the PDF file.
            output_path (str): Path to save the cleaned text file.
            model_name (str): Name of the model to use for text processing.
            llm_config (dict): Configuration for the LLM.
            max_chars (int): Maximum number of characters to process from the PDF.
            chunk_size (int): Size of text chunks to process at a time.
        """
        self.pdf_path = pdf_path
        self.output_path = output_path
        self.max_chars = max_chars
        self.chunk_size = chunk_size
        self.model_name = model_name
        self.llm_config = llm_config or llm_configs.get(model_name)

        if self.llm_config is None:
            raise ValueError(f"Model configuration for {model_name} not found in llm_configs.")

        # System prompt for text processing
        self.system_prompt = PDF_SYSTEM_PROMPT

    def create_client(self):
        openai.api_key = self.llm_config["api_key"]
        openai.api_base = self.llm_config["base_url"]
        return openai

    def validate_pdf(self):
        """Check if the file exists and is a valid PDF."""
        if not os.path.exists(self.pdf_path):
            print(f"Error: File not found at path: {self.pdf_path}")
            return False
        if not self.pdf_path.lower().endswith('.pdf'):
            print("Error: File is not a PDF")
            return False
        return True

    def extract_text(self):
        """Extract text from the PDF, limited by max_chars."""
        if not self.validate_pdf():
            return None

        with open(self.pdf_path, 'rb') as file:
            pdf_reader = PdfReader(file)
            num_pages = len(pdf_reader.pages)
            print(f"Processing PDF with {num_pages} pages...")

            extracted_text = []
            total_chars = 0

            for page_num in range(num_pages):
                page = pdf_reader.pages[page_num]
                text = page.extract_text() or ""

                if total_chars + len(text) > self.max_chars:
                    remaining_chars = self.max_chars - total_chars
                    extracted_text.append(text[:remaining_chars])
                    print(f"Reached {self.max_chars} character limit at page {page_num + 1}")
                    break

                extracted_text.append(text)
                total_chars += len(text)
                print(f"Processed page {page_num + 1}/{num_pages}")

            final_text = '\n'.join(extracted_text)
            print(f"Extraction complete! Total characters: {len(final_text)}")
            return final_text

    def create_word_bounded_chunks(self, text):
        """Split text into chunks around the target size."""
        words = text.split()
        chunks = []
        current_chunk = []
        current_length = 0

        for word in words:
            word_length = len(word) + 1  # +1 for the space
            if current_length + word_length > self.chunk_size and current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk = [word]
                current_length = word_length
            else:
                current_chunk.append(word)
                current_length += word_length

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks

    def process_chunk(self, text_chunk):
        """Process a text chunk with the model and return the cleaned text."""
        conversation = [
            {"role": "system", "content": self.system_prompt},
            {"role": "user", "content": text_chunk}
        ]
        client = self.create_client()

        response = client.ChatCompletion.create(
            model=self.model_name,
            messages=conversation,
        )

        processed_text = response.choices[0].message.content

        return processed_text

    def clean_and_save_text(self):
        """Extract, clean, and save processed text to a file."""
        extracted_text = self.extract_text()
        if not extracted_text:
            return None

        chunks = self.create_word_bounded_chunks(extracted_text)
        processed_text = ""

        with open(self.output_path, 'w', encoding='utf-8') as out_file:
            for chunk_num, chunk in enumerate(tqdm(chunks, desc="Processing chunks")):
                processed_chunk = self.process_chunk(chunk)
                processed_text += processed_chunk + "\n"
                out_file.write(processed_chunk + "\n")
                out_file.flush()
                time.sleep(3)  # To avoid rate limiting

        print(f"\nExtracted and cleaned text has been saved to {self.output_path}")
        return self.output_path
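A standalone usage sketch for the extractor above; "paper.pdf" and the parameter values are hypothetical, and the chosen Groq model requires GROQ_API_KEY in the environment:

# Standalone usage sketch for PDFTextExtractor (hypothetical inputs).
from classes.pdf_text_extractor import PDFTextExtractor

extractor = PDFTextExtractor(
    pdf_path="paper.pdf",
    output_path="clean_text.txt",
    model_name="llama3-70b-8192",  # any key from config.llm_configs
    max_chars=20000,               # cap extraction for a quick test
    chunk_size=1000,
)
output = extractor.clean_and_save_text()  # writes cleaned chunks as they finish
print(output)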
classes/transcript_processor.py
ADDED
@@ -0,0 +1,129 @@
# classes/transcript_processor.py

import os
import openai
import pickle
import re

from prompts import TRANSCRIPT_PROMPT, REWRITE_PROMPT
from config import llm_configs

class TranscriptProcessor:
    """
    A class to generate and rewrite podcast-style transcripts using a specified language model.
    """

    def __init__(self, text_file_path, transcript_output_path, tts_output_path, model_name="llama3-70b-8192", llm_config=None):
        """
        Initialize with the path to the cleaned text file and the model name.

        Args:
            text_file_path (str): Path to the file containing cleaned PDF text.
            transcript_output_path (str): Path to save the generated transcript.
            tts_output_path (str): Path to save the rewritten transcript for TTS.
            model_name (str): Name of the language model to use.
            llm_config (dict): Configuration for the LLM.
        """
        self.text_file_path = text_file_path
        self.transcript_output_path = transcript_output_path
        self.tts_output_path = tts_output_path
        self.model_name = model_name
        self.llm_config = llm_config or llm_configs.get(model_name)

        if self.llm_config is None:
            raise ValueError(f"Model configuration for {model_name} not found in llm_configs.")

        self.transcript_prompt = TRANSCRIPT_PROMPT
        self.rewrite_prompt = REWRITE_PROMPT

    def create_client(self):
        openai.api_key = self.llm_config["api_key"]
        openai.api_base = self.llm_config["base_url"]
        return openai

    def load_text(self):
        """
        Reads the cleaned text file and returns its content.

        Returns:
            str: Content of the cleaned text file.
        """
        encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
        for encoding in encodings:
            try:
                with open(self.text_file_path, 'r', encoding=encoding) as file:
                    content = file.read()
                    print(f"Successfully read file using {encoding} encoding.")
                    return content
            except (UnicodeDecodeError, FileNotFoundError):
                continue
        print(f"Error: Could not decode file '{self.text_file_path}' with any common encoding.")
        return None

    def generate_transcript(self):
        """
        Generates a podcast-style transcript and saves it as a pickled file.

        Returns:
            str: Path to the file where the transcript is saved.
        """
        input_text = self.load_text()
        if input_text is None:
            return None

        messages = [
            {"role": "system", "content": self.transcript_prompt},
            {"role": "user", "content": input_text}
        ]

        client = self.create_client()

        response = client.ChatCompletion.create(
            model=self.model_name,
            messages=messages,
        )

        transcript = response.choices[0].message.content

        # Save the transcript as a pickle file
        with open(self.transcript_output_path, 'wb') as f:
            pickle.dump(transcript, f)

        return self.transcript_output_path

    def extract_tuple(self, text):
        # Pull the bracketed list-of-tuples block out of the model's reply.
        match = re.search(r'\[.*\]', text, re.DOTALL)
        if match:
            return match.group(0)
        return None

    def rewrite_transcript(self):
        """
        Refines the transcript for TTS, adding expressive elements and saving as a list of tuples.

        Returns:
            str: Path to the file where the TTS-ready transcript is saved.
        """
        # Load the initial generated transcript
        with open(self.transcript_output_path, 'rb') as file:
            input_transcript = pickle.load(file)

        messages = [
            {"role": "system", "content": self.rewrite_prompt},
            {"role": "user", "content": input_transcript}
        ]

        client = self.create_client()

        response = client.ChatCompletion.create(
            model=self.model_name,
            messages=messages,
        )

        rewritten_transcript = self.extract_tuple(response.choices[0].message.content)

        # Save the rewritten transcript as a pickle file. This is the *string*
        # form of the list; EdgeTTSGenerator parses it with ast.literal_eval.
        with open(self.tts_output_path, 'wb') as f:
            pickle.dump(rewritten_transcript, f)

        return self.tts_output_path
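A standalone usage sketch for the two-step flow above; the file names are hypothetical, and the default Groq model requires GROQ_API_KEY:

# Standalone usage sketch for TranscriptProcessor (hypothetical file names).
from classes.transcript_processor import TranscriptProcessor

processor = TranscriptProcessor(
    text_file_path="clean_text.txt",           # output of PDFTextExtractor
    transcript_output_path="data.pkl",         # raw transcript (pickled string)
    tts_output_path="podcast_ready_data.pkl",  # TTS-ready list-of-tuples string
    model_name="llama3-70b-8192",
)
processor.generate_transcript()  # step 1: dialogue draft
processor.rewrite_transcript()   # step 2: TTS-ready rewrite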
config.py
ADDED
@@ -0,0 +1,54 @@
# config.py

import os

llm_configs = {
    # Mistral Models
    "mistral-large-latest": {
        "base_url": "https://api.mistral.ai/v1",
        "api_key": os.environ.get("MISTRAL_API_KEY"),
        "provider": "mistral",
        "version": "mistral-large-2407"
    },
    "mistral-small-latest": {
        "base_url": "https://api.mistral.ai/v1",
        "api_key": os.environ.get("MISTRAL_API_KEY"),
        "provider": "mistral",
        "version": "mistral-small-2409"
    },
    "open-mistral-nemo": {
        "base_url": "https://api.mistral.ai/v1",
        "api_key": os.environ.get("MISTRAL_API_KEY"),
        "provider": "mistral",
        "version": "open-mistral-nemo-2407"
    },

    # Groq Models
    "llama-3.1-70b-versatile": {
        "base_url": "https://api.groq.com/openai/v1",
        "api_key": os.environ.get("GROQ_API_KEY"),
        "provider": "groq"
    },
    "mixtral-8x7b-32768": {
        "base_url": "https://api.groq.com/openai/v1",
        "api_key": os.environ.get("GROQ_API_KEY"),
        "provider": "groq"
    },
    "llama3-70b-8192": {
        "base_url": "https://api.groq.com/openai/v1",
        "api_key": os.environ.get("GROQ_API_KEY"),
        "provider": "groq"
    },

    # Grok Models
    "grok-beta": {
        "base_url": "https://api.x.ai/v1",
        "api_key": os.environ.get("GROK_API_KEY"),
        "provider": "grok",
        "context_window": 131072,
        "pricing": {
            "input": 5,   # per 131,072 tokens
            "output": 15  # per 131,072 tokens
        }
    }
}
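config.py reads every key from the environment, and requirements.txt (below) pulls in python-dotenv, so a local .env file along these lines is the natural companion. The values are placeholders; note that nothing in this commit calls load_dotenv() itself, so the variables must be exported or injected by the hosting environment (Hugging Face Spaces secrets, for example):

# .env (placeholder values, not part of this commit)
MISTRAL_API_KEY=<your Mistral API key>
GROQ_API_KEY=<your Groq API key>
GROK_API_KEY=<your xAI API key>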
main.py
ADDED
@@ -0,0 +1,173 @@
# main.py

import gradio as gr
import os
import shutil
import asyncio
import tempfile
import pickle
import traceback  # Import traceback for detailed error messages

from classes.pdf_text_extractor import PDFTextExtractor
from classes.transcript_processor import TranscriptProcessor
from classes.edge_tts_generator import EdgeTTSGenerator

from config import llm_configs

def create_temp_session_directory():
    return tempfile.mkdtemp()

def process_pdf_to_podcast(pdf_file, model_name, max_chars=100000, chunk_size=1000):
    try:
        session_dir = create_temp_session_directory()

        pdf_path = os.path.join(session_dir, "uploaded_pdf.pdf")
        clean_text_path = os.path.join(session_dir, "clean_text.txt")
        transcript_path = os.path.join(session_dir, "data.pkl")
        tts_ready_path = os.path.join(session_dir, "podcast_ready_data.pkl")
        audio_output_path = os.path.join(session_dir, "final_podcast_audio.mp3")

        # gr.File with type='filepath' may deliver either a plain path string
        # or a file wrapper depending on the Gradio version, so handle both.
        source_path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file
        shutil.copy(source_path, pdf_path)

        llm_config = llm_configs.get(model_name)
        if llm_config is None:
            return f"Model {model_name} not found in configuration.", None, None, None, None

        extractor = PDFTextExtractor(pdf_path, clean_text_path, model_name=model_name, llm_config=llm_config, max_chars=max_chars, chunk_size=chunk_size)
        clean_text_path = extractor.clean_and_save_text()

        with open(clean_text_path, 'r', encoding='utf-8') as file:
            text_preview = file.read(500)

        processor = TranscriptProcessor(clean_text_path, transcript_path, tts_ready_path, model_name=model_name, llm_config=llm_config)
        transcript_path = processor.generate_transcript()

        with open(transcript_path, 'rb') as f:
            transcript_preview = pickle.load(f)

        tts_ready_path = processor.rewrite_transcript()

        with open(tts_ready_path, 'rb') as f:
            tts_ready_preview = pickle.load(f)

        return (
            "Steps 1-3 completed successfully. Preview and adjust the rewritten transcript if needed.",
            text_preview,
            transcript_preview,
            tts_ready_preview,
            session_dir
        )
    except Exception as e:
        error_message = f"An error occurred during processing: {str(e)}"
        # Optionally, include traceback for debugging (comment out in production)
        # error_message += "\n" + traceback.format_exc()
        return error_message, None, None, None, None

def generate_audio_from_modified_text(tts_ready_text, session_dir):
    try:
        if not session_dir:
            session_dir = create_temp_session_directory()

        tts_ready_path = os.path.join(session_dir, "podcast_ready_data.pkl")
        audio_output_path = os.path.join(session_dir, "final_podcast_audio.mp3")

        # The edited textbox content is a string; EdgeTTSGenerator parses it
        # back into a list of (speaker, text) tuples when loading.
        with open(tts_ready_path, 'wb') as f:
            pickle.dump(tts_ready_text, f)

        tts_gen = EdgeTTSGenerator(tts_ready_path, audio_output_path)
        audio_path = asyncio.run(tts_gen.generate_audio())
        return "Step 4 completed successfully. Audio saved.", audio_path
    except Exception as e:
        error_message = f"An error occurred during audio generation: {str(e)}"
        # Optionally, include traceback for debugging (comment out in production)
        # error_message += "\n" + traceback.format_exc()
        return error_message, None

# Gradio Interface with Informative Descriptions and Multi-page Layout
custom_theme = gr.themes.Default(
    primary_hue="purple",
    secondary_hue="purple",
).set(
    button_primary_background_fill="#6A0DAD",        # Deep purple for primary button
    button_primary_background_fill_hover="#8B5FBF",  # Lighter purple on hover
    button_primary_border_color="#6A0DAD",           # Deep purple for border color
    button_primary_border_color_hover="#8B5FBF",     # Lighter purple on hover
    checkbox_background_color="#4B0082",             # Indigo for checkboxes
    checkbox_background_color_hover="#7D3F98",       # Slightly lighter purple on hover
)

with gr.Blocks(theme=custom_theme) as app:
    gr.Markdown("# AI Research Companion - Transforming Papers into Podcasts")
    gr.Markdown("Harnessing AI to make research more accessible and effortless by converting complex papers into engaging audio experiences.")

    # Page 1: Project Overview and PDF Upload
    with gr.Tab("Overview and Upload"):
        gr.Markdown("""
        ## Project Background
        This project was initially implemented during the Smart India Hackathon (SIH) to address a real struggle I faced: managing the overwhelming flow of research papers and understanding each one effectively. The intensity of this process highlighted how valuable an AI-powered solution could be, not just for me but for others facing similar challenges in academia. By using large language models, this tool aims to make academic material more accessible and manageable, converting dense research into an audio format that’s easier to consume. With the power of AI, I hope this tool can transform the way we learn and engage with academic content.

        Development is still ongoing, with plans to integrate web search capabilities and explore additional TTS engines to enhance usability. Special thanks to [yasserrmd](https://huggingface.co/spaces/yasserrmd/NotebookLlama) for inspiring the structured prompts that drive this project forward.

        This AI Research Companion is crafted to bridge the gap between research and accessibility, turning in-depth research papers into audio podcasts for easier, on-the-go learning.
        This page allows users to upload their research papers in PDF format to initiate the conversion process.
        """)

        with gr.Row():
            pdf_input = gr.File(label="Upload PDF", type='filepath')
            text_model = gr.Dropdown(
                label="Select Text Model",
                choices=list(llm_configs.keys()),
                value="llama3-70b-8192"
            )
            max_chars = gr.Number(label="Max Characters to Process", value=100000, maximum=100000)
            chunk_size = gr.Number(label="Chunk Size", value=1000)
            run_all_button = gr.Button("Process Document")
            output_status = gr.Textbox(label="Status", interactive=False, lines=5)

    # Page 2: Preview Extracted Text
    with gr.Tab("Text Extraction"):
        gr.Markdown("""
        ## Text Extraction
        At this stage, your research paper’s content is carefully extracted, setting the foundation for its transformation into an audio-friendly format.
        This extracted text will be used to generate a transcript and prepare it for text-to-speech (TTS) conversion.
        """)
        extracted_text_preview = gr.Textbox(label="Extracted Text Preview (First 500 Characters)", interactive=False, lines=10)

    # Page 3: Generate Transcript
    with gr.Tab("Transcript Generation"):
        gr.Markdown("""
        ## Transcript Generation
        Here, the extracted text is structured into a clean, readable transcript, perfect for creating clear audio and adjusting any finer details.
        This transcript can be modified before proceeding to audio generation, and any remaining errors left by the large language model can be corrected.
        """)
        transcript_preview = gr.Textbox(label="Generated Transcript Preview", interactive=False, lines=10)

    # Page 4: Edit TTS-ready Transcript
    with gr.Tab("Edit Transcript for TTS"):
        gr.Markdown("""
        ## Edit Transcript for TTS
        This refined transcript is ready for a final polish, ensuring it’s clear and precise before creating an audio experience.
        Users can make final adjustments to the text here to ensure accuracy and coherence before audio generation.
        """)
        tts_ready_preview = gr.Textbox(label="Editable Rewritten Transcript for TTS", interactive=True, lines=10)
        generate_audio_button = gr.Button("Generate Audio from Edited Transcript")

    # Page 5: Listen to Generated Podcast Audio
    with gr.Tab("Audio Output"):
        gr.Markdown("""
        ## Audio Output
        Your transformed audio is now ready! Listen to your research in a podcast-like format, perfect for accessible and engaging learning on-the-go.
        """)
        final_audio_output = gr.Audio(label="Generated Podcast Audio")

    session_dir = gr.State()

    # Execute Steps 1-3: Upload, Process, Extract
    run_all_button.click(
        process_pdf_to_podcast,
        inputs=[pdf_input, text_model, max_chars, chunk_size],
        outputs=[output_status, extracted_text_preview, transcript_preview, tts_ready_preview, session_dir]
    )
    # Step 4: Generate Audio from Edited Transcript
    generate_audio_button.click(
        generate_audio_from_modified_text,
        inputs=[tts_ready_preview, session_dir],
        outputs=[output_status, final_audio_output]
    )

app.launch()
prompts.py
ADDED
@@ -0,0 +1,71 @@
# prompts.py

PDF_SYSTEM_PROMPT = """
As a highly skilled text pre-processor, your job is to take raw data from a PDF and transform it into a polished format that a podcast writer can easily utilize.

The raw data may include jumbled line breaks, LaTeX equations, and irrelevant filler content. Your objective is to refine this content, removing anything that doesn’t add value to a podcast transcript.

Remember, the podcast could cover any topic, so keep an open mind about what might be unnecessary.

Be thoughtful and creative in your editing process.

IMPORTANT: DO NOT START WITH A SUMMARY; YOUR FOCUS IS SOLELY ON CLEANING UP AND REWRITING THE TEXT AS NEEDED.

Be proactive in cutting out unnecessary details. You will receive text in segments and should return the cleaned version each time.

PLEASE AVOID MARKDOWN FORMATTING OR SPECIAL CHARACTERS THAT MIGHT DISTORT THE TEXT.

ALWAYS begin your response with the cleaned text, without any introductory comments or acknowledgments.
Here is the text:"""

TRANSCRIPT_PROMPT = """
You are an accomplished podcast writer who has worked with top hosts like Joe Rogan, Lex Fridman, Ben Shapiro, and Tim Ferriss.

Imagine that you have been the ghostwriter for all their conversations, seamlessly blending their thoughts into engaging dialogues.

Your writing has won numerous awards for its captivating style.

Make sure the conversation stays lively and engaging. While speakers may occasionally wander off-topic, they should always return to the main discussion.

**Speaker 1**: Takes the lead in the conversation, sharing insightful anecdotes and analogies. They are an engaging educator who captivates listeners with compelling stories.

**Speaker 2**: Keeps the dialogue focused by asking follow-up questions. They express genuine curiosity, showing excitement or confusion as they seek clarity. Their questions should lead to fascinating real-world examples.

Encourage Speaker 2 to introduce interesting or surprising tangents during their inquiries.

Craft this as if it were a real podcast episode, capturing every nuance in rich detail. Start with an engaging introduction that draws listeners in with an enticing hook.

ALWAYS START YOUR RESPONSE DIRECTLY WITH SPEAKER 1:
DO NOT SEPARATELY LIST EPISODE TITLES; LET SPEAKER 1 NAME IT IN THEIR DIALOGUE.
DO NOT INCLUDE CHAPTER TITLES.
ONLY RETURN THE DIALOGUES.
"""

REWRITE_PROMPT = """
You are a celebrated Oscar-winning screenwriter known for your collaborations with award-winning podcasters.

Your task is to enhance the podcast transcript provided below for an AI Text-To-Speech Pipeline. The initial draft was created by a basic AI and needs your artistic touch to elevate it.

Make it as engaging as possible, considering that Speaker 1 and Speaker 2 will be represented by different voice engines.

**Speaker 1**: Guides the conversation with insightful explanations and captivating stories.
**Speaker 2**: Keeps the dialogue on track by asking thoughtful follow-up questions and expressing excitement or confusion as needed.

Ensure that Speaker 2's tangents are both imaginative and engaging.

Create this dialogue as if it were part of a real podcast episode, capturing every detail vividly. Start with an exciting introduction that hooks listeners immediately and maintains an appealing tone throughout.

Please rewrite this transcript to highlight each speaker's unique voice and personality.

START YOUR RESPONSE DIRECTLY WITH SPEAKER 1:

STRICTLY RETURN YOUR RESPONSE AS A LIST OF TUPLES ONLY!

THE RESPONSE SHOULD BEGIN AND END WITH THE LIST.
Example of response:
[
    ("Speaker 1", "Welcome to our podcast! Today we explore the latest advancements in AI technology."),
    ("Speaker 2", "That sounds fascinating! Can you tell me more about what’s new?"),
    ("Speaker 1", "Absolutely! The latest model from Meta AI has some groundbreaking features..."),
    ("Speaker 2", "I can't wait to hear all about it!")
]
"""
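REWRITE_PROMPT asks the model for a literal Python list of tuples, which is why TranscriptProcessor.extract_tuple pulls out the bracketed span and EdgeTTSGenerator parses it with ast.literal_eval. A compact sketch of that round trip, using an invented model reply for illustration:

# Round-trip sketch for the REWRITE_PROMPT contract; the model reply below
# is an invented example, not real model output.
import ast
import re

model_reply = 'Sure! [("Speaker 1", "Welcome!"), ("Speaker 2", "Tell me more.")]'

match = re.search(r'\[.*\]', model_reply, re.DOTALL)  # same regex as extract_tuple
segments = ast.literal_eval(match.group(0))           # -> list of (speaker, text)
for speaker, text in segments:
    print(f"{speaker}: {text}")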
requirements.txt
ADDED
@@ -0,0 +1,6 @@
gradio
PyPDF2
tqdm
python-dotenv
edge-tts
openai==0.28.0
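Taken together, these dependencies suggest a simple local setup: install with `pip install -r requirements.txt`, export the API key for whichever provider in config.py you intend to use, then run `python main.py` to launch the Gradio app. The launch host and port are Gradio defaults; this commit does not configure them.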