Spaces:
Sleeping
Sleeping
poemsforaphrodite
committed on
Commit
•
abcb496
1
Parent(s):
81c2749
Update app.py
Browse files
app.py
CHANGED
@@ -7,6 +7,7 @@ from pinecone import Pinecone, ServerlessSpec
|
|
7 |
from openai import OpenAI
|
8 |
import uuid
|
9 |
import re
|
|
|
10 |
|
11 |
# Load environment variables from .env file
|
12 |
load_dotenv()
|
@@ -46,6 +47,7 @@ else:
|
|
46 |
index = pc.Index(INDEX_NAME)
|
47 |
|
48 |
def transcribe_pdf(pdf_file):
|
|
|
49 |
# Read PDF and extract text
|
50 |
pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
|
51 |
text = ""
|
@@ -54,28 +56,33 @@ def transcribe_pdf(pdf_file):
|
|
54 |
if page_text:
|
55 |
text += page_text + "\n"
|
56 |
|
|
|
|
|
57 |
# Dynamic Chunking
|
58 |
chunks = dynamic_chunking(text, max_tokens=500, overlap=50)
|
|
|
59 |
|
60 |
-
#
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
|
69 |
-
|
70 |
-
index.upsert(vectors=upsert_data)
|
71 |
-
|
72 |
-
return f"Successfully upserted {len(chunks)} chunks to Pinecone index '{INDEX_NAME}'."
|
73 |
|
74 |
def dynamic_chunking(text, max_tokens=500, overlap=50):
|
75 |
-
""
|
76 |
-
Splits text into chunks with a maximum number of tokens and a specified overlap.
|
77 |
-
"""
|
78 |
-
# Simple tokenization based on whitespace
|
79 |
tokens = re.findall(r'\S+', text)
|
80 |
chunks = []
|
81 |
start = 0
|
@@ -84,26 +91,50 @@ def dynamic_chunking(text, max_tokens=500, overlap=50):
|
|
84 |
chunk = ' '.join(tokens[start:end])
|
85 |
chunks.append(chunk)
|
86 |
start += max_tokens - overlap
|
|
|
87 |
return chunks
|
88 |
|
89 |
-
def
|
90 |
-
""
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
|
108 |
if __name__ == "__main__":
|
109 |
-
|
|
|
7 |
from openai import OpenAI
|
8 |
import uuid
|
9 |
import re
|
10 |
+
import time
|
11 |
|
12 |
# Load environment variables from .env file
|
13 |
load_dotenv()
|
|
|
47 |
index = pc.Index(INDEX_NAME)
|
48 |
|
49 |
def transcribe_pdf(pdf_file):
|
50 |
+
print("Starting PDF transcription...")
|
51 |
# Read PDF and extract text
|
52 |
pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
|
53 |
text = ""
|
|
|
56 |
if page_text:
|
57 |
text += page_text + "\n"
|
58 |
|
59 |
+
print(f"Extracted {len(text)} characters from PDF.")
|
60 |
+
|
61 |
# Dynamic Chunking
|
62 |
chunks = dynamic_chunking(text, max_tokens=500, overlap=50)
|
63 |
+
print(f"Created {len(chunks)} chunks from the extracted text.")
|
64 |
|
65 |
+
# Process chunks one by one
|
66 |
+
for i, chunk in enumerate(chunks):
|
67 |
+
print(f"Processing chunk {i+1}/{len(chunks)}...")
|
68 |
+
|
69 |
+
# Generate embedding for the chunk
|
70 |
+
embedding = get_embedding(chunk)
|
71 |
+
|
72 |
+
# Prepare upsert data
|
73 |
+
upsert_data = [(str(uuid.uuid4()), embedding, {"text": chunk})]
|
74 |
+
|
75 |
+
# Upsert to Pinecone
|
76 |
+
print(f"Upserting vector to Pinecone index '{INDEX_NAME}'...")
|
77 |
+
index.upsert(vectors=upsert_data)
|
78 |
+
|
79 |
+
# Optional: Add a small delay to avoid potential rate limits
|
80 |
+
time.sleep(0.5)
|
81 |
|
82 |
+
return f"Successfully processed and upserted {len(chunks)} chunks to Pinecone index '{INDEX_NAME}'."
|
|
|
|
|
|
|
83 |
|
84 |
def dynamic_chunking(text, max_tokens=500, overlap=50):
|
85 |
+
print(f"Starting dynamic chunking with max_tokens={max_tokens} and overlap={overlap}...")
|
|
|
|
|
|
|
86 |
tokens = re.findall(r'\S+', text)
|
87 |
chunks = []
|
88 |
start = 0
|
|
|
91 |
chunk = ' '.join(tokens[start:end])
|
92 |
chunks.append(chunk)
|
93 |
start += max_tokens - overlap
|
94 |
+
print(f"Dynamic chunking complete. Created {len(chunks)} chunks.")
|
95 |
return chunks
|
96 |
|
97 |
+
def get_embedding(chunk):
    """Generate an embedding vector for *chunk* via the OpenAI embeddings API.

    Args:
        chunk: A single text chunk of the source document.

    Returns:
        The embedding vector (list of floats) returned by the API.

    Raises:
        Exception: Any error raised by the embeddings call is logged and
            re-raised unchanged for the caller to handle.
    """
    print("Generating embedding for chunk...")
    try:
        # `client` and EMBEDDING_MODEL are module-level names defined earlier
        # in the file (outside this view).
        response = client.embeddings.create(
            input=chunk,  # the API accepts a plain string directly
            model=EMBEDDING_MODEL,
        )
        embedding = response.data[0].embedding
        print("Successfully generated embedding.")
        return embedding
    except Exception as e:
        print(f"Error during embedding generation: {str(e)}")
        # Bare `raise` preserves the original traceback; the previous
        # `raise e` re-raised from this frame. Also removed the two
        # debug-leftover `print(chunk)` calls that dumped the full chunk
        # text to stdout twice per embedding.
        raise
112 |
+
|
113 |
+
def clear_database():
    """Delete every vector from the Pinecone index and report the outcome.

    Returns:
        A human-readable status string: success message, or the error text
        if the deletion failed (errors are reported, not raised).
    """
    print("Clearing the Pinecone index...")
    try:
        # `index` is the module-level Pinecone index handle defined earlier.
        index.delete(delete_all=True)
    except Exception as e:
        # Best-effort: surface the failure as the returned status rather
        # than propagating it to the Gradio callback.
        print(f"Error clearing the Pinecone index: {str(e)}")
        return f"Error clearing the Pinecone index: {str(e)}"
    return "Successfully cleared all vectors from the Pinecone index."
121 |
|
122 |
+
# Create the Gradio app using Blocks
# Two-tab UI: one tab uploads a PDF and runs the transcribe/chunk/upsert
# pipeline; the other wipes the Pinecone index.
with gr.Blocks() as app:
    gr.Markdown("# PDF Transcription and Pinecone Database Management")

    with gr.Tab("Transcribe PDF"):
        gr.Markdown("Upload a PDF file to extract its text content, chunk it dynamically, and upsert the chunks to a Pinecone index named 'ghana'.")
        # type="binary" hands transcribe_pdf the raw PDF bytes.
        pdf_input = gr.File(label="Upload PDF", type="binary")
        transcribe_button = gr.Button("Transcribe and Upsert")
        transcription_output = gr.Textbox(label="Transcription Result")
        # Wire the button to the pipeline; its status string lands in the textbox.
        transcribe_button.click(fn=transcribe_pdf, inputs=pdf_input, outputs=transcription_output)

    with gr.Tab("Clear Database"):
        gr.Markdown("Click the button to clear all vectors from the Pinecone index.")
        clear_button = gr.Button("Clear Database")
        clear_output = gr.Textbox(label="Clear Database Result")
        # clear_database takes no inputs; only its status string is displayed.
        clear_button.click(fn=clear_database, outputs=clear_output)
138 |
|
139 |
# Launch the Gradio server only when run as a script (not on import).
if __name__ == "__main__":
    app.launch()
|