poemsforaphrodite committed on
Commit
abcb496
1 Parent(s): 81c2749

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -34
app.py CHANGED
@@ -7,6 +7,7 @@ from pinecone import Pinecone, ServerlessSpec
7
  from openai import OpenAI
8
  import uuid
9
  import re
 
10
 
11
  # Load environment variables from .env file
12
  load_dotenv()
@@ -46,6 +47,7 @@ else:
46
  index = pc.Index(INDEX_NAME)
47
 
48
  def transcribe_pdf(pdf_file):
 
49
  # Read PDF and extract text
50
  pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
51
  text = ""
@@ -54,28 +56,33 @@ def transcribe_pdf(pdf_file):
54
  if page_text:
55
  text += page_text + "\n"
56
 
 
 
57
  # Dynamic Chunking
58
  chunks = dynamic_chunking(text, max_tokens=500, overlap=50)
 
59
 
60
- # Generate embeddings for each chunk
61
- embeddings = get_embeddings(chunks)
62
-
63
- # Prepare upsert data
64
- upsert_data = [
65
- (str(uuid.uuid4()), emb, {"text": chunk})
66
- for chunk, emb in zip(chunks, embeddings)
67
- ]
 
 
 
 
 
 
 
 
68
 
69
- # Upsert to Pinecone
70
- index.upsert(vectors=upsert_data)
71
-
72
- return f"Successfully upserted {len(chunks)} chunks to Pinecone index '{INDEX_NAME}'."
73
 
74
  def dynamic_chunking(text, max_tokens=500, overlap=50):
75
- """
76
- Splits text into chunks with a maximum number of tokens and a specified overlap.
77
- """
78
- # Simple tokenization based on whitespace
79
  tokens = re.findall(r'\S+', text)
80
  chunks = []
81
  start = 0
@@ -84,26 +91,50 @@ def dynamic_chunking(text, max_tokens=500, overlap=50):
84
  chunk = ' '.join(tokens[start:end])
85
  chunks.append(chunk)
86
  start += max_tokens - overlap
 
87
  return chunks
88
 
89
- def get_embeddings(chunks):
90
- """
91
- Generates embeddings for each chunk using OpenAI's embedding API.
92
- """
93
- response = client.embeddings.create(
94
- input=chunks,
95
- model=EMBEDDING_MODEL
96
- )
97
- embeddings = [data.embedding for data in response.data]
98
- return embeddings
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
100
- iface = gr.Interface(
101
- fn=transcribe_pdf,
102
- inputs=gr.File(label="Upload PDF", type="binary"),
103
- outputs=gr.Textbox(label="Transcription"),
104
- title="PDF Transcription and Upsert to Pinecone",
105
- description="Upload a PDF file to extract its text content, chunk it dynamically, and upsert the chunks to a Pinecone index named 'ghana'."
106
- )
 
 
 
 
 
 
 
 
 
107
 
108
  if __name__ == "__main__":
109
- iface.launch()
 
7
  from openai import OpenAI
8
  import uuid
9
  import re
10
+ import time
11
 
12
  # Load environment variables from .env file
13
  load_dotenv()
 
47
  index = pc.Index(INDEX_NAME)
48
 
49
  def transcribe_pdf(pdf_file):
50
+ print("Starting PDF transcription...")
51
  # Read PDF and extract text
52
  pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
53
  text = ""
 
56
  if page_text:
57
  text += page_text + "\n"
58
 
59
+ print(f"Extracted {len(text)} characters from PDF.")
60
+
61
  # Dynamic Chunking
62
  chunks = dynamic_chunking(text, max_tokens=500, overlap=50)
63
+ print(f"Created {len(chunks)} chunks from the extracted text.")
64
 
65
+ # Process chunks one by one
66
+ for i, chunk in enumerate(chunks):
67
+ print(f"Processing chunk {i+1}/{len(chunks)}...")
68
+
69
+ # Generate embedding for the chunk
70
+ embedding = get_embedding(chunk)
71
+
72
+ # Prepare upsert data
73
+ upsert_data = [(str(uuid.uuid4()), embedding, {"text": chunk})]
74
+
75
+ # Upsert to Pinecone
76
+ print(f"Upserting vector to Pinecone index '{INDEX_NAME}'...")
77
+ index.upsert(vectors=upsert_data)
78
+
79
+ # Optional: Add a small delay to avoid potential rate limits
80
+ time.sleep(0.5)
81
 
82
+ return f"Successfully processed and upserted {len(chunks)} chunks to Pinecone index '{INDEX_NAME}'."
 
 
 
83
 
84
  def dynamic_chunking(text, max_tokens=500, overlap=50):
85
+ print(f"Starting dynamic chunking with max_tokens={max_tokens} and overlap={overlap}...")
 
 
 
86
  tokens = re.findall(r'\S+', text)
87
  chunks = []
88
  start = 0
 
91
  chunk = ' '.join(tokens[start:end])
92
  chunks.append(chunk)
93
  start += max_tokens - overlap
94
+ print(f"Dynamic chunking complete. Created {len(chunks)} chunks.")
95
  return chunks
96
 
97
def get_embedding(chunk):
    """Generate an embedding vector for a single text chunk.

    Args:
        chunk: The text to embed (a plain string; the embeddings API accepts
            a single string directly).

    Returns:
        The embedding vector (list of floats) from the first result in the
        API response.

    Raises:
        Exception: Any error raised by the embeddings API is logged and
            re-raised unchanged.
    """
    print("Generating embedding for chunk...")
    try:
        response = client.embeddings.create(
            input=chunk,  # a single string is a valid `input` value
            model=EMBEDDING_MODEL,
        )
        embedding = response.data[0].embedding
        print("Successfully generated embedding.")
        return embedding
    except Exception as e:
        print(f"Error during embedding generation: {str(e)}")
        # Bare `raise` preserves the original traceback; `raise e` would
        # rebind it. Also removed two leftover `print(chunk)` debug lines
        # that dumped the full chunk text to stdout on every call.
        raise
112
+
113
def clear_database():
    """Remove every vector from the Pinecone index.

    Returns:
        A human-readable status string (success or error) suitable for
        displaying directly in the Gradio UI.
    """
    print("Clearing the Pinecone index...")
    try:
        index.delete(delete_all=True)
    except Exception as e:
        # Surface the failure to the UI instead of raising.
        print(f"Error clearing the Pinecone index: {str(e)}")
        return f"Error clearing the Pinecone index: {str(e)}"
    return "Successfully cleared all vectors from the Pinecone index."
121
 
122
# Assemble the Gradio UI with Blocks: one tab ingests PDFs into Pinecone,
# a second tab wipes the index.
with gr.Blocks() as app:
    gr.Markdown("# PDF Transcription and Pinecone Database Management")

    with gr.Tab("Transcribe PDF"):
        gr.Markdown("Upload a PDF file to extract its text content, chunk it dynamically, and upsert the chunks to a Pinecone index named 'ghana'.")
        uploaded_pdf = gr.File(label="Upload PDF", type="binary")
        run_button = gr.Button("Transcribe and Upsert")
        result_box = gr.Textbox(label="Transcription Result")
        # Wire the button to the ingestion pipeline defined above.
        run_button.click(fn=transcribe_pdf, inputs=uploaded_pdf, outputs=result_box)

    with gr.Tab("Clear Database"):
        gr.Markdown("Click the button to clear all vectors from the Pinecone index.")
        wipe_button = gr.Button("Clear Database")
        wipe_status = gr.Textbox(label="Clear Database Result")
        # clear_database takes no inputs; its status string goes to the textbox.
        wipe_button.click(fn=clear_database, outputs=wipe_status)

if __name__ == "__main__":
    # Start the Gradio server only when run as a script.
    app.launch()