poemsforaphrodite committed on
Commit
abcb496
1 Parent(s): 81c2749

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -34
app.py CHANGED
@@ -7,6 +7,7 @@ from pinecone import Pinecone, ServerlessSpec
7
  from openai import OpenAI
8
  import uuid
9
  import re
 
10
 
11
  # Load environment variables from .env file
12
  load_dotenv()
@@ -46,6 +47,7 @@ else:
46
  index = pc.Index(INDEX_NAME)
47
 
48
  def transcribe_pdf(pdf_file):
 
49
  # Read PDF and extract text
50
  pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
51
  text = ""
@@ -54,28 +56,33 @@ def transcribe_pdf(pdf_file):
54
  if page_text:
55
  text += page_text + "\n"
56
 
 
 
57
  # Dynamic Chunking
58
  chunks = dynamic_chunking(text, max_tokens=500, overlap=50)
 
59
 
60
- # Generate embeddings for each chunk
61
- embeddings = get_embeddings(chunks)
62
-
63
- # Prepare upsert data
64
- upsert_data = [
65
- (str(uuid.uuid4()), emb, {"text": chunk})
66
- for chunk, emb in zip(chunks, embeddings)
67
- ]
 
 
 
 
 
 
 
 
68
 
69
- # Upsert to Pinecone
70
- index.upsert(vectors=upsert_data)
71
-
72
- return f"Successfully upserted {len(chunks)} chunks to Pinecone index '{INDEX_NAME}'."
73
 
74
  def dynamic_chunking(text, max_tokens=500, overlap=50):
75
- """
76
- Splits text into chunks with a maximum number of tokens and a specified overlap.
77
- """
78
- # Simple tokenization based on whitespace
79
  tokens = re.findall(r'\S+', text)
80
  chunks = []
81
  start = 0
@@ -84,26 +91,50 @@ def dynamic_chunking(text, max_tokens=500, overlap=50):
84
  chunk = ' '.join(tokens[start:end])
85
  chunks.append(chunk)
86
  start += max_tokens - overlap
 
87
  return chunks
88
 
89
- def get_embeddings(chunks):
90
- """
91
- Generates embeddings for each chunk using OpenAI's embedding API.
92
- """
93
- response = client.embeddings.create(
94
- input=chunks,
95
- model=EMBEDDING_MODEL
96
- )
97
- embeddings = [data.embedding for data in response.data]
98
- return embeddings
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
100
- iface = gr.Interface(
101
- fn=transcribe_pdf,
102
- inputs=gr.File(label="Upload PDF", type="binary"),
103
- outputs=gr.Textbox(label="Transcription"),
104
- title="PDF Transcription and Upsert to Pinecone",
105
- description="Upload a PDF file to extract its text content, chunk it dynamically, and upsert the chunks to a Pinecone index named 'ghana'."
106
- )
 
 
 
 
 
 
 
 
 
107
 
108
  if __name__ == "__main__":
109
- iface.launch()
 
7
  from openai import OpenAI
8
  import uuid
9
  import re
10
+ import time
11
 
12
  # Load environment variables from .env file
13
  load_dotenv()
 
47
  index = pc.Index(INDEX_NAME)
48
 
49
  def transcribe_pdf(pdf_file):
50
+ print("Starting PDF transcription...")
51
  # Read PDF and extract text
52
  pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
53
  text = ""
 
56
  if page_text:
57
  text += page_text + "\n"
58
 
59
+ print(f"Extracted {len(text)} characters from PDF.")
60
+
61
  # Dynamic Chunking
62
  chunks = dynamic_chunking(text, max_tokens=500, overlap=50)
63
+ print(f"Created {len(chunks)} chunks from the extracted text.")
64
 
65
+ # Process chunks one by one
66
+ for i, chunk in enumerate(chunks):
67
+ print(f"Processing chunk {i+1}/{len(chunks)}...")
68
+
69
+ # Generate embedding for the chunk
70
+ embedding = get_embedding(chunk)
71
+
72
+ # Prepare upsert data
73
+ upsert_data = [(str(uuid.uuid4()), embedding, {"text": chunk})]
74
+
75
+ # Upsert to Pinecone
76
+ print(f"Upserting vector to Pinecone index '{INDEX_NAME}'...")
77
+ index.upsert(vectors=upsert_data)
78
+
79
+ # Optional: Add a small delay to avoid potential rate limits
80
+ time.sleep(0.5)
81
 
82
+ return f"Successfully processed and upserted {len(chunks)} chunks to Pinecone index '{INDEX_NAME}'."
 
 
 
83
 
84
  def dynamic_chunking(text, max_tokens=500, overlap=50):
85
+ print(f"Starting dynamic chunking with max_tokens={max_tokens} and overlap={overlap}...")
 
 
 
86
  tokens = re.findall(r'\S+', text)
87
  chunks = []
88
  start = 0
 
91
  chunk = ' '.join(tokens[start:end])
92
  chunks.append(chunk)
93
  start += max_tokens - overlap
94
+ print(f"Dynamic chunking complete. Created {len(chunks)} chunks.")
95
  return chunks
96
 
97
def get_embedding(chunk):
    """Generate an embedding vector for a single text chunk.

    Args:
        chunk: The text to embed (a plain string; the embeddings API accepts
            a single string directly).

    Returns:
        The embedding vector (list of floats) from the first result in the
        API response.

    Raises:
        Exception: Any error raised by the embeddings API is logged and
            re-raised unchanged.
    """
    print("Generating embedding for chunk...")
    try:
        response = client.embeddings.create(
            input=chunk,  # a single string is a valid `input` value
            model=EMBEDDING_MODEL,
        )
        embedding = response.data[0].embedding
        print("Successfully generated embedding.")
        return embedding
    except Exception as e:
        print(f"Error during embedding generation: {str(e)}")
        # Bare `raise` preserves the original traceback; `raise e` would
        # rebind it. Also removed two leftover `print(chunk)` debug lines
        # that dumped the full chunk text to stdout on every call.
        raise
112
+
113
def clear_database():
    """Remove every vector from the Pinecone index.

    Returns:
        A human-readable status string (success or error) suitable for
        displaying directly in the Gradio UI.
    """
    print("Clearing the Pinecone index...")
    try:
        index.delete(delete_all=True)
    except Exception as e:
        # Surface the failure to the UI instead of raising.
        print(f"Error clearing the Pinecone index: {str(e)}")
        return f"Error clearing the Pinecone index: {str(e)}"
    return "Successfully cleared all vectors from the Pinecone index."
121
 
122
# Assemble the Gradio UI with Blocks: one tab ingests PDFs into Pinecone,
# a second tab wipes the index.
with gr.Blocks() as app:
    gr.Markdown("# PDF Transcription and Pinecone Database Management")

    with gr.Tab("Transcribe PDF"):
        gr.Markdown("Upload a PDF file to extract its text content, chunk it dynamically, and upsert the chunks to a Pinecone index named 'ghana'.")
        uploaded_pdf = gr.File(label="Upload PDF", type="binary")
        run_button = gr.Button("Transcribe and Upsert")
        result_box = gr.Textbox(label="Transcription Result")
        # Wire the button to the ingestion pipeline defined above.
        run_button.click(fn=transcribe_pdf, inputs=uploaded_pdf, outputs=result_box)

    with gr.Tab("Clear Database"):
        gr.Markdown("Click the button to clear all vectors from the Pinecone index.")
        wipe_button = gr.Button("Clear Database")
        wipe_status = gr.Textbox(label="Clear Database Result")
        # clear_database takes no inputs; its status string goes to the textbox.
        wipe_button.click(fn=clear_database, outputs=wipe_status)

if __name__ == "__main__":
    # Start the Gradio server only when run as a script.
    app.launch()