Spaces:

Yasir646
/

PDF_QA

Sleeping

App Files Files Community

Yasir646 committed on Dec 24, 2024

Commit

8f12416

verified ·

1 Parent(s): 10212ec

Update app.py

Browse files

Files changed (1) hide show

app.py +84 -51

app.py CHANGED Viewed

@@ -1,60 +1,93 @@
 import streamlit as st
-from PyPDF2 import PdfReader
-import openai
 import os
-# Set up OpenAI API key
-openai.api_key = "your_openai_api_key"
-# Process PDF file to extract text and chunk it into manageable parts
-def process_pdf(uploaded_pdf):
-    # Read the uploaded PDF
-    pdf_reader = PdfReader(uploaded_pdf)
-    text = ''
-    for page_num in range(len(pdf_reader.pages)):
-        text += pdf_reader.pages[page_num].extract_text()
-    # Split text into chunks (a chunk of 1500 characters as an example)
-    chunks = [text[i:i+1500] for i in range(0, len(text), 1500)]
     return chunks
-# Query OpenAI GPT model
-def query_openai_gpt(context, user_query):
-    response = openai.ChatCompletion.create(
-        model="gpt-4",
-        messages=[
-            {"role": "system", "content": "You are a helpful assistant answering questions based on provided context."},
-            {"role": "user", "content": f"Context: {context}\n\nQuestion: {user_query}"}
-        ]
-    )
-    return response["choices"][0]["message"]["content"]
-# Main Streamlit UI
 def main():
-    st.title("PDF Query Application with OpenAI GPT")
-    uploaded_pdf = st.file_uploader("Upload your PDF", type="pdf")
-    if uploaded_pdf is not None:
-        st.write("PDF uploaded successfully! Processing...")
-        # Process the PDF and extract chunks
-        chunks = process_pdf(uploaded_pdf)
-        st.write(f"Extracted {len(chunks)} chunks from the PDF.")
-        st.write("Ready to answer questions based on the PDF!")
-        # User query
-        user_query = st.text_input("Ask a question about the PDF:")
-        if user_query:
-            # Combine chunks into context (or use retrieval techniques if needed)
-            context = " ".join(chunks[:5])  # Use the first 5 chunks for simplicity
-            st.write("Processing your question with OpenAI GPT...")
-            # Query OpenAI GPT
-            response = query_openai_gpt(context, user_query)
-            st.write(f"Answer: {response}")
 if __name__ == "__main__":
     main()

 import streamlit as st
+import PyPDF2
+import faiss
+import numpy as np
+from transformers import pipeline, AutoTokenizer, AutoModel
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
 import os
+# Streamlit re-executes this script from the top on every widget interaction.
+# Loading the models through st.cache_resource keeps one shared instance alive
+# instead of re-instantiating both models on each rerun.
+@st.cache_resource
+def _load_models():
+    """Load and return the (embedding model, text-generation pipeline) pair."""
+    # Pre-trained model for embeddings (Sentence-Transformers), open-source model from Hugging Face
+    sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
+    # Language generation model (GPT-2)
+    text_generator = pipeline("text-generation", model="gpt2")
+    return sentence_model, text_generator
+# Module-level names used by the functions below.
+embedder, generator = _load_models()
+# Function to extract text from PDF
+def extract_pdf_text(pdf_file):
+    """Return the text of every page of ``pdf_file`` concatenated in page order.
+    ``pdf_file`` is passed straight to ``PyPDF2.PdfReader``; no separator is
+    inserted between pages.
+    """
+    pdf = PyPDF2.PdfReader(pdf_file)
+    page_texts = [pdf_page.extract_text() for pdf_page in pdf.pages]
+    return "".join(page_texts)
+# Function to chunk text into smaller pieces
+def chunk_text(text, chunk_size=500):
+    """Split ``text`` into consecutive chunks of at most ``chunk_size`` words.
+    Words are whitespace-separated tokens; each chunk is re-joined with single
+    spaces. Returns a list of strings (empty when ``text`` has no words).
+    """
+    tokens = text.split()
+    chunks = [
+        ' '.join(tokens[start:start + chunk_size])
+        for start in range(0, len(tokens), chunk_size)
+    ]
     return chunks
+# Function to generate embeddings for chunks
+def generate_embeddings(chunks):
+    """Encode every chunk with the module-level ``embedder`` and return the result.
+    ``convert_to_tensor=True`` is passed to ``embedder.encode``, so the embeddings
+    come back as a tensor rather than a NumPy array.
+    """
+    return embedder.encode(chunks, convert_to_tensor=True)
+# FAISS Index Setup
+def create_faiss_index(embeddings):
+    """Build a flat (exact) L2 FAISS index over the chunk embeddings.
+    ``embeddings`` may be a torch tensor (as returned by ``generate_embeddings``)
+    or anything ``numpy`` can turn into a 2-D array of shape (n_chunks, dim).
+    Returns the populated ``faiss.IndexFlatL2``.
+    Raises ``ValueError`` when the embeddings are not 2-D (for example when no
+    chunks were embedded), instead of failing with an opaque ``IndexError``.
+    """
+    # np.array() cannot convert a tensor that lives on a GPU, so tensors are
+    # detached and moved to host memory before conversion.
+    if hasattr(embeddings, "detach"):
+        embeddings = embeddings.detach().cpu().numpy()
+    # FAISS expects a C-contiguous float32 matrix.
+    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
+    if vectors.ndim != 2:
+        raise ValueError("embeddings must be a 2-D array of shape (n_chunks, dim)")
+    d = vectors.shape[1]  # Dimensionality of the embeddings
+    index = faiss.IndexFlatL2(d)  # L2 distance metric
+    index.add(vectors)  # Add embeddings to the index
+    return index
+# Function to find the most relevant chunk based on the query
+def find_relevant_chunk(query, index, chunks):
+    """Return the chunk whose embedding is the nearest neighbour of ``query``.
+    ``index`` is searched with ``k=1``; the resulting position is used to look
+    up the matching entry in ``chunks``.
+    """
+    query_vector = embedder.encode([query])[0]
+    # The single query vector is wrapped into a one-row batch for the search call.
+    search_batch = np.array([query_vector])
+    _distances, nearest = index.search(search_batch, k=1)
+    best_position = nearest[0][0]
+    return chunks[best_position]
+# Function to generate a response using GPT-2 and the relevant chunk
+def generate_answer(query, relevant_chunk):
+    """Generate text with the module-level ``generator`` from the chunk and query.
+    The prompt is ``relevant_chunk``, a blank line, then ``query``. Returns the
+    ``generated_text`` of the first returned sequence.
+    """
+    context = relevant_chunk + "\n\n" + query  # Use chunk as context for answering
+    # max_length would count the prompt tokens as well; a retrieved chunk (up to
+    # 500 words with the default chunk size) is normally longer than 150 tokens,
+    # leaving no room to generate. max_new_tokens limits only the continuation.
+    answer = generator(context, max_new_tokens=150, num_return_sequences=1)
+    return answer[0]['generated_text']
+# Streamlit App Interface
 def main():
+    """Streamlit UI: upload a PDF, index its text, and answer questions about it.
+    Extracts the PDF text, splits it into chunks, embeds them into a FAISS
+    index, then for each question shows the nearest chunk and a generated answer.
+    """
+    st.title("PDF Q&A with RAG System")
+    # File upload
+    uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
+    if uploaded_file is None:
+        return
+    st.write("Processing the PDF...")
+    # Extract text from the uploaded PDF
+    pdf_text = extract_pdf_text(uploaded_file)
+    st.write("Text extracted from the PDF:")
+    st.text_area("Extracted Text", pdf_text[:500], height=200)
+    # Chunk the extracted text
+    chunks = chunk_text(pdf_text)
+    if not chunks:
+        # Without any text there is nothing to embed or index; stop here with a
+        # message instead of crashing while building the index.
+        st.warning("No extractable text was found in this PDF, so it cannot be queried.")
+        return
+    embeddings = generate_embeddings(chunks)
+    # Create FAISS index
+    index = create_faiss_index(embeddings)
+    st.write("PDF is processed, you can now ask questions.")
+    # User query input
+    query = st.text_input("Ask a question about the document:")
+    if not query:
+        return
+    # Find the most relevant chunk
+    relevant_chunk = find_relevant_chunk(query, index, chunks)
+    st.write("Relevant chunk found:")
+    st.text_area("Relevant Chunk", relevant_chunk, height=200)
+    # Generate an answer
+    answer = generate_answer(query, relevant_chunk)
+    st.write("Answer:", answer)
 if __name__ == "__main__":
     main()