# PDF_QA / app.py
# (Hugging Face Space page residue preserved as comments so the file parses:
#  "Yasir646's picture" / "Update app.py" / "8f12416 verified")
import streamlit as st
import PyPDF2
import faiss
import numpy as np
from transformers import pipeline, AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import os
# Embedding model: maps PDF chunks and user queries into dense vectors for retrieval.
embedder = SentenceTransformer('all-MiniLM-L6-v2') # small, fast open-source model from Hugging Face
# Generation model: GPT-2 continues the retrieved context to produce an answer.
generator = pipeline("text-generation", model="gpt2")
def extract_pdf_text(pdf_file):
    """Extract the concatenated text of every page in a PDF.

    Args:
        pdf_file: A path or binary file-like object accepted by
            ``PyPDF2.PdfReader`` (e.g. a Streamlit ``UploadedFile``).

    Returns:
        str: Text of all pages joined in order. Pages where PyPDF2
        cannot extract text (e.g. scanned/image-only pages return
        ``None``) contribute an empty string instead of raising.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # join() avoids quadratic ``+=`` concatenation; ``or ''`` guards the
    # None that extract_text() returns for pages without a text layer.
    return "".join((page.extract_text() or "") for page in reader.pages)
def chunk_text(text, chunk_size=500):
    """Split text into whitespace-delimited chunks of at most chunk_size words.

    Args:
        text: Full document text.
        chunk_size: Maximum number of words per chunk (default 500).

    Returns:
        list[str]: Chunks in document order; ``[]`` for empty or
        whitespace-only input.
    """
    words = text.split()
    # Comprehension replaces the manual append loop; slicing semantics
    # are identical (last chunk may be shorter than chunk_size).
    return [' '.join(words[i:i + chunk_size])
            for i in range(0, len(words), chunk_size)]
def generate_embeddings(chunks):
    """Embed each text chunk with the sentence-transformer model.

    Args:
        chunks: List of chunk strings.

    Returns:
        numpy.ndarray of shape (len(chunks), dim). NumPy output (rather
        than a torch tensor, as ``convert_to_tensor=True`` produced) is
        what FAISS consumes, and it avoids an extra tensor->array copy
        in create_faiss_index(). Callers here only use ``.shape`` and
        ``np.array(...)``, both of which work unchanged.
    """
    return embedder.encode(chunks, convert_to_numpy=True)
def create_faiss_index(embeddings):
    """Build a flat (exact-search) L2 FAISS index over the chunk embeddings.

    Args:
        embeddings: 2-D array-like of shape (n_chunks, dim); a numpy
            array or a CPU torch tensor both work.

    Returns:
        faiss.IndexFlatL2 populated with the embeddings.
    """
    # FAISS requires contiguous float32 input; a plain np.array() of a
    # float64/tensor source would hand it the wrong dtype.
    vectors = np.ascontiguousarray(np.asarray(embeddings), dtype=np.float32)
    index = faiss.IndexFlatL2(vectors.shape[1])  # exact L2-distance search
    index.add(vectors)
    return index
def find_relevant_chunk(query, index, chunks):
    """Return the chunk whose embedding is nearest (L2) to the query's.

    Args:
        query: The user's question as a string.
        index: FAISS index built over the chunk embeddings.
        chunks: Chunk texts, in the same order they were indexed.

    Returns:
        str: The single most similar chunk.
    """
    # FAISS search expects a 2-D float32 batch, hence [query] and the cast.
    query_vec = np.asarray(embedder.encode([query]), dtype=np.float32)
    _, indices = index.search(query_vec, k=1)
    return chunks[indices[0][0]]
def generate_answer(query, relevant_chunk):
    """Generate an answer with GPT-2, using the retrieved chunk as context.

    Args:
        query: The user's question.
        relevant_chunk: Retrieved document chunk prepended as context.

    Returns:
        str: The generated text (prompt plus continuation).

    Note:
        The previous ``max_length=150`` counted *prompt* tokens too; a
        500-word chunk already exceeds 150 tokens, so generation was
        truncated or raised. ``max_new_tokens`` bounds only the answer.
    """
    prompt = relevant_chunk + "\n\n" + query
    result = generator(
        prompt,
        max_new_tokens=150,   # length budget for the answer only
        num_return_sequences=1,
        truncation=True,      # keep the prompt inside GPT-2's 1024-token window
    )
    return result[0]['generated_text']
def main():
    """Streamlit UI: upload a PDF, index it, and answer questions about it.

    Streamlit re-runs this entire function on every widget interaction,
    so the extracted text, chunks, and FAISS index are cached in
    ``st.session_state`` keyed by the uploaded file's name — otherwise
    the PDF would be re-extracted and re-embedded on every question.
    """
    st.title("PDF Q&A with RAG System")

    uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
    if uploaded_file is None:
        return  # nothing to do until a file is uploaded

    # (Re)process only when a different file is uploaded.
    if st.session_state.get("pdf_name") != uploaded_file.name:
        st.write("Processing the PDF...")
        pdf_text = extract_pdf_text(uploaded_file)
        chunks = chunk_text(pdf_text)
        embeddings = generate_embeddings(chunks)
        st.session_state["pdf_name"] = uploaded_file.name
        st.session_state["pdf_text"] = pdf_text
        st.session_state["chunks"] = chunks
        st.session_state["index"] = create_faiss_index(embeddings)

    st.write("Text extracted from the PDF:")
    # Preview only the first 500 characters to keep the widget light.
    st.text_area("Extracted Text", st.session_state["pdf_text"][:500], height=200)
    st.write("PDF is processed, you can now ask questions.")

    query = st.text_input("Ask a question about the document:")
    if query:
        relevant_chunk = find_relevant_chunk(
            query, st.session_state["index"], st.session_state["chunks"]
        )
        st.write("Relevant chunk found:")
        st.text_area("Relevant Chunk", relevant_chunk, height=200)
        answer = generate_answer(query, relevant_chunk)
        st.write("Answer:", answer)
# Script entry point (indentation restored — it was lost in the paste).
if __name__ == "__main__":
    main()