"""Streamlit RAG demo: answer questions about an uploaded PDF.

Pipeline: extract PDF text -> split into word chunks -> embed with
sentence-transformers -> index with FAISS -> retrieve the nearest chunk
for a user query -> generate an answer with GPT-2 conditioned on it.
"""

import os  # NOTE(review): unused here; kept in case of external use

import numpy as np
import streamlit as st
import PyPDF2
import faiss
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity  # NOTE(review): unused
from transformers import pipeline, AutoTokenizer, AutoModel  # AutoTokenizer/AutoModel unused

# Models are loaded once at import time.
# NOTE(review): every Streamlit rerun re-executes this module; on recent
# Streamlit versions wrap these in @st.cache_resource to avoid reloads.
embedder = SentenceTransformer('all-MiniLM-L6-v2')  # 384-dim MiniLM embeddings
generator = pipeline("text-generation", model="gpt2")


def extract_pdf_text(pdf_file):
    """Return the concatenated text of every page in *pdf_file*.

    Pages whose ``extract_text()`` returns ``None`` (e.g. scanned/image-only
    pages) are skipped instead of raising a TypeError on concatenation.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # ''.join avoids quadratic string concatenation across many pages.
    return "".join(page.extract_text() or "" for page in reader.pages)


def chunk_text(text, chunk_size=500):
    """Split *text* into chunks of at most *chunk_size* whitespace-separated words.

    Returns an empty list when *text* contains no words.
    """
    words = text.split()
    return [
        ' '.join(words[i:i + chunk_size])
        for i in range(0, len(words), chunk_size)
    ]


def generate_embeddings(chunks):
    """Encode *chunks* into a 2-D float32 numpy array.

    FAISS requires contiguous float32 input, so we ask the encoder for
    numpy output directly instead of a torch tensor.
    """
    return embedder.encode(chunks, convert_to_numpy=True).astype(np.float32)


def create_faiss_index(embeddings):
    """Build a flat (exact) L2 FAISS index over *embeddings* (shape n x d)."""
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index


def find_relevant_chunk(query, index, chunks):
    """Return the chunk whose embedding is nearest (L2) to *query*'s embedding."""
    query_embedding = embedder.encode(
        [query], convert_to_numpy=True
    ).astype(np.float32)  # shape (1, d), float32 as FAISS requires
    _, indices = index.search(query_embedding, k=1)
    return chunks[indices[0][0]]


def generate_answer(query, relevant_chunk):
    """Generate an answer with GPT-2, using *relevant_chunk* as context.

    Uses ``max_new_tokens`` rather than ``max_length``: the latter counts
    the prompt, so a 500-word retrieved chunk alone would exceed 150
    tokens and break generation.
    """
    context = relevant_chunk + "\n\n" + query
    answer = generator(context, max_new_tokens=150, num_return_sequences=1)
    return answer[0]['generated_text']


def main():
    """Streamlit UI: upload a PDF, index it, then answer user questions."""
    st.title("PDF Q&A with RAG System")

    # File upload
    uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
    if uploaded_file is not None:
        st.write("Processing the PDF...")

        # Extract text from the uploaded PDF
        pdf_text = extract_pdf_text(uploaded_file)
        st.write("Text extracted from the PDF:")
        st.text_area("Extracted Text", pdf_text[:500], height=200)

        # Chunk the extracted text
        chunks = chunk_text(pdf_text)
        if not chunks:
            # Empty or image-only PDFs yield no text; indexing/searching
            # zero vectors would be meaningless.
            st.warning("No extractable text found in this PDF.")
            return

        embeddings = generate_embeddings(chunks)

        # Create FAISS index
        index = create_faiss_index(embeddings)
        st.write("PDF is processed, you can now ask questions.")

        # User query input
        query = st.text_input("Ask a question about the document:")
        if query:
            # Find the most relevant chunk
            relevant_chunk = find_relevant_chunk(query, index, chunks)
            st.write("Relevant chunk found:")
            st.text_area("Relevant Chunk", relevant_chunk, height=200)

            # Generate an answer
            answer = generate_answer(query, relevant_chunk)
            st.write("Answer:", answer)


if __name__ == "__main__":
    main()