import faiss
import numpy as np
import PyPDF2
import streamlit as st
from sentence_transformers import SentenceTransformer
from transformers import pipeline
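
# Usage sketch (assumption: this script is saved as app.py, with the packages
# streamlit, PyPDF2, faiss-cpu, sentence-transformers, transformers and torch installed):
#   streamlit run app.py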


# Load the embedding and generation models once and cache them across
# Streamlit reruns (the script is re-executed on every user interaction).
@st.cache_resource
def load_models():
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    generator = pipeline("text-generation", model="gpt2")
    return embedder, generator


embedder, generator = load_models()


def extract_pdf_text(pdf_file):
    """Extract the raw text from every page of the uploaded PDF."""
    reader = PyPDF2.PdfReader(pdf_file)
    text = ""
    for page in reader.pages:
        # extract_text() can return None for pages without a text layer
        text += page.extract_text() or ""
    return text


def chunk_text(text, chunk_size=500):
    """Split the text into chunks of roughly chunk_size words."""
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size):
        chunks.append(" ".join(words[i:i + chunk_size]))
    return chunks


def generate_embeddings(chunks):
    """Encode each chunk into a dense vector; FAISS expects float32 numpy arrays."""
    embeddings = embedder.encode(chunks, convert_to_numpy=True)
    return embeddings.astype("float32")


def create_faiss_index(embeddings):
    """Build a flat L2 index over the chunk embeddings."""
    d = embeddings.shape[1]
    index = faiss.IndexFlatL2(d)
    index.add(embeddings)
    return index


def find_relevant_chunk(query, index, chunks):
    """Return the chunk whose embedding is closest to the query embedding."""
    query_embedding = embedder.encode([query], convert_to_numpy=True).astype("float32")
    _, indices = index.search(query_embedding, k=1)
    return chunks[indices[0][0]]


def generate_answer(query, relevant_chunk):
    """Generate an answer by letting GPT-2 continue from the retrieved chunk and the question."""
    prompt = relevant_chunk + "\n\nQuestion: " + query + "\nAnswer:"
    # max_new_tokens bounds only the continuation (max_length would include the
    # prompt and fail for long chunks); return_full_text=False drops the prompt
    # from the output so only the generated continuation is returned.
    outputs = generator(prompt, max_new_tokens=150, num_return_sequences=1,
                        truncation=True, return_full_text=False)
    return outputs[0]["generated_text"]


def main():
    st.title("PDF Q&A with RAG System")

    uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
    if uploaded_file is not None:
        st.write("Processing the PDF...")

        pdf_text = extract_pdf_text(uploaded_file)
        st.write("Text extracted from the PDF (first 500 characters shown):")
        st.text_area("Extracted Text", pdf_text[:500], height=200)

        # Chunk the document, embed the chunks, and build the FAISS index.
        # Note: this runs on every Streamlit rerun; for large PDFs, caching the
        # chunks and embeddings (e.g. with st.cache_data) would avoid
        # recomputing them on every query.
        chunks = chunk_text(pdf_text)
        embeddings = generate_embeddings(chunks)
        index = create_faiss_index(embeddings)

        st.write("PDF is processed, you can now ask questions.")

        query = st.text_input("Ask a question about the document:")
        if query:
            # Retrieve the most similar chunk and use it as context for generation.
            relevant_chunk = find_relevant_chunk(query, index, chunks)
            st.write("Relevant chunk found:")
            st.text_area("Relevant Chunk", relevant_chunk, height=200)

            answer = generate_answer(query, relevant_chunk)
            st.write("Answer:", answer)


if __name__ == "__main__":
    main()