import faiss
import numpy as np
import PyPDF2
import streamlit as st
from sentence_transformers import SentenceTransformer
from transformers import pipeline
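
# Usage sketch (assumption: this script is saved as app.py, with the packages
# streamlit, PyPDF2, faiss-cpu, sentence-transformers, transformers and torch installed):
#   streamlit run app.py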


# Load the embedding and generation models once and cache them across
# Streamlit reruns (the script is re-executed on every user interaction).
@st.cache_resource
def load_models():
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    generator = pipeline("text-generation", model="gpt2")
    return embedder, generator


embedder, generator = load_models()


def extract_pdf_text(pdf_file):
    """Extract the raw text from every page of the uploaded PDF."""
    reader = PyPDF2.PdfReader(pdf_file)
    text = ""
    for page in reader.pages:
        # extract_text() can return None for pages without a text layer
        text += page.extract_text() or ""
    return text


def chunk_text(text, chunk_size=500):
    """Split the text into chunks of roughly chunk_size words."""
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size):
        chunks.append(" ".join(words[i:i + chunk_size]))
    return chunks


def generate_embeddings(chunks):
    """Encode each chunk into a dense vector; FAISS expects float32 numpy arrays."""
    embeddings = embedder.encode(chunks, convert_to_numpy=True)
    return embeddings.astype("float32")


def create_faiss_index(embeddings):
    """Build a flat L2 index over the chunk embeddings."""
    d = embeddings.shape[1]
    index = faiss.IndexFlatL2(d)
    index.add(embeddings)
    return index


def find_relevant_chunk(query, index, chunks):
    """Return the chunk whose embedding is closest to the query embedding."""
    query_embedding = embedder.encode([query], convert_to_numpy=True).astype("float32")
    _, indices = index.search(query_embedding, k=1)
    return chunks[indices[0][0]]


def generate_answer(query, relevant_chunk):
    """Generate an answer by letting GPT-2 continue from the retrieved chunk and the question."""
    prompt = relevant_chunk + "\n\nQuestion: " + query + "\nAnswer:"
    # max_new_tokens bounds only the continuation (max_length would include the
    # prompt and fail for long chunks); return_full_text=False drops the prompt
    # from the output so only the generated continuation is returned.
    outputs = generator(prompt, max_new_tokens=150, num_return_sequences=1,
                        truncation=True, return_full_text=False)
    return outputs[0]["generated_text"]


def main():
    st.title("PDF Q&A with RAG System")

    uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
    if uploaded_file is not None:
        st.write("Processing the PDF...")

        pdf_text = extract_pdf_text(uploaded_file)
        st.write("Text extracted from the PDF (first 500 characters shown):")
        st.text_area("Extracted Text", pdf_text[:500], height=200)

        # Chunk the document, embed the chunks, and build the FAISS index.
        # Note: this runs on every Streamlit rerun; for large PDFs, caching the
        # chunks and embeddings (e.g. with st.cache_data) would avoid
        # recomputing them on every query.
        chunks = chunk_text(pdf_text)
        embeddings = generate_embeddings(chunks)
        index = create_faiss_index(embeddings)

        st.write("PDF is processed, you can now ask questions.")

        query = st.text_input("Ask a question about the document:")
        if query:
            # Retrieve the most similar chunk and use it as context for generation.
            relevant_chunk = find_relevant_chunk(query, index, chunks)
            st.write("Relevant chunk found:")
            st.text_area("Relevant Chunk", relevant_chunk, height=200)

            answer = generate_answer(query, relevant_chunk)
            st.write("Answer:", answer)


if __name__ == "__main__":
    main()