"""Streamlit RAG demo: answer questions about an uploaded PDF.

Pipeline: extract PDF text -> split into word chunks -> embed with
sentence-transformers -> index with FAISS -> retrieve the nearest chunk
for a user query -> generate an answer with GPT-2 conditioned on it.
"""

import os  # NOTE(review): unused here; kept in case of external use

import numpy as np
import streamlit as st
import PyPDF2
import faiss
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity  # NOTE(review): unused
from transformers import pipeline, AutoTokenizer, AutoModel  # AutoTokenizer/AutoModel unused

# Models are loaded once at import time.
# NOTE(review): every Streamlit rerun re-executes this module; on recent
# Streamlit versions wrap these in @st.cache_resource to avoid reloads.
embedder = SentenceTransformer('all-MiniLM-L6-v2')  # 384-dim MiniLM embeddings
generator = pipeline("text-generation", model="gpt2")


def extract_pdf_text(pdf_file):
    """Return the concatenated text of every page in *pdf_file*.

    Pages whose ``extract_text()`` returns ``None`` (e.g. scanned/image-only
    pages) are skipped instead of raising a TypeError on concatenation.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # ''.join avoids quadratic string concatenation across many pages.
    return "".join(page.extract_text() or "" for page in reader.pages)


def chunk_text(text, chunk_size=500):
    """Split *text* into chunks of at most *chunk_size* whitespace-separated words.

    Returns an empty list when *text* contains no words.
    """
    words = text.split()
    return [
        ' '.join(words[i:i + chunk_size])
        for i in range(0, len(words), chunk_size)
    ]


def generate_embeddings(chunks):
    """Encode *chunks* into a 2-D float32 numpy array.

    FAISS requires contiguous float32 input, so we ask the encoder for
    numpy output directly instead of a torch tensor.
    """
    return embedder.encode(chunks, convert_to_numpy=True).astype(np.float32)


def create_faiss_index(embeddings):
    """Build a flat (exact) L2 FAISS index over *embeddings* (shape n x d)."""
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index


def find_relevant_chunk(query, index, chunks):
    """Return the chunk whose embedding is nearest (L2) to *query*'s embedding."""
    query_embedding = embedder.encode(
        [query], convert_to_numpy=True
    ).astype(np.float32)  # shape (1, d), float32 as FAISS requires
    _, indices = index.search(query_embedding, k=1)
    return chunks[indices[0][0]]


def generate_answer(query, relevant_chunk):
    """Generate an answer with GPT-2, using *relevant_chunk* as context.

    Uses ``max_new_tokens`` rather than ``max_length``: the latter counts
    the prompt, so a 500-word retrieved chunk alone would exceed 150
    tokens and break generation.
    """
    context = relevant_chunk + "\n\n" + query
    answer = generator(context, max_new_tokens=150, num_return_sequences=1)
    return answer[0]['generated_text']


def main():
    """Streamlit UI: upload a PDF, index it, then answer user questions."""
    st.title("PDF Q&A with RAG System")

    # File upload
    uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
    if uploaded_file is not None:
        st.write("Processing the PDF...")

        # Extract text from the uploaded PDF
        pdf_text = extract_pdf_text(uploaded_file)
        st.write("Text extracted from the PDF:")
        st.text_area("Extracted Text", pdf_text[:500], height=200)

        # Chunk the extracted text
        chunks = chunk_text(pdf_text)
        if not chunks:
            # Empty or image-only PDFs yield no text; indexing/searching
            # zero vectors would be meaningless.
            st.warning("No extractable text found in this PDF.")
            return

        embeddings = generate_embeddings(chunks)

        # Create FAISS index
        index = create_faiss_index(embeddings)
        st.write("PDF is processed, you can now ask questions.")

        # User query input
        query = st.text_input("Ask a question about the document:")
        if query:
            # Find the most relevant chunk
            relevant_chunk = find_relevant_chunk(query, index, chunks)
            st.write("Relevant chunk found:")
            st.text_area("Relevant Chunk", relevant_chunk, height=200)

            # Generate an answer
            answer = generate_answer(query, relevant_chunk)
            st.write("Answer:", answer)


if __name__ == "__main__":
    main()