import os

import faiss
import fitz  # PyMuPDF for PDF parsing
import numpy as np
import streamlit as st
from groq import Groq
from sentence_transformers import SentenceTransformer  # Hugging Face transformer

# Initialize the Hugging Face embedding model and the Groq API client.
model = SentenceTransformer('all-MiniLM-L6-v2')  # Model for generating embeddings
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")  # Read the key from the environment; never hardcode secrets
client = Groq(api_key=GROQ_API_KEY)

# Extract the full text from an uploaded PDF.
def extract_text_from_pdf(file):
    doc = fitz.open(stream=file.read(), filetype="pdf")  # Open from an in-memory stream
    text = ""
    for page in doc:
        text += page.get_text()
    return text

# Generate an embedding with the SentenceTransformer model (used for retrieval).
def generate_huggingface_embeddings(text):
    embeddings = model.encode(text)
    return embeddings

# Retrieve the most relevant document chunks for a query via FAISS similarity search.
def get_relevant_chunks(query, top_k=5):
    query_embedding = generate_huggingface_embeddings(query)
    query_embedding = np.array(query_embedding).astype("float32").reshape(1, -1)  # FAISS expects a 2D float32 array

    # Perform similarity search against the indexed document chunks.
    distances, indices = index.search(query_embedding, top_k)
    relevant_chunks = [document_chunks[i] for i in indices[0]]
    return relevant_chunks

# Generate an answer from the retrieved context using Groq's chat model.
def generate_answer(query):
    relevant_chunks = get_relevant_chunks(query)
    context = " ".join(relevant_chunks)  # Combine the most relevant chunks

    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": f"Answer based on this: {context}"}],
        model="llama3-8b-8192",  # Adjust to the appropriate Groq model
        stream=False,
    )
    return chat_completion.choices[0].message.content

# Streamlit app interface
st.title("Knowledge-Based Assistant")
st.write("Upload a PDF to generate answers based on its content.")

# Upload PDF file
pdf_file = st.file_uploader("Choose a PDF file", type="pdf")

if pdf_file is not None:
    # Extract the text content from the uploaded PDF.
    document_text = extract_text_from_pdf(pdf_file)

    # Split the document into fixed-size chunks (adjust chunk size as needed).
    chunk_size = 1000  # Characters per chunk
    document_chunks = [document_text[i:i + chunk_size]
                       for i in range(0, len(document_text), chunk_size)]

    # Generate embeddings for each chunk.
    embeddings = [generate_huggingface_embeddings(chunk) for chunk in document_chunks]

    # Convert the embeddings to a float32 numpy array for FAISS.
    embeddings_array = np.array(embeddings).astype("float32")

    # Build a FAISS index over the chunk embeddings (L2 distance metric).
    index = faiss.IndexFlatL2(embeddings_array.shape[1])
    index.add(embeddings_array)

    # Query input from the user.
    query = st.text_input("Ask a question about the document:")

    if query:
        # Generate and display the answer for the query.
        answer = generate_answer(query)
        st.write("Answer: ", answer)
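
# Usage (a sketch, assuming this script is saved as app.py and GROQ_API_KEY is
# exported in the environment, e.g. `export GROQ_API_KEY=...`):
#
#     pip install streamlit groq pymupdf numpy faiss-cpu sentence-transformers
#     streamlit run app.py
#
# Note: Streamlit reruns the entire script on every interaction, so the chunking,
# embedding, and index build above are repeated for each query. For larger PDFs,
# caching the index (e.g. with st.cache_resource) would avoid recomputing it.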