import PyPDF2
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import FAISS


def read_pdf(uploaded_file) -> str:
    """Return the concatenated text of every page in a PDF.

    Args:
        uploaded_file: Anything ``PyPDF2.PdfReader`` accepts (it is passed
            to the reader unchanged).

    Returns:
        The text of all pages joined together with no separator, in page
        order. Pages that yield no text contribute an empty string.
    """
    pdf_reader = PyPDF2.PdfReader(uploaded_file)
    # ``or ""`` guards against a page whose extraction yields None, which
    # would otherwise make the concatenation raise TypeError.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)


def Chunks(docs: str, *, chunk_size: int = 1000, chunk_overlap: int = 100) -> list:
    """Split a text into overlapping chunks.

    Args:
        docs: The full text to split.
        chunk_size: Value forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 1000, the value previously
            hard-coded here.
        chunk_overlap: Value forwarded to the splitter as ``chunk_overlap``.
            Defaults to 100, the value previously hard-coded here.

    Returns:
        The result of ``split_text(docs)`` on the configured splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(docs)


def PDF_4_QA(file) -> FAISS:
    """Build a FAISS vector store from the text of a PDF.

    The PDF is read with ``read_pdf``, split with ``Chunks`` (default
    sizes), embedded on CPU with the ``all-MiniLM-L6-v2`` model, and
    indexed with ``FAISS.from_texts``.

    Args:
        file: The PDF to index; forwarded unchanged to ``read_pdf``.

    Returns:
        The FAISS vector store built from the PDF's text chunks.

    Raises:
        ValueError: If no text chunks could be produced from the PDF
            (for example, when no text was extracted from any page).
    """
    content = read_pdf(file)
    pdf_chunks = Chunks(docs=content)
    if not pdf_chunks:
        # Fail early with a clear message instead of handing an empty
        # list to the vector store.
        raise ValueError("No extractable text found in the PDF; nothing to index.")

    # NOTE(review): HuggingFaceBgeEmbeddings appears to be intended for
    # BAAI/bge-* models, while the model below is a sentence-transformers
    # MiniLM model. Confirm whether HuggingFaceEmbeddings is the better fit.
    # Left unchanged here so existing retrieval behavior is preserved.
    embeddings = HuggingFaceBgeEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2',
        model_kwargs={'device': 'cpu'},
    )
    return FAISS.from_texts(pdf_chunks, embeddings)