RAG-PDF-QnA-ChatBot / PDF_Reader.py
Mr-TD's picture
Upload 7 files
1bdec92 verified
raw
history blame
943 Bytes
import PyPDF2
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
def read_pdf(uploaded_file):
    """Return the text of every page of a PDF as a single string.

    Args:
        uploaded_file: Handed unchanged to ``PyPDF2.PdfReader``
            (presumably a path or a binary file-like object such as an
            upload widget result -- confirm against the caller).

    Returns:
        The ``extract_text()`` result of each page, concatenated in page
        order with no separator inserted between pages.
    """
    pdf_reader = PyPDF2.PdfReader(uploaded_file)
    # Join once instead of growing a string with ``+=`` per page, which
    # re-copies the accumulated text on every iteration.
    return "".join(page.extract_text() for page in pdf_reader.pages)
def Chunks(docs, chunk_size=1000, chunk_overlap=100):
    """Split a text into overlapping chunks.

    Args:
        docs: The text to split (a single string, despite the plural name).
        chunk_size: Value forwarded to ``RecursiveCharacterTextSplitter``
            as ``chunk_size``. Defaults to 1000, the value this function
            previously hard-coded.
        chunk_overlap: Value forwarded to ``RecursiveCharacterTextSplitter``
            as ``chunk_overlap``. Defaults to 100, the value this function
            previously hard-coded.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_text`` returns for
        ``docs`` (a list of text chunks).
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(docs)
def PDF_4_QA(file):
    """Build a FAISS vector store from the text of a PDF.

    The PDF is read with ``read_pdf``, split with ``Chunks`` (default
    chunk settings), embedded on CPU with the
    ``sentence-transformers/all-MiniLM-L6-v2`` model and indexed with
    ``FAISS.from_texts``.

    Args:
        file: The PDF to index; passed unchanged to ``read_pdf``.

    Returns:
        The FAISS vector store created from the PDF's text chunks.

    Raises:
        ValueError: If the PDF produced no text chunks (for example an
            image-only PDF), since there would be nothing to index.
    """
    content = read_pdf(file)
    pdf_chunks = Chunks(docs=content)

    # Fail early with a clear message; without this guard the empty chunk
    # list is handed to FAISS, which is expected to fail with a far less
    # helpful error.
    if not pdf_chunks:
        raise ValueError(
            "No extractable text found in the PDF; cannot build a vector store."
        )

    # all-MiniLM-L6-v2 is not a BGE model, so use the generic wrapper: the
    # BGE-specific one prepends a BGE retrieval instruction to every query,
    # which this model was not trained with.
    embeddings = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2',
        model_kwargs={'device': 'cpu'},
    )
    return FAISS.from_texts(pdf_chunks, embeddings)