Spaces:
Running
Running
File size: 943 Bytes
1bdec92 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 |
import PyPDF2
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import FAISS
def read_pdf(uploaded_file):
pdf_reader = PyPDF2.PdfReader(uploaded_file)
text = ""
for page in pdf_reader.pages:
text += page.extract_text()
return text
def Chunks(docs):
text_splitter = RecursiveCharacterTextSplitter(
# Set a really small chunk size, just to show.
chunk_size = 1000,
chunk_overlap = 100,
)
doc = text_splitter.split_text(docs)
return doc
def PDF_4_QA(file):
content = read_pdf(file)
pdf_chunks = Chunks(docs=content)
embeddings = HuggingFaceBgeEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2',
model_kwargs={'device': 'cpu'})
vectorstore_openai = FAISS.from_texts(pdf_chunks, embeddings)
return vectorstore_openai |