Spaces:
Sleeping
Sleeping
import re | |
from io import BytesIO | |
from typing import Tuple, List | |
import pickle | |
from langchain.docstore.document import Document | |
from langchain.embeddings.huggingface import HuggingFaceEmbeddings | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain.vectorstores.faiss import FAISS | |
from pypdf import PdfReader | |
import faiss | |
def parse_pdf(file: BytesIO, filename: str) -> Tuple[List[str], str]:
    """Extract the text of every page in a PDF and tidy its line breaks.

    Each page's extracted text goes through three substitutions, in order:

    1. A word split by a hyphen followed by a newline is glued back
       together (``"exam-\\nple"`` becomes ``"example"``).
    2. After stripping surrounding whitespace, a newline is replaced by a
       single space unless it is preceded by newline+whitespace or
       followed by whitespace+newline.
    3. Two newlines separated only by optional whitespace are normalised
       to exactly ``"\\n\\n"``.

    Args:
        file: Binary stream handed to ``PdfReader``.
        filename: Returned unchanged alongside the page texts.

    Returns:
        A ``(pages, filename)`` tuple, where ``pages`` holds one cleaned
        string per PDF page, in page order.
    """
    hyphen_break = re.compile(r"(\w+)-\n(\w+)")
    lone_newline = re.compile(r"(?<!\n\s)\n(?!\s\n)")
    blank_run = re.compile(r"\n\s*\n")

    pages = []
    for pdf_page in PdfReader(file).pages:
        raw = pdf_page.extract_text()
        dehyphenated = hyphen_break.sub(r"\1\2", raw)
        flowed = lone_newline.sub(" ", dehyphenated.strip())
        pages.append(blank_run.sub("\n\n", flowed))
    return pages, filename
def text_to_docs(text: List[str], filename: str) -> List[Document]:
    """Split page texts into chunk-sized ``Document`` objects with metadata.

    Args:
        text: Page texts in page order. A single ``str`` is also accepted
            and treated as a one-page document.
        filename: Stored under the ``"filename"`` metadata key of every
            chunk.

    Returns:
        One ``Document`` per chunk, in page order then chunk order. Each
        carries the metadata keys ``"page"`` (1-based page number),
        ``"chunk"`` (0-based index within its page), ``"source"``
        (``"<page>-<chunk>"``) and ``"filename"``.
    """
    if isinstance(text, str):
        text = [text]

    # The splitter configuration is the same for every page, so build it
    # once instead of once per page.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=4000,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
        chunk_overlap=0,
    )

    doc_chunks = []
    # Track the page number in its own variable rather than reading it back
    # from a Document that the inner loop rebinds on every chunk.
    for page_number, page_text in enumerate(text, start=1):
        chunks = text_splitter.split_text(page_text)
        for chunk_number, chunk in enumerate(chunks):
            doc_chunks.append(
                Document(
                    page_content=chunk,
                    metadata={
                        "page": page_number,
                        "chunk": chunk_number,
                        "source": f"{page_number}-{chunk_number}",
                        "filename": filename,
                    },
                )
            )
    return doc_chunks
def docs_to_index(docs, huggingface_model_name):
    """Embed documents with a Hugging Face model and build a FAISS index.

    Args:
        docs: Documents passed to ``FAISS.from_documents``.
        huggingface_model_name: Model name given to
            ``HuggingFaceEmbeddings``. When falsy (``None`` or ``""``),
            ``"all-MiniLM-L6-v2"`` is used.

    Returns:
        The vector store returned by ``FAISS.from_documents``.
    """
    # Honour the caller's model choice; previously this argument was
    # accepted but ignored in favour of a hard-coded model name.
    model_name = huggingface_model_name or "all-MiniLM-L6-v2"
    embedding_model = HuggingFaceEmbeddings(model_name=model_name)
    return FAISS.from_documents(docs, embedding_model)
def get_index_for_pdf(pdf_files, pdf_names, huggingface_model_name):
    """Build a single index covering the chunks of all given PDFs.

    Args:
        pdf_files: Raw PDF contents; each item is wrapped in ``BytesIO``
            before being parsed.
        pdf_names: Names paired positionally with ``pdf_files``; each is
            passed to ``parse_pdf`` as the file name.
        huggingface_model_name: Forwarded to ``docs_to_index``.

    Returns:
        The index returned by ``docs_to_index`` for the combined chunks.
    """
    documents = []
    # NOTE: zip stops at the shorter input, so any files or names without a
    # counterpart are ignored.
    for pdf_file, pdf_name in zip(pdf_files, pdf_names):
        text, filename = parse_pdf(BytesIO(pdf_file), pdf_name)
        # Extend in place; rebuilding the list with ``+`` would copy every
        # previously collected chunk on each iteration.
        documents.extend(text_to_docs(text, filename))
    return docs_to_index(documents, huggingface_model_name)