DocuQuery2 / src /loaders.py
jjovalle
app
32efd97
raw
history blame contribute delete
837 Bytes
from typing import List
from chainlit.types import AskFileResponse
from langchain.docstore.document import Document
from pypdf import PdfReader
def get_docs(files: List[AskFileResponse], splitter) -> List[Document]:
    """Load uploaded PDFs and split them into chunk-level documents.

    Each PDF is read page by page; every page becomes a ``Document`` whose
    metadata records the file path (``"source"``) and the page number
    reported by pypdf (``"page"``). The pages of each file are handed to
    ``splitter.split_documents`` and the resulting chunks are numbered
    starting at 1 within that file under the ``"chunk"`` metadata key.

    Args:
        files: Uploaded files; only ``file.path`` is read from each one.
        splitter: Object exposing ``split_documents(documents)`` whose
            result is an iterable of documents with a mutable ``metadata``
            mapping.

    Returns:
        A single flat list holding the chunks of every file, in the order
        the files were given. Empty when ``files`` is empty.
    """
    chunks: List[Document] = []
    for file in files:
        pages = [
            Document(
                page_content=page.extract_text(),
                metadata={"source": file.path, "page": page.page_number},
            )
            for page in PdfReader(file.path).pages
        ]
        # Chunk numbering restarts at 1 for every file, so a chunk is
        # identified by the pair (source, chunk) rather than by chunk alone.
        for number, chunk in enumerate(splitter.split_documents(pages), start=1):
            chunk.metadata["chunk"] = number
            chunks.append(chunk)
    return chunks