from typing import List

from chainlit.types import AskFileResponse
from langchain.docstore.document import Document
from pypdf import PdfReader


def get_docs(files: List[AskFileResponse], splitter) -> List[Document]:
    """Read uploaded PDF files and return their text as a flat list of chunks.

    Each page of each file is wrapped in a ``Document`` whose metadata holds
    the file path (``"source"``) and the page's ``page_number`` as reported by
    pypdf (``"page"``). The pages of one file are then passed together to
    ``splitter.split_documents``, and every resulting chunk is tagged with a
    ``"chunk"`` metadata entry.

    Args:
        files: Uploaded files; only the ``path`` attribute of each is read.
        splitter: Object exposing ``split_documents(documents)`` that returns
            an iterable of ``Document`` chunks.

    Returns:
        All chunks from all files in a single list, in file order and then in
        the order returned by the splitter. An empty ``files`` list yields an
        empty list.
    """
    chunks: List[Document] = []
    for file in files:
        reader = PdfReader(file.path)
        pages = [
            Document(
                page_content=page.extract_text(),
                metadata={"source": file.path, "page": page.page_number},
            )
            for page in reader.pages
        ]
        # Chunk numbers are 1-based and restart for every file, so a chunk is
        # identified by the pair (source, chunk) rather than by chunk alone.
        for number, chunk in enumerate(splitter.split_documents(pages), start=1):
            chunk.metadata["chunk"] = number
            chunks.append(chunk)
    return chunks