File size: 837 Bytes
32efd97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
from typing import List

from chainlit.types import AskFileResponse
from langchain.docstore.document import Document
from pypdf import PdfReader


def get_docs(files: List[AskFileResponse], splitter) -> List[Document]:
    """Load uploaded PDF files and split them into annotated chunks.

    Each file is opened with ``PdfReader`` and every page becomes one
    ``Document`` whose metadata records the file path (``"source"``) and the
    page number reported by pypdf (``"page"``). The pages of a file are then
    passed to ``splitter.split_documents`` and each resulting chunk gets a
    ``"chunk"`` metadata entry.

    Args:
        files: Uploaded files; only their ``path`` attribute is read.
        splitter: Object exposing ``split_documents(documents)``
            (presumably a LangChain text splitter -- confirm against caller).

    Returns:
        A flat list of chunks for all files, in input-file order. The
        ``"chunk"`` counter starts at 1 and restarts for every file.
    """
    chunks: List[Document] = []
    for file in files:
        reader = PdfReader(file.path)
        pages = [
            Document(
                page_content=page.extract_text(),
                metadata={"source": file.path, "page": page.page_number},
            )
            for page in reader.pages
        ]
        file_chunks = splitter.split_documents(pages)
        # Number chunks per file (1-based) so a chunk can be located within
        # its source document, not within the whole upload batch.
        for index, chunk in enumerate(file_chunks, start=1):
            chunk.metadata["chunk"] = index
        chunks.extend(file_chunks)
    return chunks