File size: 1,057 Bytes
ea83a52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import os
from langchain_community.document_loaders import (
    PyMuPDFLoader,
    TextLoader,
    Docx2txtLoader,
    DirectoryLoader,
)

class DocumentProcessor:
    """Load the supported documents found in a single directory.

    Files are selected by extension (``.pdf``, ``.txt``, ``.docx``, ``.doc``)
    and each extension is read through its own LangChain loader class via
    ``DirectoryLoader``.
    """

    def __init__(self, path: str):
        """Store the directory to scan.

        Args:
            path: Directory whose files will be loaded by ``files_to_texts``.
        """
        self.path = path

    def files_to_texts(self) -> list:
        """Load every supported file in ``self.path``.

        A ``DirectoryLoader`` is created only for extensions that actually
        occur among the directory's entries, so loaders for absent file types
        are never instantiated.

        Returns:
            The results of each ``DirectoryLoader.load()`` call concatenated
            into one list, in the order pdf, txt, docx, doc. The list is empty
            when the directory contains no entry with a supported extension.

        Raises:
            OSError: Propagated from ``os.listdir`` when ``self.path`` cannot
                be listed (for example, it does not exist).
        """
        # List the directory once; the listing is the same for every glob.
        filenames = os.listdir(self.path)
        if not filenames:
            return []

        # glob pattern -> (loader class, keyword arguments for that loader).
        # Every entry has the same shape so it can be unpacked directly.
        loaders_config = {
            "*.pdf": (PyMuPDFLoader, None),
            "*.txt": (TextLoader, {"encoding": "utf-8"}),
            "*.docx": (Docx2txtLoader, None),
            # NOTE(review): docx2txt targets the .docx format; confirm that
            # legacy binary .doc files really load through this loader.
            "*.doc": (Docx2txtLoader, None),
        }

        documents = []
        for pattern, (loader_cls, loader_kwargs) in loaders_config.items():
            # "*.pdf" -> ".pdf": the part after the wildcard is the suffix
            # an entry name must end with.
            suffix = pattern[1:]
            if not any(name.endswith(suffix) for name in filenames):
                continue

            directory_loader = DirectoryLoader(
                path=self.path,
                glob=pattern,
                loader_cls=loader_cls,
                loader_kwargs=loader_kwargs,
            )
            documents.extend(directory_loader.load())

        return documents