Spaces:
Sleeping
Sleeping
File size: 1,057 Bytes
ea83a52 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
import os
from langchain_community.document_loaders import (
PyMuPDFLoader,
TextLoader,
Docx2txtLoader,
DirectoryLoader,
)
class DocumentProcessor:
    """Load the supported documents found directly inside one directory.

    Files are selected by extension (``.pdf``, ``.txt``, ``.docx``, ``.doc``)
    and each extension is handled by its own loader class.
    """

    def __init__(self, path: str):
        """Remember the directory to read from.

        Args:
            path: Directory whose files will be loaded.
        """
        self.path = path

    def files_to_texts(self) -> list:
        """Load every supported file in ``self.path``.

        A ``DirectoryLoader`` is created for each glob pattern that at least
        one entry name in the directory ends with; patterns with no matching
        entry are skipped. All loaders are constructed first, then loaded in
        pattern order.

        Returns:
            A single list holding the results of each loader's ``load()``
            call, concatenated in pattern order.

        Raises:
            OSError: Propagated from ``os.listdir`` when ``self.path`` cannot
                be listed (for example, it does not exist).
        """
        # Each pattern maps to (loader class, keyword arguments or None) so
        # every entry can be unpacked the same way.
        # NOTE(review): Docx2txtLoader targets the .docx format; legacy
        # binary .doc files are probably not readable by it -- confirm.
        loaders_config = {
            "*.pdf": (PyMuPDFLoader, None),
            "*.txt": (TextLoader, {"encoding": "utf-8"}),
            "*.docx": (Docx2txtLoader, None),
            "*.doc": (Docx2txtLoader, None),
        }

        # List the directory once instead of once per pattern.
        entry_names = os.listdir(self.path)

        loaders = [
            DirectoryLoader(
                path=self.path,
                glob=pattern,
                loader_cls=loader_cls,
                loader_kwargs=loader_kwargs,
            )
            for pattern, (loader_cls, loader_kwargs) in loaders_config.items()
            # pattern[1:] drops the leading "*", leaving the ".ext" suffix.
            if any(name.endswith(pattern[1:]) for name in entry_names)
        ]

        documents = []
        for loader in loaders:
            documents.extend(loader.load())
        return documents
|