"""Per-format file processors and a dispatcher that routes a file to one.

``Processor`` detects a file's type, hands the file to the matching
format-specific processor to obtain its text, and converts that text into
documents via the helpers in ``src.utils.utils``.
"""

import os
from abc import ABC, abstractmethod
from typing import List, Optional

# NOTE(review): load_dotenv is imported but never called in this module, so
# LLAMA_CLOUD_API_KEY must already be present in the process environment.
# Confirm whether load_dotenv() is expected to run elsewhere.
from dotenv import load_dotenv
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
from langchain_core.documents.base import Document
from unstructured.file_utils.filetype import detect_filetype

from src.utils.utils import convert_to_docs, get_paged_text, parse_file


class FileProcessor(ABC):
    """Interface implemented by every format-specific processor.

    The signatures mirror what the concrete processors implement and what
    ``Processor.process`` calls: the file path comes first and the API key
    is optional.
    """

    @abstractmethod
    def process(self, file_path: str, api_key: Optional[str] = None) -> str:
        """Return the text extracted from ``file_path``.

        Args:
            file_path: Path of the file to read.
            api_key: Credential for processors that need one; may be None.
        """

    @abstractmethod
    def create_chunks(self, text: str, chunk_size: int, chunk_overlap: int):
        """Split ``text`` into chunks of ``chunk_size`` with ``chunk_overlap``.

        No processor in this module implements chunking yet.
        """


class PDFProcessor(FileProcessor):
    """Extracts text from PDF files by delegating to ``parse_file``."""

    def process(self, file_path: str, api_key: Optional[str] = None) -> str:
        """Return whatever ``parse_file`` produces for ``file_path``."""
        # parse_file takes the key first, the reverse of this method's order.
        return parse_file(api_key, file_path)

    def create_chunks(self, text: str, chunk_size: int, chunk_overlap: int):
        """Placeholder: chunking is not implemented; returns None."""
        return None


class ImageProcessor(FileProcessor):
    """Extracts text from image files with ``UnstructuredFileLoader``."""

    def process(self, file_path: str, api_key: Optional[str] = None) -> str:
        """Return the page content of the first document the loader yields.

        ``api_key`` is accepted only so the signature matches the other
        processors; it is not used here.
        """
        loaded = UnstructuredFileLoader(file_path).load()
        return loaded[0].page_content

    def create_chunks(self, text: str, chunk_size: int, chunk_overlap: int):
        """Placeholder: chunking is not implemented; returns None."""
        return None


class DOCXProcessor(FileProcessor):
    """Extracts text from Word files by delegating to ``parse_file``."""

    def process(self, file_path: str, api_key: Optional[str] = None) -> str:
        """Return whatever ``parse_file`` produces for ``file_path``."""
        # parse_file takes the key first, the reverse of this method's order.
        return parse_file(api_key, file_path)

    def create_chunks(self, text: str, chunk_size: int, chunk_overlap: int):
        """Placeholder: chunking is not implemented; returns None."""
        return None


class Processor:
    """Routes a file to the processor that handles its detected type."""

    def __init__(self) -> None:
        """Create one processor per format and read the API key.

        The key comes from the ``LLAMA_CLOUD_API_KEY`` environment variable
        and is None when that variable is unset.
        """
        self.pdf_processor = PDFProcessor()
        self.image_processor = ImageProcessor()
        self.docx_processor = DOCXProcessor()
        self.api_key = os.environ.get("LLAMA_CLOUD_API_KEY")

    def process(self, file_path) -> List[Document]:
        """Extract the text of ``file_path`` and convert it into documents.

        Args:
            file_path: Path of the file to process.

        Returns:
            The result of ``convert_to_docs`` for the paged text, with
            metadata holding the file's base name under ``"source"`` and the
            number of text pieces under ``"num_chunks"``.

        Raises:
            KeyError: If the detected file type has no registered processor.
        """
        filetype = detect_filetype(file_path)

        # Keys are the str() form of the detected type, so the lookup below
        # must stringify the detection result before indexing.
        # Built per call so reassigned processor attributes are picked up.
        filetype_to_processor = {
            "FileType.PDF": self.pdf_processor,
            "FileType.DOCX": self.docx_processor,
            "FileType.DOC": self.docx_processor,
            "FileType.JPG": self.image_processor,
            "FileType.PNG": self.image_processor,
            "FileType.HEIC": self.image_processor,
        }

        try:
            processor: FileProcessor = filetype_to_processor[str(filetype)]
        except KeyError:
            # Same exception type as a bare lookup failure, but the message
            # now says which file was rejected and what is accepted.
            supported = ", ".join(sorted(filetype_to_processor))
            raise KeyError(
                f"Unsupported file type {filetype} for {file_path!r}; "
                f"supported types: {supported}"
            ) from None

        docs_strs = get_paged_text(processor.process(file_path, self.api_key))
        return convert_to_docs(
            docs_strs,
            {"source": os.path.basename(file_path), "num_chunks": len(docs_strs)},
        )


if __name__ == "__main__":
    processor = Processor()
    print(processor.process("./drm_notes.docx"))