|
from typing import List |
|
from langchain_core.documents.base import Document |
|
|
|
|
|
import os |
|
from typing import List |
|
from langchain_core.documents.base import Document |
|
from abc import ABC, abstractmethod |
|
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader |
|
from unstructured.file_utils.filetype import detect_filetype |
|
from dotenv import load_dotenv |
|
|
|
from src.utils.utils import parse_file, get_paged_text, convert_to_docs |
|
|
|
|
|
class FileProcessor(ABC):
    """Interface for file-type-specific text extractors.

    Concrete implementations (PDF, DOCX, image processors) extract raw text
    from a file on disk and split extracted text into chunks.
    """

    # NOTE(review): the original declared these as @classmethod abstracts with
    # the parameter order (api_key, file_path), but every concrete processor
    # implements them as instance methods taking (file_path, api_key=None).
    # The interface now matches the implementations.
    @abstractmethod
    def process(self, file_path: str, api_key: str = None) -> str:
        """Extract and return the raw text content of the file at *file_path*.

        *api_key* is forwarded to external parsing services by the
        implementations that need one; it may be ``None``.
        """

    @abstractmethod
    def create_chunks(self, text: str, chunk_size: int, chunk_overlap: int):
        """Split *text* into chunks of *chunk_size* characters overlapping by *chunk_overlap*."""
|
|
|
|
|
class PDFProcessor(FileProcessor):
    """Extracts text from PDF files via the ``parse_file`` helper.

    Now explicitly subclasses ``FileProcessor`` — ``Processor.process``
    already annotates its dispatch result with that type.
    """

    def process(self, file_path: str, api_key: str = None) -> str:
        """Parse the PDF at *file_path* and return its extracted text.

        *api_key* is forwarded to the parsing backend (presumably
        LlamaParse, given the ``LLAMA_CLOUD_API_KEY`` env var used by the
        caller — confirm against ``parse_file``).
        """
        return parse_file(api_key, file_path)

    def create_chunks(self, text: str, chunk_size: int, chunk_overlap: int):
        """Chunking is not implemented for PDFs yet; returns ``None``."""
        pass
|
|
|
|
|
class ImageProcessor(FileProcessor):
    """Extracts text from image files (JPG/PNG/HEIC) via ``UnstructuredFileLoader``.

    Now explicitly subclasses ``FileProcessor`` — ``Processor.process``
    already annotates its dispatch result with that type.
    """

    def process(self, file_path: str, api_key: str = None) -> str:
        """OCR/parse the image at *file_path* and return its text content.

        *api_key* is accepted for interface compatibility but unused here;
        the unstructured loader needs no key.
        """
        loader = UnstructuredFileLoader(file_path)
        # The loader yields one Document per file; take the first's text.
        return loader.load()[0].page_content

    def create_chunks(self, text: str, chunk_size: int, chunk_overlap: int):
        """Chunking is not implemented for images yet; returns ``None``."""
        pass
|
|
|
|
|
class DOCXProcessor(FileProcessor):
    """Extracts text from DOC/DOCX files via the ``parse_file`` helper.

    Now explicitly subclasses ``FileProcessor`` — ``Processor.process``
    already annotates its dispatch result with that type.
    """

    def process(self, file_path: str, api_key: str = None) -> str:
        """Parse the document at *file_path* and return its extracted text.

        *api_key* is forwarded to the parsing backend used by ``parse_file``.
        """
        return parse_file(api_key, file_path)

    def create_chunks(self, text: str, chunk_size: int, chunk_overlap: int):
        """Chunking is not implemented for DOC/DOCX yet; returns ``None``."""
        pass
|
|
|
|
|
class Processor:
    """Routes a file to the processor matching its detected type and
    returns the parsed content as LangChain ``Document`` chunks."""

    def __init__(self) -> None:
        self.pdf_processor = PDFProcessor()
        self.image_processor = ImageProcessor()
        self.docx_processor = DOCXProcessor()
        # Key for the LlamaParse-backed processors; None when the env var
        # is unset (image processing does not need it).
        self.api_key = os.environ.get("LLAMA_CLOUD_API_KEY")
        # Dispatch table keyed on the stringified FileType enum member, built
        # once here instead of on every process() call.
        self._filetype_to_processor = {
            "FileType.PDF": self.pdf_processor,
            "FileType.DOCX": self.docx_processor,
            "FileType.DOC": self.docx_processor,
            "FileType.JPG": self.image_processor,
            "FileType.PNG": self.image_processor,
            "FileType.HEIC": self.image_processor,
        }

    def process(self, file_path) -> List[Document]:
        """Parse *file_path* and return its text as a list of ``Document``s.

        Raises:
            ValueError: if the detected file type has no registered processor
                (the original raised a bare ``KeyError`` here).
        """
        filetype = str(detect_filetype(file_path))
        processor: FileProcessor = self._filetype_to_processor.get(filetype)
        if processor is None:
            raise ValueError(
                f"Unsupported file type {filetype} for file {file_path!r}; "
                f"supported: {sorted(self._filetype_to_processor)}"
            )

        docs_strs = get_paged_text(processor.process(file_path, self.api_key))

        return convert_to_docs(
            docs_strs,
            {"source": os.path.basename(file_path), "num_chunks": len(docs_strs)},
        )
|
|
|
|
|
if __name__ == "__main__":
    # Smoke test: parse a local DOCX and print the resulting documents.
    file_processor = Processor()
    docs = file_processor.process("./drm_notes.docx")
    print(docs)