# umairahmad89
# initial commit
# 67a91b0
from typing import List
from langchain_core.documents.base import Document
import os
from typing import List
from langchain_core.documents.base import Document
from abc import ABC, abstractmethod
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
from unstructured.file_utils.filetype import detect_filetype
from dotenv import load_dotenv
from src.utils.utils import parse_file, get_paged_text, convert_to_docs
class FileProcessor(ABC):
    """Abstract contract for the per-file-type processors in this module.

    The signatures mirror the concrete processors defined below
    (``PDFProcessor``, ``ImageProcessor``, ``DOCXProcessor``) and the way
    ``Processor.process`` invokes them: instance methods, with ``file_path``
    first and an optional ``api_key`` second.

    NOTE(review): the concrete processors in this module do not currently
    subclass this ABC; it is only used as a type annotation.
    """

    @abstractmethod
    def process(self, file_path: str, api_key: str = None) -> str:
        """Return the text content extracted from ``file_path``.

        Args:
            file_path: Path of the file to read.
            api_key: Optional credential for processors that need one;
                implementations that do not need it may ignore it.

        Returns:
            The extracted text as a single string.
        """

    @abstractmethod
    def create_chunks(self, text: str, chunk_size: int, chunk_overlap: int):
        """Split ``text`` into chunks.

        Args:
            text: Text to split.
            chunk_size: Requested chunk size.
            chunk_overlap: Requested overlap between consecutive chunks.

        NOTE(review): no implementation in this module defines the return
        type or the unit of the sizes (characters vs. tokens) -- confirm
        before relying on either.
        """
class PDFProcessor:
    """Processor for PDF files; delegates text extraction to ``parse_file``."""

    def __init__(self) -> None:
        """Nothing to set up: the processor holds no state."""

    def process(self, file_path: str, api_key: str = None) -> str:
        """Return the result of ``parse_file`` for the given file.

        Args:
            file_path: Path of the PDF to parse.
            api_key: Credential forwarded to ``parse_file``; may be ``None``.

        Returns:
            Whatever ``parse_file`` returns (annotated as ``str``).
        """
        # ``parse_file`` takes the key first and the path second, the
        # reverse of this method's own parameter order.
        extracted = parse_file(api_key, file_path)
        return extracted

    def create_chunks(self, text: str, chunk_size: int, chunk_overlap: int):
        """Placeholder: chunking is not implemented, so ``None`` is returned."""
        return None
class ImageProcessor:
    """Processor for image files; reads text through ``UnstructuredFileLoader``."""

    def __init__(self) -> None:
        """Nothing to set up: the processor holds no state."""

    def process(self, file_path: str, api_key: str = None) -> str:
        """Return the ``page_content`` of the first document the loader yields.

        Args:
            file_path: Path of the image to load.
            api_key: Accepted so the signature matches the other processors;
                it is not used here.

        Returns:
            ``page_content`` of the first loaded document.

        Raises:
            IndexError: If the loader returns no documents.
        """
        documents = UnstructuredFileLoader(file_path).load()
        first_document = documents[0]
        return first_document.page_content

    def create_chunks(self, text: str, chunk_size: int, chunk_overlap: int):
        """Placeholder: chunking is not implemented, so ``None`` is returned."""
        return None
class DOCXProcessor:
    """Processor for Word documents; delegates text extraction to ``parse_file``.

    ``Processor`` routes both the DOCX and DOC file types to this class.
    """

    def __init__(self) -> None:
        """Nothing to set up: the processor holds no state."""

    def process(self, file_path: str, api_key: str = None) -> str:
        """Return the result of ``parse_file`` for the given file.

        Args:
            file_path: Path of the document to parse.
            api_key: Credential forwarded to ``parse_file``; may be ``None``.

        Returns:
            Whatever ``parse_file`` returns (annotated as ``str``).
        """
        # Argument order is (key, path) on the helper, (path, key) here.
        parsed_text = parse_file(api_key, file_path)
        return parsed_text

    def create_chunks(self, text: str, chunk_size: int, chunk_overlap: int):
        """Placeholder: chunking is not implemented, so ``None`` is returned."""
        return None
class Processor:
    """Dispatch a file to the processor matching its detected file type.

    The extracted text is split with ``get_paged_text`` and wrapped with
    ``convert_to_docs``, tagging the result with the source file name and
    the number of pieces produced.
    """

    def __init__(self) -> None:
        """Create one processor per supported family and read the API key."""
        self.pdf_processor = PDFProcessor()
        self.image_processor = ImageProcessor()
        self.docx_processor = DOCXProcessor()
        # ``None`` when LLAMA_CLOUD_API_KEY is not set; it is forwarded to
        # the processors unchanged.
        self.api_key = os.environ.get("LLAMA_CLOUD_API_KEY")

    def process(self, file_path: str) -> List[Document]:
        """Extract ``file_path`` into documents.

        Args:
            file_path: Path of the file to process.

        Returns:
            The value of ``convert_to_docs`` for the paged text, with
            metadata ``source`` (base name of ``file_path``) and
            ``num_chunks`` (length of the paged text).

        Raises:
            KeyError: If the detected file type has no registered processor.
                The message names the file, the detected type and the
                supported types.
        """
        filetype = detect_filetype(file_path)
        # Keys are the ``str()`` form of the detected file type.
        filetype_to_processor = {
            "FileType.PDF": self.pdf_processor,
            "FileType.DOCX": self.docx_processor,
            "FileType.DOC": self.docx_processor,
            "FileType.JPG": self.image_processor,
            "FileType.PNG": self.image_processor,
            "FileType.HEIC": self.image_processor,
        }
        filetype_key = str(filetype)
        try:
            processor: FileProcessor = filetype_to_processor[filetype_key]
        except KeyError:
            # Keep the exception type callers already see, but say what
            # went wrong instead of surfacing only the bare lookup key.
            supported = ", ".join(sorted(filetype_to_processor))
            raise KeyError(
                f"Unsupported file type {filetype_key} for {file_path!r}; "
                f"supported types: {supported}"
            ) from None

        docs_strs = get_paged_text(processor.process(file_path, self.api_key))
        return convert_to_docs(
            docs_strs,
            {"source": os.path.basename(file_path), "num_chunks": len(docs_strs)},
        )
if __name__ == "__main__":
    # Manual smoke test: process a sample document and print the result.
    # Load a local .env file (if one exists) first, because Processor reads
    # LLAMA_CLOUD_API_KEY from os.environ when it is constructed.
    load_dotenv()
    processor = Processor()
    print(processor.process("./drm_notes.docx"))