Spaces:

geekyrakshit
/

medrag

Sleeping

@@ -7,15 +7,6 @@ import PyPDF2
 import rich
 import weave
 from firerequests import FireRequests
-from pydantic import BaseModel
-class Page(BaseModel):
-    text: str
-    page_idx: int
-    document_name: str
-    file_path: str
-    file_url: str
 class TextLoader:
@@ -25,7 +16,7 @@ class TextLoader:
     This class handles the downloading of a PDF file from a given URL if it does not already exist locally.
     It uses PyPDF2 to read the PDF and pymupdf4llm to convert pages to markdown. The processed pages are stored in a list
     of Page objects, which can be optionally published to a Weave dataset.
     !!! example "Example Usage"
         ```python
         import asyncio
@@ -81,7 +72,6 @@ class TextLoader:
             end_page = self.page_count - 1
         return start_page, end_page
-    @weave.op()
     async def load_data(
         self,
         start_page: Optional[int] = None,
@@ -111,7 +101,7 @@ class TextLoader:
             ValueError: If the specified start_page or end_page is out of bounds of the document's page count.
         """
         start_page, end_page = self.get_page_indices(start_page, end_page)
-        pages: list[Page] = []
         processed_pages_counter: int = 1
         total_pages = end_page - start_page
@@ -121,13 +111,13 @@ class TextLoader:
                 doc=self.document_file_path, pages=[page_idx], show_progress=False
             )
             pages.append(
-                Page(
-                    text=text,
-                    page_idx=page_idx,
-                    document_name=self.document_name,
-                    file_path=self.document_file_path,
-                    file_url=self.url,
-                ).model_dump()
             )
             rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
             processed_pages_counter += 1

 import rich
 import weave
 from firerequests import FireRequests
 class TextLoader:
     This class handles the downloading of a PDF file from a given URL if it does not already exist locally.
     It uses PyPDF2 to read the PDF and pymupdf4llm to convert pages to markdown. The processed pages are stored in a list
     of Page objects, which can be optionally published to a Weave dataset.
     !!! example "Example Usage"
         ```python
         import asyncio
             end_page = self.page_count - 1
         return start_page, end_page
     async def load_data(
         self,
         start_page: Optional[int] = None,
             ValueError: If the specified start_page or end_page is out of bounds of the document's page count.
         """
         start_page, end_page = self.get_page_indices(start_page, end_page)
+        pages = []
         processed_pages_counter: int = 1
         total_pages = end_page - start_page
                 doc=self.document_file_path, pages=[page_idx], show_progress=False
             )
             pages.append(
+                {
+                    "text": text,
+                    "page_idx": page_idx,
+                    "document_name": self.document_name,
+                    "file_path": self.document_file_path,
+                    "file_url": self.url,
+                }
             )
             rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
             processed_pages_counter += 1

medrag_multi_modal/document_loader/load_text_image.py ADDED Viewed

	@@ -0,0 +1,137 @@

+import asyncio
+import os
+from glob import glob
+from typing import Optional
+import pymupdf4llm
+import rich
+import weave
+from PIL import Image
+from medrag_multi_modal.document_loader.load_text import TextLoader
+class TextImageLoader(TextLoader):
+    """
+    A class for loading and processing text and images from a document.
+    The TextImageLoader class extends the TextLoader class to provide
+    functionality for extracting both text and images from a document
+    specified by a URL, document name, and file path. It processes the
+    document asynchronously, allowing for efficient handling of large
+    documents.
+    !!! example "Example Usage"
+        ```python
+        import asyncio
+        import weave
+        from medrag_multi_modal.document_loader import TextImageLoader
+        weave.init(project_name="ml-colabs/medrag-multi-modal")
+        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
+        loader = TextImageLoader(
+            url=url,
+            document_name="Gray's Anatomy",
+            document_file_path="grays_anatomy.pdf",
+        )
+        asyncio.run(
+            loader.load_data(
+                start_page=20,
+                end_page=25,
+                weave_dataset_name="grays-anatomy-text",
+            )
+        )
+        ```
+    Args:
+        url (str): The URL of the document to be processed.
+        document_name (str): The name of the document.
+        document_file_path (str): The file path where the document is stored.
+    """
+    def __init__(self, url: str, document_name: str, document_file_path: str):
+        super().__init__(url, document_name, document_file_path)
+    async def load_data(
+        self,
+        start_page: Optional[int] = None,
+        end_page: Optional[int] = None,
+        weave_dataset_name: Optional[str] = None,
+        image_path: Optional[str] = "./images",
+        dpi: int = 300,
+    ):
+        """
+        Asynchronously loads and processes text and images from a specified range of pages
+        in a document. This function extracts text in markdown format and images in PNG
+        format from the document, storing them in a list of dictionaries, each representing
+        a page. Optionally, the processed data can be published to a Weave dataset.
+        The function first determines the page indices to process using the
+        `get_page_indices` method. It then defines an asynchronous inner function,
+        `process_page`, which handles the extraction of text and images for a single page.
+        The text is extracted using the `pymupdf4llm.to_markdown` function, and images are
+        retrieved from the specified image path. The processed data is appended to the
+        `pages` list.
+        The function creates a list of tasks for processing each page asynchronously and
+        awaits their completion. If a `weave_dataset_name` is provided, the processed data
+        is published to a Weave dataset. Finally, the function returns the list of processed
+        pages.
+        Args:
+            start_page (Optional[int]): The starting page index for processing. If None,
+                defaults to the first page of the document.
+            end_page (Optional[int]): The ending page index for processing. If None,
+                defaults to the last page of the document.
+            weave_dataset_name (Optional[str]): The name of the Weave dataset to publish
+                the processed data to. If None, the data is not published.
+            image_path (Optional[str]): The directory path where extracted images are
+                stored. Defaults to "./images".
+            dpi (int): The resolution in dots per inch for image extraction. Defaults to 300.
+        Returns:
+            List[Dict]: A list of dictionaries, each containing the extracted text, page
+            index, document name, file path, file URL, and a list of images for each page
+            processed.
+        """
+        start_page, end_page = self.get_page_indices(start_page, end_page)
+        pages = []
+        processed_pages_counter: int = 1
+        total_pages = end_page - start_page
+        async def process_page(page_idx):
+            nonlocal processed_pages_counter
+            text = pymupdf4llm.to_markdown(
+                doc=self.document_file_path,
+                pages=[page_idx],
+                show_progress=False,
+                write_images=True,
+                image_format="png",
+                dpi=dpi,
+                image_path=image_path,
+            )
+            image_paths = glob(
+                os.path.join(image_path, f"{self.document_file_path}-{page_idx}-*.png")
+            )
+            print(image_paths)
+            pages.append(
+                {
+                    "text": text,
+                    "page_idx": page_idx,
+                    "document_name": self.document_name,
+                    "file_path": self.document_file_path,
+                    "file_url": self.url,
+                    "images": [Image.open(image) for image in image_paths],
+                }
+            )
+            rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
+            processed_pages_counter += 1
+        tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
+        for task in asyncio.as_completed(tasks):
+            await task
+        if weave_dataset_name:
+            weave.publish(weave.Dataset(name=weave_dataset_name, rows=pages))
+        return pages

mkdocs.yml CHANGED Viewed

@@ -61,5 +61,6 @@ nav:
   - Home: 'index.md'
   - Document Loader:
     - Text Loader: 'document_loader/load_text.md'
 repo_url: https://github.com/soumik12345/medrag-multi-modal

   - Home: 'index.md'
   - Document Loader:
     - Text Loader: 'document_loader/load_text.md'
+    - Text and Image Loader: 'document_loader/load_text_image.md'
 repo_url: https://github.com/soumik12345/medrag-multi-modal