Spaces:

geekyrakshit
/

medrag

Sleeping

App Files Files Community

geekyrakshit committed on Oct 10, 2024

Commit

bd0ff68

1 Parent(s): d529654

update: ImageLoader

Browse files

Files changed (2) hide show

.gitignore +2 -1
medrag_multi_modal/document_loader/load_image.py +31 -16

.gitignore CHANGED Viewed

@@ -6,4 +6,5 @@ cursor_prompt.txt
 .ruff_cache/
 test.py
 **.pdf
-images/

 .ruff_cache/
 test.py
 **.pdf
+images/
+wandb/

medrag_multi_modal/document_loader/load_image.py CHANGED Viewed

@@ -1,16 +1,19 @@
 import asyncio
 import rich
 import weave
 from pdf2image.pdf2image import convert_from_path
 from PIL import Image
 from medrag_multi_modal.document_loader.load_text import TextLoader
 class ImageLoader(TextLoader):
     """
-    ImageLoader is a class that extends the TextLoader class to handle the extraction and
     loading of images from a PDF file.
     This class provides functionality to convert specific pages of a PDF document into images
@@ -20,13 +23,13 @@ class ImageLoader(TextLoader):
         ```python
         import asyncio
-        import weave
         from dotenv import load_dotenv
         from medrag_multi_modal.document_loader import ImageLoader
         load_dotenv()
-        weave.init(project_name="ml-colabs/medrag-multi-modal")
         url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
         loader = ImageLoader(
             url=url,
@@ -37,7 +40,7 @@ class ImageLoader(TextLoader):
             loader.load_data(
                 start_page=31,
                 end_page=33,
-                weave_dataset_name="grays-anatomy-text",
             )
         )
         ```
@@ -59,7 +62,13 @@ class ImageLoader(TextLoader):
         )[0]
         return image
-    async def load_data(self, start_page: int, end_page: int, weave_dataset_name: str):
         """
         Asynchronously loads images from a PDF file specified by a URL or local file path,
         processes the images for the specified range of pages, and optionally publishes them
@@ -68,14 +77,15 @@ class ImageLoader(TextLoader):
         This function reads the specified range of pages from a PDF document, converts each page
         to an image using the `pdf2image` library, and returns a list of dictionaries containing
         the image and metadata for each processed page. It processes pages concurrently using
-        `asyncio` for efficiency. If a weave_dataset_name is provided, the processed pages are
-        published to a Weave dataset.
         Args:
             start_page (int): The starting page index (0-based) to process.
             end_page (int): The ending page index (0-based) to process.
-            weave_dataset_name (str): The name of the Weave dataset to publish the pages to,
-                if provided.
         Returns:
             list[dict]: A list of dictionaries, each containing the image and metadata for a
@@ -85,6 +95,7 @@ class ImageLoader(TextLoader):
             ValueError: If the specified start_page or end_page is out of bounds of the document's
                 page count.
         """
         start_page, end_page = self.get_page_indices(start_page, end_page)
         pages = []
         processed_pages_counter: int = 1
@@ -92,25 +103,29 @@ class ImageLoader(TextLoader):
         async def process_page(page_idx):
             nonlocal processed_pages_counter
             pages.append(
                 {
-                    "image": convert_from_path(
-                        self.document_file_path,
-                        first_page=page_idx + 1,
-                        last_page=page_idx + 1,
-                    )[0],
                     "page_idx": page_idx,
                     "document_name": self.document_name,
                     "file_path": self.document_file_path,
                     "file_url": self.url,
                 }
             )
             rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
             processed_pages_counter += 1
         tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
         for task in asyncio.as_completed(tasks):
             await task
-        if weave_dataset_name:
-            weave.publish(weave.Dataset(name=weave_dataset_name, rows=pages))
         return pages

 import asyncio
+import os
+from typing import Optional
 import rich
 import weave
 from pdf2image.pdf2image import convert_from_path
 from PIL import Image
+import wandb
 from medrag_multi_modal.document_loader.load_text import TextLoader
 class ImageLoader(TextLoader):
     """
+    ImageLoader is a class that extends the `TextLoader` class to handle the extraction and
     loading of images from a PDF file.
     This class provides functionality to convert specific pages of a PDF document into images
         ```python
         import asyncio
+        import wandb
         from dotenv import load_dotenv
         from medrag_multi_modal.document_loader import ImageLoader
         load_dotenv()
+        wandb.init(project="medrag-multi-modal", entity="ml-colabs")
         url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
         loader = ImageLoader(
             url=url,
             loader.load_data(
                 start_page=31,
                 end_page=33,
+                dataset_name="grays-anatomy-images",
             )
         )
         ```
         )[0]
         return image
+    async def load_data(
+        self,
+        start_page: int,
+        end_page: int,
+        image_save_dir: str = "./images",
+        dataset_name: Optional[str] = None,
+    ):
         """
         Asynchronously loads images from a PDF file specified by a URL or local file path,
         processes the images for the specified range of pages, and optionally publishes them
         This function reads the specified range of pages from a PDF document, converts each page
         to an image using the `pdf2image` library, and returns a list of dictionaries containing
         the image and metadata for each processed page. It processes pages concurrently using
+        `asyncio` for efficiency. If a `dataset_name` is provided, the processed page images are
+        published to a Weights & Biases artifact and the corresponding metadata to a Weave dataset
+        with the specified name.
         Args:
             start_page (int): The starting page index (0-based) to process.
             end_page (int): The ending page index (0-based) to process.
+            dataset_name (Optional[str]): The name of the Weave dataset to publish the
+                processed images to. Defaults to None.
         Returns:
             list[dict]: A list of dictionaries, each containing the image and metadata for a
             ValueError: If the specified start_page or end_page is out of bounds of the document's
                 page count.
         """
+        os.makedirs(image_save_dir, exist_ok=True)
         start_page, end_page = self.get_page_indices(start_page, end_page)
         pages = []
         processed_pages_counter: int = 1
         async def process_page(page_idx):
             nonlocal processed_pages_counter
+            image = convert_from_path(
+                self.document_file_path,
+                first_page=page_idx + 1,
+                last_page=page_idx + 1,
+            )[0]
             pages.append(
                 {
                     "page_idx": page_idx,
                     "document_name": self.document_name,
                     "file_path": self.document_file_path,
                     "file_url": self.url,
                 }
             )
+            image.save(os.path.join(image_save_dir, f"{page_idx}.png"))
             rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
             processed_pages_counter += 1
         tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
         for task in asyncio.as_completed(tasks):
             await task
+        if dataset_name:
+            artifact = wandb.Artifact(name=dataset_name, type="dataset")
+            artifact.add_dir(local_path=image_save_dir)
+            artifact.save()
+            weave.publish(weave.Dataset(name=dataset_name, rows=pages))
         return pages