geekyrakshit committed on
Commit
bd0ff68
·
1 Parent(s): d529654

update: ImageLoader

Browse files
.gitignore CHANGED
@@ -6,4 +6,5 @@ cursor_prompt.txt
6
  .ruff_cache/
7
  test.py
8
  **.pdf
9
- images/
 
 
6
  .ruff_cache/
7
  test.py
8
  **.pdf
9
+ images/
10
+ wandb/
medrag_multi_modal/document_loader/load_image.py CHANGED
@@ -1,16 +1,19 @@
1
  import asyncio
 
 
2
 
3
  import rich
4
  import weave
5
  from pdf2image.pdf2image import convert_from_path
6
  from PIL import Image
7
 
 
8
  from medrag_multi_modal.document_loader.load_text import TextLoader
9
 
10
 
11
  class ImageLoader(TextLoader):
12
  """
13
- ImageLoader is a class that extends the TextLoader class to handle the extraction and
14
  loading of images from a PDF file.
15
 
16
  This class provides functionality to convert specific pages of a PDF document into images
@@ -20,13 +23,13 @@ class ImageLoader(TextLoader):
20
  ```python
21
  import asyncio
22
 
23
- import weave
24
  from dotenv import load_dotenv
25
 
26
  from medrag_multi_modal.document_loader import ImageLoader
27
 
28
  load_dotenv()
29
- weave.init(project_name="ml-colabs/medrag-multi-modal")
30
  url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
31
  loader = ImageLoader(
32
  url=url,
@@ -37,7 +40,7 @@ class ImageLoader(TextLoader):
37
  loader.load_data(
38
  start_page=31,
39
  end_page=33,
40
- weave_dataset_name="grays-anatomy-text",
41
  )
42
  )
43
  ```
@@ -59,7 +62,13 @@ class ImageLoader(TextLoader):
59
  )[0]
60
  return image
61
 
62
- async def load_data(self, start_page: int, end_page: int, weave_dataset_name: str):
 
 
 
 
 
 
63
  """
64
  Asynchronously loads images from a PDF file specified by a URL or local file path,
65
  processes the images for the specified range of pages, and optionally publishes them
@@ -68,14 +77,15 @@ class ImageLoader(TextLoader):
68
  This function reads the specified range of pages from a PDF document, converts each page
69
  to an image using the `pdf2image` library, and returns a list of dictionaries containing
70
  the image and metadata for each processed page. It processes pages concurrently using
71
- `asyncio` for efficiency. If a weave_dataset_name is provided, the processed pages are
72
- published to a Weave dataset.
 
73
 
74
  Args:
75
  start_page (int): The starting page index (0-based) to process.
76
  end_page (int): The ending page index (0-based) to process.
77
- weave_dataset_name (str): The name of the Weave dataset to publish the pages to,
78
- if provided.
79
 
80
  Returns:
81
  list[dict]: A list of dictionaries, each containing the image and metadata for a
@@ -85,6 +95,7 @@ class ImageLoader(TextLoader):
85
  ValueError: If the specified start_page or end_page is out of bounds of the document's
86
  page count.
87
  """
 
88
  start_page, end_page = self.get_page_indices(start_page, end_page)
89
  pages = []
90
  processed_pages_counter: int = 1
@@ -92,25 +103,29 @@ class ImageLoader(TextLoader):
92
 
93
  async def process_page(page_idx):
94
  nonlocal processed_pages_counter
 
 
 
 
 
95
  pages.append(
96
  {
97
- "image": convert_from_path(
98
- self.document_file_path,
99
- first_page=page_idx + 1,
100
- last_page=page_idx + 1,
101
- )[0],
102
  "page_idx": page_idx,
103
  "document_name": self.document_name,
104
  "file_path": self.document_file_path,
105
  "file_url": self.url,
106
  }
107
  )
 
108
  rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
109
  processed_pages_counter += 1
110
 
111
  tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
112
  for task in asyncio.as_completed(tasks):
113
  await task
114
- if weave_dataset_name:
115
- weave.publish(weave.Dataset(name=weave_dataset_name, rows=pages))
 
 
 
116
  return pages
 
1
  import asyncio
2
+ import os
3
+ from typing import Optional
4
 
5
  import rich
6
  import weave
7
  from pdf2image.pdf2image import convert_from_path
8
  from PIL import Image
9
 
10
+ import wandb
11
  from medrag_multi_modal.document_loader.load_text import TextLoader
12
 
13
 
14
  class ImageLoader(TextLoader):
15
  """
16
+ ImageLoader is a class that extends the `TextLoader` class to handle the extraction and
17
  loading of images from a PDF file.
18
 
19
  This class provides functionality to convert specific pages of a PDF document into images
 
23
  ```python
24
  import asyncio
25
 
26
+ import wandb
27
  from dotenv import load_dotenv
28
 
29
  from medrag_multi_modal.document_loader import ImageLoader
30
 
31
  load_dotenv()
32
+ wandb.init(project="medrag-multi-modal", entity="ml-colabs")
33
  url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
34
  loader = ImageLoader(
35
  url=url,
 
40
  loader.load_data(
41
  start_page=31,
42
  end_page=33,
43
+ dataset_name="grays-anatomy-images",
44
  )
45
  )
46
  ```
 
62
  )[0]
63
  return image
64
 
65
+ async def load_data(
66
+ self,
67
+ start_page: int,
68
+ end_page: int,
69
+ image_save_dir: str = "./images",
70
+ dataset_name: Optional[str] = None,
71
+ ):
72
  """
73
  Asynchronously loads images from a PDF file specified by a URL or local file path,
74
  processes the images for the specified range of pages, and optionally publishes them
 
77
  This function reads the specified range of pages from a PDF document, converts each page
78
  to an image using the `pdf2image` library, and returns a list of dictionaries containing
79
  the image and metadata for each processed page. It processes pages concurrently using
80
+ `asyncio` for efficiency. If a `dataset_name` is provided, the processed page images are
81
+ published to a Weights & Biases artifact and the corresponding metadata to a Weave dataset
82
+ with the specified name.
83
 
84
  Args:
85
  start_page (int): The starting page index (0-based) to process.
86
  end_page (int): The ending page index (0-based) to process.
87
+ dataset_name (Optional[str]): The name shared by the Weights & Biases artifact (page images)
88
+ and the Weave dataset (page metadata) that are published. Defaults to None.
89
 
90
  Returns:
91
  list[dict]: A list of dictionaries, each containing the image and metadata for a
 
95
  ValueError: If the specified start_page or end_page is out of bounds of the document's
96
  page count.
97
  """
98
+ os.makedirs(image_save_dir, exist_ok=True)
99
  start_page, end_page = self.get_page_indices(start_page, end_page)
100
  pages = []
101
  processed_pages_counter: int = 1
 
103
 
104
  async def process_page(page_idx):
105
  nonlocal processed_pages_counter
106
+ image = convert_from_path(
107
+ self.document_file_path,
108
+ first_page=page_idx + 1,
109
+ last_page=page_idx + 1,
110
+ )[0]
111
  pages.append(
112
  {
 
 
 
 
 
113
  "page_idx": page_idx,
114
  "document_name": self.document_name,
115
  "file_path": self.document_file_path,
116
  "file_url": self.url,
117
  }
118
  )
119
+ image.save(os.path.join(image_save_dir, f"{page_idx}.png"))
120
  rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
121
  processed_pages_counter += 1
122
 
123
  tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
124
  for task in asyncio.as_completed(tasks):
125
  await task
126
+ if dataset_name:
127
+ artifact = wandb.Artifact(name=dataset_name, type="dataset")
128
+ artifact.add_dir(local_path=image_save_dir)
129
+ artifact.save()
130
+ weave.publish(weave.Dataset(name=dataset_name, rows=pages))
131
  return pages