Spaces:
Sleeping
Sleeping
geekyrakshit
committed on
Commit
·
bd0ff68
1
Parent(s):
d529654
update: ImageLoader
Browse files
- .gitignore +2 -1
- medrag_multi_modal/document_loader/load_image.py +31 -16
.gitignore
CHANGED
@@ -6,4 +6,5 @@ cursor_prompt.txt
|
|
6 |
.ruff_cache/
|
7 |
test.py
|
8 |
**.pdf
|
9 |
-
images/
|
|
|
|
6 |
.ruff_cache/
|
7 |
test.py
|
8 |
**.pdf
|
9 |
+
images/
|
10 |
+
wandb/
|
medrag_multi_modal/document_loader/load_image.py
CHANGED
@@ -1,16 +1,19 @@
|
|
1 |
import asyncio
|
|
|
|
|
2 |
|
3 |
import rich
|
4 |
import weave
|
5 |
from pdf2image.pdf2image import convert_from_path
|
6 |
from PIL import Image
|
7 |
|
|
|
8 |
from medrag_multi_modal.document_loader.load_text import TextLoader
|
9 |
|
10 |
|
11 |
class ImageLoader(TextLoader):
|
12 |
"""
|
13 |
-
ImageLoader is a class that extends the TextLoader class to handle the extraction and
|
14 |
loading of images from a PDF file.
|
15 |
|
16 |
This class provides functionality to convert specific pages of a PDF document into images
|
@@ -20,13 +23,13 @@ class ImageLoader(TextLoader):
|
|
20 |
```python
|
21 |
import asyncio
|
22 |
|
23 |
-
import
|
24 |
from dotenv import load_dotenv
|
25 |
|
26 |
from medrag_multi_modal.document_loader import ImageLoader
|
27 |
|
28 |
load_dotenv()
|
29 |
-
|
30 |
url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
|
31 |
loader = ImageLoader(
|
32 |
url=url,
|
@@ -37,7 +40,7 @@ class ImageLoader(TextLoader):
|
|
37 |
loader.load_data(
|
38 |
start_page=31,
|
39 |
end_page=33,
|
40 |
-
|
41 |
)
|
42 |
)
|
43 |
```
|
@@ -59,7 +62,13 @@ class ImageLoader(TextLoader):
|
|
59 |
)[0]
|
60 |
return image
|
61 |
|
62 |
-
async def load_data(
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
"""
|
64 |
Asynchronously loads images from a PDF file specified by a URL or local file path,
|
65 |
processes the images for the specified range of pages, and optionally publishes them
|
@@ -68,14 +77,15 @@ class ImageLoader(TextLoader):
|
|
68 |
This function reads the specified range of pages from a PDF document, converts each page
|
69 |
to an image using the `pdf2image` library, and returns a list of dictionaries containing
|
70 |
the image and metadata for each processed page. It processes pages concurrently using
|
71 |
-
`asyncio` for efficiency. If a
|
72 |
-
published to a Weave dataset
|
|
|
73 |
|
74 |
Args:
|
75 |
start_page (int): The starting page index (0-based) to process.
|
76 |
end_page (int): The ending page index (0-based) to process.
|
77 |
-
|
78 |
-
|
79 |
|
80 |
Returns:
|
81 |
list[dict]: A list of dictionaries, each containing the image and metadata for a
|
@@ -85,6 +95,7 @@ class ImageLoader(TextLoader):
|
|
85 |
ValueError: If the specified start_page or end_page is out of bounds of the document's
|
86 |
page count.
|
87 |
"""
|
|
|
88 |
start_page, end_page = self.get_page_indices(start_page, end_page)
|
89 |
pages = []
|
90 |
processed_pages_counter: int = 1
|
@@ -92,25 +103,29 @@ class ImageLoader(TextLoader):
|
|
92 |
|
93 |
async def process_page(page_idx):
|
94 |
nonlocal processed_pages_counter
|
|
|
|
|
|
|
|
|
|
|
95 |
pages.append(
|
96 |
{
|
97 |
-
"image": convert_from_path(
|
98 |
-
self.document_file_path,
|
99 |
-
first_page=page_idx + 1,
|
100 |
-
last_page=page_idx + 1,
|
101 |
-
)[0],
|
102 |
"page_idx": page_idx,
|
103 |
"document_name": self.document_name,
|
104 |
"file_path": self.document_file_path,
|
105 |
"file_url": self.url,
|
106 |
}
|
107 |
)
|
|
|
108 |
rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
|
109 |
processed_pages_counter += 1
|
110 |
|
111 |
tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
|
112 |
for task in asyncio.as_completed(tasks):
|
113 |
await task
|
114 |
-
if
|
115 |
-
|
|
|
|
|
|
|
116 |
return pages
|
|
|
1 |
import asyncio
|
2 |
+
import os
|
3 |
+
from typing import Optional
|
4 |
|
5 |
import rich
|
6 |
import weave
|
7 |
from pdf2image.pdf2image import convert_from_path
|
8 |
from PIL import Image
|
9 |
|
10 |
+
import wandb
|
11 |
from medrag_multi_modal.document_loader.load_text import TextLoader
|
12 |
|
13 |
|
14 |
class ImageLoader(TextLoader):
|
15 |
"""
|
16 |
+
ImageLoader is a class that extends the `TextLoader` class to handle the extraction and
|
17 |
loading of images from a PDF file.
|
18 |
|
19 |
This class provides functionality to convert specific pages of a PDF document into images
|
|
|
23 |
```python
|
24 |
import asyncio
|
25 |
|
26 |
+
import wandb
|
27 |
from dotenv import load_dotenv
|
28 |
|
29 |
from medrag_multi_modal.document_loader import ImageLoader
|
30 |
|
31 |
load_dotenv()
|
32 |
+
wandb.init(project="medrag-multi-modal", entity="ml-colabs")
|
33 |
url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
|
34 |
loader = ImageLoader(
|
35 |
url=url,
|
|
|
40 |
loader.load_data(
|
41 |
start_page=31,
|
42 |
end_page=33,
|
43 |
+
dataset_name="grays-anatomy-images",
|
44 |
)
|
45 |
)
|
46 |
```
|
|
|
62 |
)[0]
|
63 |
return image
|
64 |
|
65 |
+
async def load_data(
|
66 |
+
self,
|
67 |
+
start_page: int,
|
68 |
+
end_page: int,
|
69 |
+
image_save_dir: str = "./images",
|
70 |
+
dataset_name: Optional[str] = None,
|
71 |
+
):
|
72 |
"""
|
73 |
Asynchronously loads images from a PDF file specified by a URL or local file path,
|
74 |
processes the images for the specified range of pages, and optionally publishes them
|
|
|
77 |
This function reads the specified range of pages from a PDF document, converts each page
|
78 |
to an image using the `pdf2image` library, and returns a list of dictionaries containing
|
79 |
the image and metadata for each processed page. It processes pages concurrently using
|
80 |
+
`asyncio` for efficiency. If a `dataset_name` is provided, the processed page images are
|
81 |
+
published to Weights & Biases artifact and the corresponding metadata to a Weave dataset
|
82 |
+
with the specified name.
|
83 |
|
84 |
Args:
|
85 |
start_page (int): The starting page index (0-based) to process.
|
86 |
end_page (int): The ending page index (0-based) to process.
|
87 |
+
dataset_name (Optional[str]): The name of the Weave dataset to publish the
|
88 |
+
processed images to. Defaults to None.
|
89 |
|
90 |
Returns:
|
91 |
list[dict]: A list of dictionaries, each containing the image and metadata for a
|
|
|
95 |
ValueError: If the specified start_page or end_page is out of bounds of the document's
|
96 |
page count.
|
97 |
"""
|
98 |
+
os.makedirs(image_save_dir, exist_ok=True)
|
99 |
start_page, end_page = self.get_page_indices(start_page, end_page)
|
100 |
pages = []
|
101 |
processed_pages_counter: int = 1
|
|
|
103 |
|
104 |
async def process_page(page_idx):
|
105 |
nonlocal processed_pages_counter
|
106 |
+
image = convert_from_path(
|
107 |
+
self.document_file_path,
|
108 |
+
first_page=page_idx + 1,
|
109 |
+
last_page=page_idx + 1,
|
110 |
+
)[0]
|
111 |
pages.append(
|
112 |
{
|
|
|
|
|
|
|
|
|
|
|
113 |
"page_idx": page_idx,
|
114 |
"document_name": self.document_name,
|
115 |
"file_path": self.document_file_path,
|
116 |
"file_url": self.url,
|
117 |
}
|
118 |
)
|
119 |
+
image.save(os.path.join(image_save_dir, f"{page_idx}.png"))
|
120 |
rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
|
121 |
processed_pages_counter += 1
|
122 |
|
123 |
tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
|
124 |
for task in asyncio.as_completed(tasks):
|
125 |
await task
|
126 |
+
if dataset_name:
|
127 |
+
artifact = wandb.Artifact(name=dataset_name, type="dataset")
|
128 |
+
artifact.add_dir(local_path=image_save_dir)
|
129 |
+
artifact.save()
|
130 |
+
weave.publish(weave.Dataset(name=dataset_name, rows=pages))
|
131 |
return pages
|