geekyrakshit committed on
Commit
7b862ff
·
1 Parent(s): c675904

add: TextImageLoader

Browse files
.gitignore CHANGED
@@ -5,4 +5,5 @@ cursor_prompt.txt
5
  **pycache**
6
  .ruff_cache/
7
  test.py
8
- **.pdf
 
 
5
  **pycache**
6
  .ruff_cache/
7
  test.py
8
+ **.pdf
9
+ images/
docs/document_loader/load_text.md CHANGED
@@ -1,3 +1,3 @@
1
  ## Load text from PDF files
2
 
3
- ::: medrag_multi_modal.document_loader
 
1
  ## Load text from PDF files
2
 
3
+ ::: medrag_multi_modal.document_loader.load_text
docs/document_loader/load_text_image.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ## Load text and images from PDF files
2
+
3
+ ::: medrag_multi_modal.document_loader.load_text_image
medrag_multi_modal/document_loader/__init__.py CHANGED
@@ -1,3 +1,4 @@
1
  from .load_text import TextLoader
 
2
 
3
- __all__ = ["TextLoader"]
 
1
  from .load_text import TextLoader
2
+ from .load_text_image import TextImageLoader
3
 
4
+ __all__ = ["TextLoader", "TextImageLoader"]
medrag_multi_modal/document_loader/load_text.py CHANGED
@@ -7,15 +7,6 @@ import PyPDF2
7
  import rich
8
  import weave
9
  from firerequests import FireRequests
10
- from pydantic import BaseModel
11
-
12
-
13
- class Page(BaseModel):
14
- text: str
15
- page_idx: int
16
- document_name: str
17
- file_path: str
18
- file_url: str
19
 
20
 
21
  class TextLoader:
@@ -25,7 +16,7 @@ class TextLoader:
25
  This class handles the downloading of a PDF file from a given URL if it does not already exist locally.
26
  It uses PyPDF2 to read the PDF and pymupdf4llm to convert pages to markdown. The processed pages are stored in a list
27
  of Page objects, which can be optionally published to a Weave dataset.
28
-
29
  !!! example "Example Usage"
30
  ```python
31
  import asyncio
@@ -81,7 +72,6 @@ class TextLoader:
81
  end_page = self.page_count - 1
82
  return start_page, end_page
83
 
84
- @weave.op()
85
  async def load_data(
86
  self,
87
  start_page: Optional[int] = None,
@@ -111,7 +101,7 @@ class TextLoader:
111
  ValueError: If the specified start_page or end_page is out of bounds of the document's page count.
112
  """
113
  start_page, end_page = self.get_page_indices(start_page, end_page)
114
- pages: list[Page] = []
115
  processed_pages_counter: int = 1
116
  total_pages = end_page - start_page
117
 
@@ -121,13 +111,13 @@ class TextLoader:
121
  doc=self.document_file_path, pages=[page_idx], show_progress=False
122
  )
123
  pages.append(
124
- Page(
125
- text=text,
126
- page_idx=page_idx,
127
- document_name=self.document_name,
128
- file_path=self.document_file_path,
129
- file_url=self.url,
130
- ).model_dump()
131
  )
132
  rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
133
  processed_pages_counter += 1
 
7
  import rich
8
  import weave
9
  from firerequests import FireRequests
 
 
 
 
 
 
 
 
 
10
 
11
 
12
  class TextLoader:
 
16
  This class handles the downloading of a PDF file from a given URL if it does not already exist locally.
17
  It uses PyPDF2 to read the PDF and pymupdf4llm to convert pages to markdown. The processed pages are stored in a list
18
  of Page objects, which can be optionally published to a Weave dataset.
19
+
20
  !!! example "Example Usage"
21
  ```python
22
  import asyncio
 
72
  end_page = self.page_count - 1
73
  return start_page, end_page
74
 
 
75
  async def load_data(
76
  self,
77
  start_page: Optional[int] = None,
 
101
  ValueError: If the specified start_page or end_page is out of bounds of the document's page count.
102
  """
103
  start_page, end_page = self.get_page_indices(start_page, end_page)
104
+ pages = []
105
  processed_pages_counter: int = 1
106
  total_pages = end_page - start_page
107
 
 
111
  doc=self.document_file_path, pages=[page_idx], show_progress=False
112
  )
113
  pages.append(
114
+ {
115
+ "text": text,
116
+ "page_idx": page_idx,
117
+ "document_name": self.document_name,
118
+ "file_path": self.document_file_path,
119
+ "file_url": self.url,
120
+ }
121
  )
122
  rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
123
  processed_pages_counter += 1
medrag_multi_modal/document_loader/load_text_image.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import os
3
+ from glob import glob
4
+ from typing import Optional
5
+
6
+ import pymupdf4llm
7
+ import rich
8
+ import weave
9
+ from PIL import Image
10
+
11
+ from medrag_multi_modal.document_loader.load_text import TextLoader
12
+
13
+
14
class TextImageLoader(TextLoader):
    """
    A class for loading and processing text and images from a document.

    The TextImageLoader class extends the TextLoader class to provide
    functionality for extracting both text and images from a document
    specified by a URL, document name, and file path. Pages are processed
    through an asynchronous interface and returned as one dictionary per page.

    !!! example "Example Usage"
        ```python
        import asyncio

        import weave

        from medrag_multi_modal.document_loader import TextImageLoader

        weave.init(project_name="ml-colabs/medrag-multi-modal")
        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
        loader = TextImageLoader(
            url=url,
            document_name="Gray's Anatomy",
            document_file_path="grays_anatomy.pdf",
        )
        asyncio.run(
            loader.load_data(
                start_page=20,
                end_page=25,
                weave_dataset_name="grays-anatomy-text",
            )
        )
        ```

    Args:
        url (str): The URL of the document to be processed.
        document_name (str): The name of the document.
        document_file_path (str): The file path where the document is stored.
    """

    def __init__(self, url: str, document_name: str, document_file_path: str):
        super().__init__(url, document_name, document_file_path)

    async def load_data(
        self,
        start_page: Optional[int] = None,
        end_page: Optional[int] = None,
        weave_dataset_name: Optional[str] = None,
        image_path: Optional[str] = "./images",
        dpi: int = 300,
    ):
        """
        Asynchronously loads and processes text and images from a specified range of pages
        in a document. Text is extracted in markdown format and images are written as PNG
        files; each processed page is represented by a dictionary. Optionally, the
        processed data can be published to a Weave dataset.

        The page range is resolved with the `get_page_indices` method. An inner coroutine,
        `process_page`, calls `pymupdf4llm.to_markdown` for a single page (which also writes
        that page's images into `image_path`), then collects the image files written for
        the page and opens them with PIL. The per-page coroutines are run with
        `asyncio.gather`, so the returned list follows page order.

        Args:
            start_page (Optional[int]): The starting page index for processing. If None,
                the value chosen by `get_page_indices` is used.
            end_page (Optional[int]): The ending page index for processing (exclusive).
                If None, the value chosen by `get_page_indices` is used.
            weave_dataset_name (Optional[str]): The name of the Weave dataset to publish
                the processed data to. If None, the data is not published.
            image_path (Optional[str]): The directory path where extracted images are
                stored. Defaults to "./images"; None is treated as "./images".
            dpi (int): The resolution in dots per inch for image extraction. Defaults to 300.

        Returns:
            List[Dict]: A list of dictionaries ordered by page index, each containing the
                extracted text, page index, document name, file path, file URL, and a
                list of PIL images for that page.
        """
        # Local import: the module only imports `glob` itself from the glob package.
        from glob import escape as glob_escape

        start_page, end_page = self.get_page_indices(start_page, end_page)

        # The signature allows None, but os.path.join(None, ...) would raise TypeError.
        if image_path is None:
            image_path = "./images"
        os.makedirs(image_path, exist_ok=True)

        # NOTE(review): pymupdf4llm appears to name image files
        # "<pdf basename>-<page>-<index>.png"; using the basename keeps the lookup
        # working when `document_file_path` includes a directory. Confirm against the
        # installed pymupdf4llm version.
        # The literal part is escaped so names containing glob metacharacters
        # (e.g. "[PDF]") are matched verbatim.
        image_prefix = glob_escape(
            os.path.join(image_path, os.path.basename(self.document_file_path))
        )

        processed_pages_counter: int = 1
        total_pages = end_page - start_page

        async def process_page(page_idx: int) -> dict:
            nonlocal processed_pages_counter
            text = pymupdf4llm.to_markdown(
                doc=self.document_file_path,
                pages=[page_idx],
                show_progress=False,
                write_images=True,
                image_format="png",
                dpi=dpi,
                image_path=image_path,
            )
            # glob() order is filesystem-dependent; sorting by (length, name) gives a
            # stable order in which "...-2.png" precedes "...-10.png".
            image_paths = sorted(
                glob(f"{image_prefix}-{page_idx}-*.png"),
                key=lambda path: (len(path), path),
            )
            page = {
                "text": text,
                "page_idx": page_idx,
                "document_name": self.document_name,
                "file_path": self.document_file_path,
                "file_url": self.url,
                # Image.open is lazy: pixel data is read on first access.
                "images": [Image.open(image) for image in image_paths],
            }
            rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
            processed_pages_counter += 1
            return page

        # gather() returns results in the order the awaitables were passed, so the
        # output is ordered by page index regardless of scheduling.
        pages = list(
            await asyncio.gather(
                *(process_page(page_idx) for page_idx in range(start_page, end_page))
            )
        )

        if weave_dataset_name:
            weave.publish(weave.Dataset(name=weave_dataset_name, rows=pages))
        return pages
mkdocs.yml CHANGED
@@ -61,5 +61,6 @@ nav:
61
  - Home: 'index.md'
62
  - Document Loader:
63
  - Text Loader: 'document_loader/load_text.md'
 
64
 
65
  repo_url: https://github.com/soumik12345/medrag-multi-modal
 
61
  - Home: 'index.md'
62
  - Document Loader:
63
  - Text Loader: 'document_loader/load_text.md'
64
+ - Text and Image Loader: 'document_loader/load_text_image.md'
65
 
66
  repo_url: https://github.com/soumik12345/medrag-multi-modal