geekyrakshit committed on
Commit
c675904
·
1 Parent(s): 20a903b

refactor: TextLoader

Browse files
medrag_multi_modal/document_loader/__init__.py CHANGED
@@ -1,3 +1,3 @@
1
- from .load_text import load_text_from_pdf
2
 
3
- __all__ = ["load_text_from_pdf"]
 
1
+ from .load_text import TextLoader
2
 
3
+ __all__ = ["TextLoader"]
medrag_multi_modal/document_loader/load_text.py CHANGED
@@ -18,43 +18,31 @@ class Page(BaseModel):
18
  file_url: str
19
 
20
 
21
- async def load_text_from_pdf(
22
- url: str,
23
- document_name: str,
24
- document_file_path: str,
25
- start_page: Optional[int] = None,
26
- end_page: Optional[int] = None,
27
- weave_dataset_name: Optional[str] = None,
28
- ) -> list[Page]:
29
  """
30
- Asynchronously loads text from a PDF file specified by a URL or local file path,
31
- processes the text into markdown format, and optionally publishes it to a Weave dataset.
32
 
33
- This function downloads a PDF from a given URL if it does not already exist locally,
34
- reads the specified range of pages, converts each page's content to markdown, and
35
- returns a list of Page objects containing the text and metadata. It uses PyPDF2 to read
36
- the PDF and pymupdf4llm to convert pages to markdown. It processes pages concurrently using
37
- `asyncio` for efficiency. If a weave_dataset_name is provided, the processed pages are published
38
- to a Weave dataset.
39
-
40
- !!! example "Example usage"
41
  ```python
42
  import asyncio
43
 
44
  import weave
45
 
46
- from medrag_multi_modal.document_loader import load_text_from_pdf
47
 
48
  weave.init(project_name="ml-colabs/medrag-multi-modal")
49
  url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
 
 
 
 
 
50
  asyncio.run(
51
- load_text_from_pdf(
52
- url=url,
53
- document_name="Gray's Anatomy",
54
- start_page=9,
55
- end_page=15,
56
- document_file_path="grays_anatomy.pdf",
57
- )
58
  )
59
  ```
60
 
@@ -62,61 +50,91 @@ async def load_text_from_pdf(
62
  url (str): The URL of the PDF file to download if not present locally.
63
  document_name (str): The name of the document for metadata purposes.
64
  document_file_path (str): The local file path where the PDF is stored or will be downloaded.
65
- start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
66
- end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
67
- weave_dataset_name (Optional[str]): The name of the Weave dataset to publish the pages to, if provided.
68
-
69
- Returns:
70
- list[Page]: A list of Page objects, each containing the text and metadata for a processed page.
71
-
72
- Raises:
73
- ValueError: If the specified start_page or end_page is out of bounds of the document's page count.
74
  """
75
- if not os.path.exists(document_file_path):
76
- FireRequests().download(url, filename=document_file_path)
77
- with open(document_file_path, "rb") as file:
78
- pdf_reader = PyPDF2.PdfReader(file)
79
- page_count = len(pdf_reader.pages)
80
- print(f"Page count: {page_count}")
81
- if start_page:
82
- if start_page > page_count:
83
- raise ValueError(
84
- f"Start page {start_page} is greater than the total page count {page_count}"
85
- )
86
- else:
87
- start_page = 0
88
- if end_page:
89
- if end_page > page_count:
90
- raise ValueError(
91
- f"End page {end_page} is greater than the total page count {page_count}"
92
- )
93
- else:
94
- end_page = page_count - 1
95
-
96
- pages: list[Page] = []
97
- processed_pages_counter: int = 1
98
- total_pages = end_page - start_page
99
 
100
- async def process_page(page_idx):
101
- nonlocal processed_pages_counter
102
- text = pymupdf4llm.to_markdown(
103
- doc=document_file_path, pages=[page_idx], show_progress=False
104
- )
105
- pages.append(
106
- Page(
107
- text=text,
108
- page_idx=page_idx,
109
- document_name=document_name,
110
- file_path=document_file_path,
111
- file_url=url,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  )
113
- )
114
- rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
115
- processed_pages_counter += 1
116
-
117
- tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
118
- for task in asyncio.as_completed(tasks):
119
- await task
120
- if weave_dataset_name:
121
- weave.publish(weave.Dataset(name=weave_dataset_name, rows=pages))
122
- return pages
 
 
 
 
 
 
 
 
 
18
  file_url: str
19
 
20
 
21
+ class TextLoader:
 
 
 
 
 
 
 
22
  """
23
+ A class for loading text from a PDF file, processing it into markdown, and optionally publishing it to a Weave dataset.
 
24
 
25
+ This class handles the downloading of a PDF file from a given URL if it does not already exist locally.
26
+ It uses PyPDF2 to read the PDF and pymupdf4llm to convert pages to markdown. The processed pages are stored in a list
27
+ of Page objects, which can be optionally published to a Weave dataset.
28
+
29
+ !!! example "Example Usage"
 
 
 
30
  ```python
31
  import asyncio
32
 
33
  import weave
34
 
35
+ from medrag_multi_modal.document_loader import TextLoader
36
 
37
  weave.init(project_name="ml-colabs/medrag-multi-modal")
38
  url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
39
+ loader = TextLoader(
40
+ url=url,
41
+ document_name="Gray's Anatomy",
42
+ document_file_path="grays_anatomy.pdf",
43
+ )
44
  asyncio.run(
45
+ loader.load_data(start_page=9, end_page=15, weave_dataset_name="grays-anatomy-text")
 
 
 
 
 
 
46
  )
47
  ```
48
 
 
50
  url (str): The URL of the PDF file to download if not present locally.
51
  document_name (str): The name of the document for metadata purposes.
52
  document_file_path (str): The local file path where the PDF is stored or will be downloaded.
 
 
 
 
 
 
 
 
 
53
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
+ def __init__(self, url: str, document_name: str, document_file_path: str):
56
+ self.url = url
57
+ self.document_name = document_name
58
+ self.document_file_path = document_file_path
59
+ if not os.path.exists(self.document_file_path):
60
+ FireRequests().download(url, filename=self.document_file_path)
61
+ with open(self.document_file_path, "rb") as file:
62
+ pdf_reader = PyPDF2.PdfReader(file)
63
+ self.page_count = len(pdf_reader.pages)
64
+
65
+ def get_page_indices(
66
+ self, start_page: Optional[int] = None, end_page: Optional[int] = None
67
+ ):
68
+ if start_page:
69
+ if start_page > self.page_count:
70
+ raise ValueError(
71
+ f"Start page {start_page} is greater than the total page count {self.page_count}"
72
+ )
73
+ else:
74
+ start_page = 0
75
+ if end_page:
76
+ if end_page > self.page_count:
77
+ raise ValueError(
78
+ f"End page {end_page} is greater than the total page count {self.page_count}"
79
+ )
80
+ else:
81
+ end_page = self.page_count - 1
82
+ return start_page, end_page
83
+
84
+ @weave.op()
85
+ async def load_data(
86
+ self,
87
+ start_page: Optional[int] = None,
88
+ end_page: Optional[int] = None,
89
+ weave_dataset_name: Optional[str] = None,
90
+ ):
91
+ """
92
+ Asynchronously loads text from a PDF file specified by a URL or local file path,
93
+ processes the text into markdown format, and optionally publishes it to a Weave dataset.
94
+
95
+ This function downloads a PDF from a given URL if it does not already exist locally,
96
+ reads the specified range of pages, converts each page's content to markdown, and
97
+ returns a list of Page objects containing the text and metadata. It uses PyPDF2 to read
98
+ the PDF and pymupdf4llm to convert pages to markdown. It processes pages concurrently using
99
+ `asyncio` for efficiency. If a weave_dataset_name is provided, the processed pages are published
100
+ to a Weave dataset.
101
+
102
+ Args:
103
+ start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
104
+ end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
105
+ weave_dataset_name (Optional[str]): The name of the Weave dataset to publish the pages to, if provided.
106
+
107
+ Returns:
108
+ list[Page]: A list of Page objects, each containing the text and metadata for a processed page.
109
+
110
+ Raises:
111
+ ValueError: If the specified start_page or end_page is out of bounds of the document's page count.
112
+ """
113
+ start_page, end_page = self.get_page_indices(start_page, end_page)
114
+ pages: list[Page] = []
115
+ processed_pages_counter: int = 1
116
+ total_pages = end_page - start_page
117
+
118
+ async def process_page(page_idx):
119
+ nonlocal processed_pages_counter
120
+ text = pymupdf4llm.to_markdown(
121
+ doc=self.document_file_path, pages=[page_idx], show_progress=False
122
  )
123
+ pages.append(
124
+ Page(
125
+ text=text,
126
+ page_idx=page_idx,
127
+ document_name=self.document_name,
128
+ file_path=self.document_file_path,
129
+ file_url=self.url,
130
+ ).model_dump()
131
+ )
132
+ rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
133
+ processed_pages_counter += 1
134
+
135
+ tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
136
+ for task in asyncio.as_completed(tasks):
137
+ await task
138
+ if weave_dataset_name:
139
+ weave.publish(weave.Dataset(name=weave_dataset_name, rows=pages))
140
+ return pages
mkdocs.yml CHANGED
@@ -14,8 +14,6 @@ theme:
14
  toggle:
15
  icon: material/brightness-4
16
  name: Switch to light mode
17
- logo: assets/logomark.svg
18
- favicon: assets/logomark.svg
19
  features:
20
  - content.code.annotate
21
  - content.code.copy
 
14
  toggle:
15
  icon: material/brightness-4
16
  name: Switch to light mode
 
 
17
  features:
18
  - content.code.annotate
19
  - content.code.copy