CSAle committed on
Commit
6d1c0da
·
1 Parent(s): ba43ba6

Adding PDF

Browse files
Files changed (3) hide show
  1. aimakerspace/text_utils.py +48 -0
  2. pyproject.toml +2 -1
  3. uv.lock +11 -0
aimakerspace/text_utils.py CHANGED
@@ -1,5 +1,6 @@
1
  import os
2
  from typing import List
 
3
 
4
 
5
  class TextFileLoader:
@@ -62,6 +63,53 @@ class CharacterTextSplitter:
62
  return chunks
63
 
64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  if __name__ == "__main__":
66
  loader = TextFileLoader("data/KingLear.txt")
67
  loader.load()
 
1
  import os
2
  from typing import List
3
+ import PyPDF2
4
 
5
 
6
  class TextFileLoader:
 
63
  return chunks
64
 
65
 
66
class PDFLoader:
    """Load the text of a single PDF file, or of every PDF under a directory.

    ``path`` may name one ``.pdf`` file or a directory; a directory is walked
    recursively. Each PDF contributes one string to ``documents``: the text of
    its pages in order, with a newline appended after every page.

    NOTE(review): this class uses the module-level ``PyPDF2`` import, while
    the project's dependency list declares ``pypdf`` -- confirm the imported
    package is the one that actually gets installed.
    """

    def __init__(self, path: str):
        # One entry per PDF loaded so far; repeated load() calls keep appending.
        self.documents: List[str] = []
        self.path = path

    def load(self) -> None:
        """Populate ``documents`` from ``path``.

        Raises:
            ValueError: if ``path`` is neither an existing directory nor an
                existing file whose name ends in ``.pdf`` (case-insensitive).
        """
        if os.path.isdir(self.path):
            self.load_directory()
        elif os.path.isfile(self.path) and self.path.lower().endswith(".pdf"):
            self.load_file()
        else:
            raise ValueError(
                "Provided path is neither a valid directory nor a PDF file."
            )

    @staticmethod
    def _extract_text(pdf_path: str) -> str:
        """Return the text of every page of ``pdf_path``, newline-terminated."""
        with open(pdf_path, "rb") as stream:
            reader = PyPDF2.PdfReader(stream)
            # Join once instead of growing a string page by page.
            return "".join(page.extract_text() + "\n" for page in reader.pages)

    def load_file(self) -> None:
        """Append the text of the PDF at ``path`` to ``documents``."""
        self.documents.append(self._extract_text(self.path))

    def load_directory(self) -> None:
        """Append the text of every ``.pdf`` found under ``path`` (recursive)."""
        for root, _, files in os.walk(self.path):
            for name in files:
                if name.lower().endswith(".pdf"):
                    self.documents.append(
                        self._extract_text(os.path.join(root, name))
                    )

    def load_documents(self) -> List[str]:
        """Run ``load()`` and return the accumulated ``documents`` list."""
        self.load()
        return self.documents
111
+
112
+
113
  if __name__ == "__main__":
114
  loader = TextFileLoader("data/KingLear.txt")
115
  loader.load()
pyproject.toml CHANGED
@@ -9,5 +9,6 @@ dependencies = [
9
  "numpy>=2.2.2",
10
  "openai>=1.59.9",
11
  "pydantic==2.10.1",
 
12
  "websockets>=14.2",
13
- ]
 
9
  "numpy>=2.2.2",
10
  "openai>=1.59.9",
11
  "pydantic==2.10.1",
12
+ "pypdf>=5.1.0",
13
  "websockets>=14.2",
14
+ ]
uv.lock CHANGED
@@ -10,6 +10,7 @@ dependencies = [
10
  { name = "numpy" },
11
  { name = "openai" },
12
  { name = "pydantic" },
 
13
  { name = "websockets" },
14
  ]
15
 
@@ -19,6 +20,7 @@ requires-dist = [
19
  { name = "numpy", specifier = ">=2.2.2" },
20
  { name = "openai", specifier = ">=1.59.9" },
21
  { name = "pydantic", specifier = "==2.10.1" },
 
22
  { name = "websockets", specifier = ">=14.2" },
23
  ]
24
 
@@ -632,6 +634,15 @@ wheels = [
632
  { url = "https://files.pythonhosted.org/packages/61/ad/689f02752eeec26aed679477e80e632ef1b682313be70793d798c1d5fc8f/PyJWT-2.10.1-py3-none-any.whl", hash = "sha256:dcdd193e30abefd5debf142f9adfcdd2b58004e644f25406ffaebd50bd98dacb", size = 22997 },
633
  ]
634
 
 
 
 
 
 
 
 
 
 
635
  [[package]]
636
  name = "python-dotenv"
637
  version = "1.0.1"
 
10
  { name = "numpy" },
11
  { name = "openai" },
12
  { name = "pydantic" },
13
+ { name = "pypdf" },
14
  { name = "websockets" },
15
  ]
16
 
 
20
  { name = "numpy", specifier = ">=2.2.2" },
21
  { name = "openai", specifier = ">=1.59.9" },
22
  { name = "pydantic", specifier = "==2.10.1" },
23
+ { name = "pypdf", specifier = ">=5.1.0" },
24
  { name = "websockets", specifier = ">=14.2" },
25
  ]
26
 
 
634
  { url = "https://files.pythonhosted.org/packages/61/ad/689f02752eeec26aed679477e80e632ef1b682313be70793d798c1d5fc8f/PyJWT-2.10.1-py3-none-any.whl", hash = "sha256:dcdd193e30abefd5debf142f9adfcdd2b58004e644f25406ffaebd50bd98dacb", size = 22997 },
635
  ]
636
 
637
+ [[package]]
638
+ name = "pypdf"
639
+ version = "5.1.0"
640
+ source = { registry = "https://pypi.org/simple" }
641
+ sdist = { url = "https://files.pythonhosted.org/packages/6b/9a/72d74f05f64895ebf1c7f6646cf7fe6dd124398c5c49240093f92d6f0fdd/pypdf-5.1.0.tar.gz", hash = "sha256:425a129abb1614183fd1aca6982f650b47f8026867c0ce7c4b9f281c443d2740", size = 5011381 }
642
+ wheels = [
643
+ { url = "https://files.pythonhosted.org/packages/04/fc/6f52588ac1cb4400a7804ef88d0d4e00cfe57a7ac6793ec3b00de5a8758b/pypdf-5.1.0-py3-none-any.whl", hash = "sha256:3bd4f503f4ebc58bae40d81e81a9176c400cbbac2ba2d877367595fb524dfdfc", size = 297976 },
644
+ ]
645
+
646
  [[package]]
647
  name = "python-dotenv"
648
  version = "1.0.1"