pattonma committed on
Commit
1318cbe
1 Parent(s): ca48395

Added PDF reader and changed text splitter

Browse files
Files changed (2) hide show
  1. aimakerspace/text_utils.py +87 -1
  2. requirements.txt +2 -1
aimakerspace/text_utils.py CHANGED
@@ -1,6 +1,7 @@
1
  import os
2
  from typing import List
3
-
 
4
 
5
  class TextFileLoader:
6
  def __init__(self, path: str, encoding: str = "utf-8"):
@@ -60,6 +61,91 @@ class CharacterTextSplitter:
60
  for text in texts:
61
  chunks.extend(self.split(text))
62
  return chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
 
65
  if __name__ == "__main__":
 
1
  import os
2
  from typing import List
3
+ from PyPDF2 import PdfReader
4
+ import re
5
 
6
  class TextFileLoader:
7
  def __init__(self, path: str, encoding: str = "utf-8"):
 
61
  for text in texts:
62
  chunks.extend(self.split(text))
63
  return chunks
64
+
65
class PDFLoader:
    """Load the text of a single PDF file or of every PDF under a directory.

    Extracted text is accumulated in ``self.documents``: one string per PDF
    file, made of the text of all of its pages concatenated in page order.
    """

    def __init__(self, path: str):
        """Store *path*; nothing is read until one of the load methods runs."""
        self.documents = []
        self.path = path

    def load(self):
        """Read ``self.path`` and append the extracted text to ``self.documents``.

        A directory is walked recursively; a regular file must have a name
        ending in ``.pdf``. Results are appended, so calling this more than
        once on the same instance adds the same documents again.

        Raises:
            ValueError: if ``self.path`` is neither a directory nor an
                existing file whose name ends with ``.pdf``.
        """
        if os.path.isdir(self.path):
            self.load_directory()
        elif os.path.isfile(self.path) and self.path.endswith(".pdf"):
            self.load_file()
        else:
            raise ValueError("Provided path is neither a valid directory nor a .pdf file.")

    @staticmethod
    def _read_pdf(file_path: str) -> str:
        """Return the concatenated text of every page of the PDF at *file_path*."""
        with open(file_path, "rb") as file:
            pdf_reader = PdfReader(file)
            # Join while the file is still open: pages are read lazily from it.
            # ``or ""`` keeps the join safe if a page yields no text at all.
            return "".join(page.extract_text() or "" for page in pdf_reader.pages)

    def load_file(self):
        """Append the text of the single PDF at ``self.path`` to ``self.documents``."""
        self.documents.append(self._read_pdf(self.path))

    def load_directory(self):
        """Append the text of every ``.pdf`` file found under ``self.path``.

        Sub-directories are included (``os.walk``); files whose names do not
        end with ``.pdf`` are ignored.
        """
        for root, _, files in os.walk(self.path):
            for file in files:
                if file.endswith(".pdf"):
                    self.documents.append(self._read_pdf(os.path.join(root, file)))

    def load_documents(self) -> List[str]:
        """Run ``load`` and return the accumulated list of document texts."""
        self.load()
        return self.documents
101
+
102
class SentenceTextSplitter:
    """Split text into chunks made of whole sentences, with sentence overlap.

    Sentences are detected as runs of text ending in ``.``, ``!`` or ``?``
    followed by whitespace. Consecutive sentences are packed into a chunk
    until adding the next one would push the summed sentence lengths past
    ``chunk_size``; the sentences of a chunk are joined with ``separator``.
    Sizes count sentence characters only, not the separators between them.
    """

    # Zero-width split point: whitespace that follows sentence-ending punctuation.
    _SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        separator: str = "\n"
    ):
        """Configure the splitter.

        Args:
            chunk_size: maximum summed length of the sentences in one chunk.
                A single sentence longer than this still becomes its own chunk.
            chunk_overlap: maximum summed length of trailing sentences of a
                chunk that are repeated at the start of the next chunk.
            separator: string placed between the sentences of a chunk.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.separator = separator

    def _overlap_tail(self, sentences, limit):
        """Return ``(tail, size)``: the longest run of trailing *sentences*
        whose summed length does not exceed *limit*, and that summed length."""
        tail = []
        size = 0
        for sentence in reversed(sentences):
            if size + len(sentence) > limit:
                break
            tail.append(sentence)
            size += len(sentence)
        tail.reverse()
        return tail, size

    def split(self, text: str) -> List[str]:
        """Split *text* into sentence-aligned chunks.

        Returns an empty list when *text* contains no non-whitespace
        sentence (for example an empty string).
        """
        # Drop empty fragments (e.g. produced by whitespace after the final
        # sentence) so they never become empty chunks or stray separators.
        sentences = [
            sentence
            for sentence in self._SENTENCE_BOUNDARY.split(text)
            if sentence.strip()
        ]

        chunks = []
        current_chunk = []
        current_size = 0

        for sentence in sentences:
            sentence_size = len(sentence)

            if current_chunk and current_size + sentence_size > self.chunk_size:
                # The next sentence does not fit: emit the chunk built so far.
                chunks.append(self.separator.join(current_chunk))

                # Seed the next chunk with the *trailing* sentences of the one
                # just emitted. The budget is capped so that overlap plus the
                # incoming sentence still fits within chunk_size.
                budget = min(self.chunk_overlap, self.chunk_size - sentence_size)
                current_chunk, current_size = self._overlap_tail(current_chunk, budget)

            current_chunk.append(sentence)
            current_size += sentence_size

        # Emit whatever is left after the last sentence.
        if current_chunk:
            chunks.append(self.separator.join(current_chunk))

        return chunks

    def split_texts(self, texts: List[str]) -> List[str]:
        """Split every text in *texts* and return all chunks in one flat list."""
        chunks = []
        for text in texts:
            chunks.extend(self.split(text))
        return chunks
149
 
150
 
151
  if __name__ == "__main__":
requirements.txt CHANGED
@@ -1,3 +1,4 @@
1
  numpy
2
  chainlit==0.7.700
3
- openai
 
 
1
  numpy
2
  chainlit==0.7.700
3
+ openai
4
+ PyPDF2