Spaces:

pattonma
/

AIE4_W2D1_RAG_Chatbot

Paused

App Files Files Community

pattonma committed on Aug 25

Commit

1318cbe

•

1 Parent(s): ca48395

Added PDF reader and changed text splitter

Browse files

Files changed (2) hide show

aimakerspace/text_utils.py +87 -1
requirements.txt +2 -1

aimakerspace/text_utils.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import os
 from typing import List
 class TextFileLoader:
     def __init__(self, path: str, encoding: str = "utf-8"):
@@ -60,6 +61,91 @@ class CharacterTextSplitter:
         for text in texts:
             chunks.extend(self.split(text))
         return chunks
 if __name__ == "__main__":

 import os
 from typing import List
+from PyPDF2 import PdfReader
+import re
 class TextFileLoader:
     def __init__(self, path: str, encoding: str = "utf-8"):
         for text in texts:
             chunks.extend(self.split(text))
         return chunks
class PDFLoader:
    """Load the text of a single PDF file or of every PDF under a directory.

    The extracted text of each PDF is stored as one string in
    ``self.documents``; ``load_documents()`` is the usual entry point.
    """

    def __init__(self, path: str):
        """Remember *path* (a ``.pdf`` file or a directory); nothing is read yet."""
        self.documents = []
        self.path = path

    @staticmethod
    def _is_pdf(name: str) -> bool:
        """Return True when *name* has a ``.pdf`` extension, ignoring case."""
        return name.lower().endswith(".pdf")

    @staticmethod
    def _pages_to_text(pages) -> str:
        """Concatenate the extracted text of every page in *pages*.

        ``or ""`` is defensive: a page that yields no text must not break
        the concatenation.
        """
        return "".join(page.extract_text() or "" for page in pages)

    def _read_pdf(self, file_path: str) -> str:
        """Open *file_path* and return the text of all of its pages."""
        with open(file_path, 'rb') as file:
            # Extraction has to finish while the file handle is still open.
            return self._pages_to_text(PdfReader(file).pages)

    def load(self):
        """Populate ``self.documents`` from ``self.path``.

        Raises:
            ValueError: if the path is neither a directory nor a ``.pdf`` file.
        """
        # Start from an empty list so that calling load() / load_documents()
        # more than once does not append the same documents again.
        self.documents = []
        if os.path.isdir(self.path):
            self.load_directory()
        elif os.path.isfile(self.path) and self._is_pdf(self.path):
            self.load_file()
        else:
            raise ValueError("Provided path is neither a valid directory nor a .pdf file.")

    def load_file(self):
        """Append the text of the PDF at ``self.path`` to ``self.documents``."""
        self.documents.append(self._read_pdf(self.path))

    def load_directory(self):
        """Append the text of every PDF found under ``self.path`` (recursively)."""
        for root, _, files in os.walk(self.path):
            for file in files:
                if self._is_pdf(file):
                    self.documents.append(self._read_pdf(os.path.join(root, file)))

    def load_documents(self) -> List[str]:
        """Load from ``self.path`` and return the list of document texts."""
        self.load()
        return self.documents
class SentenceTextSplitter:
    """Split text into chunks made of whole sentences, with sentence overlap.

    Sentences are detected at whitespace that follows ``.``, ``!`` or ``?``.
    Consecutive sentences are joined with ``separator`` until adding the next
    one would make the joined chunk longer than ``chunk_size``. The next chunk
    then starts with the trailing sentences of the previous one, up to
    ``chunk_overlap`` characters. A single sentence longer than ``chunk_size``
    is kept whole in its own chunk.
    """

    # Compiled once; splits at whitespace preceded by sentence punctuation.
    _SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        separator: str = "\n"
    ):
        """Store the maximum chunk length, the overlap budget and the joiner."""
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.separator = separator

    def _overlap_tail(self, chunk: List[str], next_size: int):
        """Pick the trailing sentences of *chunk* to repeat in the next chunk.

        Walks backwards from the end of *chunk*, keeping sentences while their
        joined length stays within ``chunk_overlap`` and still leaves room for
        the upcoming sentence of length *next_size* inside ``chunk_size``.

        Returns:
            ``(sentences, joined_length)`` for the carried-over tail.
        """
        sep_len = len(self.separator)
        tail: List[str] = []
        tail_size = 0
        for sentence in reversed(chunk):
            grown = tail_size + len(sentence) + (sep_len if tail else 0)
            if grown > self.chunk_overlap:
                break
            # Never carry so much that the new chunk overflows immediately.
            if grown + sep_len + next_size > self.chunk_size:
                break
            tail.append(sentence)
            tail_size = grown
        tail.reverse()
        return tail, tail_size

    def split(self, text: str) -> List[str]:
        """Return the sentence-aligned chunks of *text* (``[]`` for blank text)."""
        # Drop empty pieces so blank input does not produce a chunk of "".
        sentences = [s for s in self._SENTENCE_BOUNDARY.split(text.strip()) if s]
        sep_len = len(self.separator)
        chunks: List[str] = []
        current_chunk: List[str] = []
        current_size = 0  # always len(self.separator.join(current_chunk))
        for sentence in sentences:
            sentence_size = len(sentence)
            if current_chunk and current_size + sep_len + sentence_size > self.chunk_size:
                # Adding this sentence would overflow: emit the chunk and seed
                # the next one with the tail of the chunk just emitted.
                chunks.append(self.separator.join(current_chunk))
                current_chunk, current_size = self._overlap_tail(
                    current_chunk, sentence_size
                )
            if current_chunk:
                current_size += sep_len
            current_chunk.append(sentence)
            current_size += sentence_size
        # Add the last chunk if it's not empty
        if current_chunk:
            chunks.append(self.separator.join(current_chunk))
        return chunks

    def split_texts(self, texts: List[str]) -> List[str]:
        """Split every text in *texts* and return all chunks in one flat list."""
        chunks = []
        for text in texts:
            chunks.extend(self.split(text))
        return chunks
 if __name__ == "__main__":

requirements.txt CHANGED Viewed

@@ -1,3 +1,4 @@
 numpy
 chainlit==0.7.700
-openai

 numpy
 chainlit==0.7.700
+openai
+PyPDF2