dgutierrez committed on
Commit
cc17218
·
1 Parent(s): e67d4b1
Files changed (2) hide show
  1. aimakerspace/text_utils.py +27 -10
  2. app.py +34 -13
aimakerspace/text_utils.py CHANGED
@@ -1,5 +1,6 @@
1
  import os
2
  from typing import List
 
3
 
4
 
5
  class TextFileLoader:
@@ -11,25 +12,40 @@ class TextFileLoader:
11
  def load(self):
12
  if os.path.isdir(self.path):
13
  self.load_directory()
14
- elif os.path.isfile(self.path) and self.path.endswith(".txt"):
15
- self.load_file()
 
 
 
 
 
16
  else:
17
- raise ValueError(
18
- "Provided path is neither a valid directory nor a .txt file."
19
- )
20
 
21
  def load_file(self):
22
  with open(self.path, "r", encoding=self.encoding) as f:
23
  self.documents.append(f.read())
24
 
 
 
 
 
 
 
 
25
  def load_directory(self):
26
  for root, _, files in os.walk(self.path):
27
  for file in files:
 
28
  if file.endswith(".txt"):
29
- with open(
30
- os.path.join(root, file), "r", encoding=self.encoding
31
- ) as f:
32
  self.documents.append(f.read())
 
 
 
 
 
 
33
 
34
  def load_documents(self):
35
  self.load()
@@ -52,7 +68,7 @@ class CharacterTextSplitter:
52
  def split(self, text: str) -> List[str]:
53
  chunks = []
54
  for i in range(0, len(text), self.chunk_size - self.chunk_overlap):
55
- chunks.append(text[i : i + self.chunk_size])
56
  return chunks
57
 
58
  def split_texts(self, texts: List[str]) -> List[str]:
@@ -63,7 +79,8 @@ class CharacterTextSplitter:
63
 
64
 
65
  if __name__ == "__main__":
66
- loader = TextFileLoader("data/KingLear.txt")
 
67
  loader.load()
68
  splitter = CharacterTextSplitter()
69
  chunks = splitter.split_texts(loader.documents)
 
1
  import os
2
  from typing import List
3
+ import fitz # PyMuPDF
4
 
5
 
6
  class TextFileLoader:
 
12
  def load(self):
13
  if os.path.isdir(self.path):
14
  self.load_directory()
15
+ elif os.path.isfile(self.path):
16
+ if self.path.endswith(".txt"):
17
+ self.load_file()
18
+ elif self.path.endswith(".pdf"):
19
+ self.load_pdf()
20
+ else:
21
+ raise ValueError("Unsupported file type. Only .txt and .pdf files are supported.")
22
  else:
23
+ raise ValueError("Provided path is neither a valid directory nor a file.")
 
 
24
 
25
  def load_file(self):
26
  with open(self.path, "r", encoding=self.encoding) as f:
27
  self.documents.append(f.read())
28
 
29
+ def load_pdf(self):
30
+ with fitz.open(self.path) as doc:
31
+ text = ""
32
+ for page in doc:
33
+ text += page.get_text("text")
34
+ self.documents.append(text)
35
+
36
  def load_directory(self):
37
  for root, _, files in os.walk(self.path):
38
  for file in files:
39
+ file_path = os.path.join(root, file)
40
  if file.endswith(".txt"):
41
+ with open(file_path, "r", encoding=self.encoding) as f:
 
 
42
  self.documents.append(f.read())
43
+ elif file.endswith(".pdf"):
44
+ with fitz.open(file_path) as doc:
45
+ text = ""
46
+ for page in doc:
47
+ text += page.get_text("text")
48
+ self.documents.append(text)
49
 
50
  def load_documents(self):
51
  self.load()
 
68
  def split(self, text: str) -> List[str]:
69
  chunks = []
70
  for i in range(0, len(text), self.chunk_size - self.chunk_overlap):
71
+ chunks.append(text[i: i + self.chunk_size])
72
  return chunks
73
 
74
  def split_texts(self, texts: List[str]) -> List[str]:
 
79
 
80
 
81
  if __name__ == "__main__":
82
+ # Example usage with a PDF file
83
+ loader = TextFileLoader("data/sample.pdf")
84
  loader.load()
85
  splitter = CharacterTextSplitter()
86
  chunks = splitter.split_texts(loader.documents)
app.py CHANGED
@@ -11,9 +11,10 @@ from aimakerspace.openai_utils.embedding import EmbeddingModel
11
  from aimakerspace.vectordatabase import VectorDatabase
12
  from aimakerspace.openai_utils.chatmodel import ChatOpenAI
13
  import chainlit as cl
 
14
 
15
  system_template = """\
16
- Use the following context to answer a users question. If you cannot find the answer in the context, say you don't know the answer."""
17
  system_role_prompt = SystemRolePrompt(system_template)
18
 
19
  user_prompt_template = """\
@@ -49,18 +50,38 @@ class RetrievalAugmentedQAPipeline:
49
 
50
  text_splitter = CharacterTextSplitter()
51
 
52
-
53
  def process_text_file(file: AskFileResponse):
54
  import tempfile
55
 
56
- with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as temp_file:
57
- temp_file_path = temp_file.name
 
 
 
 
 
 
 
 
 
58
 
59
- with open(temp_file_path, "wb") as f:
60
- f.write(file.content)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
- text_loader = TextFileLoader(temp_file_path)
63
- documents = text_loader.load_documents()
64
  texts = text_splitter.split_texts(documents)
65
  return texts
66
 
@@ -70,10 +91,10 @@ async def on_chat_start():
70
  files = None
71
 
72
  # Wait for the user to upload a file
73
- while files == None:
74
  files = await cl.AskFileMessage(
75
- content="Please upload a Text File file to begin!",
76
- accept=["text/plain"],
77
  max_size_mb=2,
78
  timeout=180,
79
  ).send()
@@ -85,7 +106,7 @@ async def on_chat_start():
85
  )
86
  await msg.send()
87
 
88
- # load the file
89
  texts = process_text_file(file)
90
 
91
  print(f"Processing {len(texts)} text chunks")
@@ -119,4 +140,4 @@ async def main(message):
119
  async for stream_resp in result["response"]:
120
  await msg.stream_token(stream_resp)
121
 
122
- await msg.send()
 
11
  from aimakerspace.vectordatabase import VectorDatabase
12
  from aimakerspace.openai_utils.chatmodel import ChatOpenAI
13
  import chainlit as cl
14
+ import fitz # PyMuPDF for PDF reading
15
 
16
  system_template = """\
17
+ Use the following context to answer a user's question. If you cannot find the answer in the context, say you don't know the answer."""
18
  system_role_prompt = SystemRolePrompt(system_template)
19
 
20
  user_prompt_template = """\
 
50
 
51
  text_splitter = CharacterTextSplitter()
52
 
 
53
  def process_text_file(file: AskFileResponse):
54
  import tempfile
55
 
56
+ file_extension = os.path.splitext(file.name)[-1].lower()
57
+
58
+ if file_extension == ".txt":
59
+ with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as temp_file:
60
+ temp_file_path = temp_file.name
61
+
62
+ with open(temp_file_path, "wb") as f:
63
+ f.write(file.content)
64
+
65
+ text_loader = TextFileLoader(temp_file_path)
66
+ documents = text_loader.load_documents()
67
 
68
+ elif file_extension == ".pdf":
69
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
70
+ temp_file_path = temp_file.name
71
+
72
+ with open(temp_file_path, "wb") as f:
73
+ f.write(file.content)
74
+
75
+ documents = []
76
+ with fitz.open(temp_file_path) as doc:
77
+ text = ""
78
+ for page in doc:
79
+ text += page.get_text("text")
80
+ documents.append(text)
81
+
82
+ else:
83
+ raise ValueError("Unsupported file type. Please upload a .txt or .pdf file.")
84
 
 
 
85
  texts = text_splitter.split_texts(documents)
86
  return texts
87
 
 
91
  files = None
92
 
93
  # Wait for the user to upload a file
94
+ while files is None:
95
  files = await cl.AskFileMessage(
96
+ content="Please upload a Text File or PDF to begin!",
97
+ accept=["text/plain", "application/pdf"],
98
  max_size_mb=2,
99
  timeout=180,
100
  ).send()
 
106
  )
107
  await msg.send()
108
 
109
+ # Load the file
110
  texts = process_text_file(file)
111
 
112
  print(f"Processing {len(texts)} text chunks")
 
140
  async for stream_resp in result["response"]:
141
  await msg.stream_token(stream_resp)
142
 
143
+ await msg.send()