captain-awesome committed
Commit ef01944 · Parent: 6b347f1

Update app.py

Files changed (1)
  1. app.py +62 -28
app.py CHANGED
@@ -45,6 +45,24 @@ import os
 from langchain.llms import CTransformers
 import streamlit as st
 
+FILE_LOADER_MAPPING = {
+    ".csv": (CSVLoader, {"encoding": "utf-8"}),
+    ".doc": (UnstructuredWordDocumentLoader, {}),
+    ".docx": (UnstructuredWordDocumentLoader, {}),
+    ".enex": (EverNoteLoader, {}),
+    ".epub": (UnstructuredEPubLoader, {}),
+    ".html": (UnstructuredHTMLLoader, {}),
+    ".md": (UnstructuredMarkdownLoader, {}),
+    ".odt": (UnstructuredODTLoader, {}),
+    ".pdf": (PyPDFLoader, {}),
+    ".ppt": (UnstructuredPowerPointLoader, {}),
+    ".pptx": (UnstructuredPowerPointLoader, {}),
+    ".txt": (TextLoader, {"encoding": "utf8"}),
+    ".ipynb": (NotebookLoader, {}),
+    ".py": (PythonLoader, {}),
+    # Add more mappings for other file extensions and loaders as needed
+}
+
 def load_model():
     # model_path=HuggingFaceHub(repo_id="vilsonrodrigues/falcon-7b-instruct-sharded")
 
@@ -94,9 +112,24 @@ def load_model():
         # temperature=temperature, # type: ignore
     )
     return llm
-
 
-def create_vector_database(uploaded_files):
+def load_document(
+    file_path: str,
+    mapping: dict = FILE_LOADER_MAPPING,
+    default_loader: BaseLoader = UnstructuredFileLoader,
+) -> Document:
+    # Choose loader from mapping, load default if no match found
+    ext = "." + file_path.rsplit(".", 1)[-1]
+    if ext in mapping:
+        loader_class, loader_args = mapping[ext]
+        loader = loader_class(file_path, **loader_args)
+    else:
+        loader = default_loader(file_path)
+    loaded_documents = []
+    loaded_documents.extend(loader.load())
+    return loaded_documents
+
+def create_vector_database(loaded_documents):
     # DB_DIR: str = os.path.join(ABS_PATH, "db")
     """
     Creates a vector database using document loaders and embeddings.
@@ -107,21 +140,21 @@ def create_vector_database(uploaded_files):
 
     """
     # Initialize loaders for different file types
-    loaders = {
-        "pdf": UnstructuredPDFLoader,
-        "md": UnstructuredMarkdownLoader,
-        "txt": TextLoader,
-        "csv": CSVLoader,
-        "py": PythonLoader,
-        "epub": UnstructuredEPubLoader,
-        "html": UnstructuredHTMLLoader,
-        "ppt": UnstructuredPowerPointLoader,
-        "pptx": UnstructuredPowerPointLoader,
-        "doc": UnstructuredWordDocumentLoader,
-        "docx": UnstructuredWordDocumentLoader,
-        "odt": UnstructuredODTLoader,
-        "ipynb": NotebookLoader
-    }
+    # loaders = {
+    #     "pdf": UnstructuredPDFLoader,
+    #     "md": UnstructuredMarkdownLoader,
+    #     "txt": TextLoader,
+    #     "csv": CSVLoader,
+    #     "py": PythonLoader,
+    #     "epub": UnstructuredEPubLoader,
+    #     "html": UnstructuredHTMLLoader,
+    #     "ppt": UnstructuredPowerPointLoader,
+    #     "pptx": UnstructuredPowerPointLoader,
+    #     "doc": UnstructuredWordDocumentLoader,
+    #     "docx": UnstructuredWordDocumentLoader,
+    #     "odt": UnstructuredODTLoader,
+    #     "ipynb": NotebookLoader
+    # }
     # pdf_loader = DirectoryLoader("data/", glob="**/*.pdf", loader_cls=PyPDFLoader)
     # markdown_loader = DirectoryLoader("data/", glob="**/*.md", loader_cls=UnstructuredMarkdownLoader)
     # text_loader = DirectoryLoader("data/", glob="**/*.txt", loader_cls=TextLoader)
@@ -154,17 +187,17 @@ def create_vector_database(uploaded_files):
     # }
 
     # Load documents from uploaded files using the appropriate loaders
-    loaded_documents = []
-    for uploaded_file in uploaded_files:
-        # file_extension = os.path.splitext(uploaded_file.name)[-1].lower()[1:]
-        file_extension = os.path.splitext(uploaded_file.name)[-1][1:].lower()
-        if file_extension in loaders:
-            # Read the content of the uploaded file
-            file_content = uploaded_file.read()
+    # loaded_documents = []
+    # for uploaded_file in uploaded_files:
+    #     # file_extension = os.path.splitext(uploaded_file.name)[-1].lower()[1:]
+    #     file_extension = os.path.splitext(uploaded_file.name)[-1][1:].lower()
+    #     if file_extension in loaders:
+    #         # Read the content of the uploaded file
+    #         file_content = uploaded_file.read()
 
-            # Pass the content to the loader for processing
-            loader = loaders[file_extension](file_content)
-            loaded_documents.extend(loader.load())
+    #         # Pass the content to the loader for processing
+    #         loader = loaders[file_extension](file_content)
+    #         loaded_documents.extend(loader.load())
     # loader = loaders[file_extension](uploaded_file)
     # # loader = loader_cls.load(uploaded_file.name) # Pass the file path to the loader constructor
     # # # content = uploaded_file.read() # Read the file content
@@ -334,7 +367,8 @@ def main():
     llm = load_model()
     prompt = set_custom_prompt()
    CONDENSE_QUESTION_PROMPT = set_custom_prompt_condense()
-    db = create_vector_database(uploaded_files)
+    loaded_documents = load_document(uploaded_files)
+    db = create_vector_database(loaded_documents)
     response = retrieve_bot_answer(query)
 
     # Display bot response
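
A note on how these pieces fit together: the new load_document helper is annotated as taking a single file_path: str (and returns a list of documents despite the -> Document annotation), while main() passes the whole uploaded_files list from Streamlit's file uploader. The sketch below shows one way the call is presumably meant to work, writing each upload to a temporary file and loading it individually; the load_uploaded_files name and the temp-directory handling are illustrative and not part of this commit.

    import os
    import tempfile

    def load_uploaded_files(uploaded_files):
        """Hypothetical glue code: adapt Streamlit UploadedFile objects to load_document()."""
        loaded_documents = []
        temp_dir = tempfile.mkdtemp()  # scratch directory for the uploaded files
        for uploaded_file in uploaded_files:  # from st.file_uploader(..., accept_multiple_files=True)
            temp_path = os.path.join(temp_dir, uploaded_file.name)
            with open(temp_path, "wb") as f:
                f.write(uploaded_file.getbuffer())  # persist the in-memory upload to disk
            # load_document() picks a loader from FILE_LOADER_MAPPING based on the extension
            loaded_documents.extend(load_document(temp_path))
        return loaded_documents

If adopted, this would be called from main() in place of the direct load_document(uploaded_files) call.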
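
For context on the downstream step, the docstring of create_vector_database says it "creates a vector database using document loaders and embeddings", but its body lies outside the hunks shown here. Below is a minimal sketch of that step using the classic LangChain API, assuming HuggingFaceEmbeddings plus a persisted Chroma store with illustrative chunk sizes and model name; none of these choices are confirmed by this diff.

    from langchain.embeddings import HuggingFaceEmbeddings
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from langchain.vectorstores import Chroma

    def create_vector_database_sketch(loaded_documents, persist_directory="db"):
        # Split the loaded documents into overlapping chunks for retrieval
        splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        chunks = splitter.split_documents(loaded_documents)

        # Embed the chunks and persist them in a local Chroma store
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        db = Chroma.from_documents(chunks, embeddings, persist_directory=persist_directory)
        db.persist()
        return db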