Spaces: Runtime error

Commit f241b97
1 Parent(s): 523d9c4

Update app.py

app.py CHANGED
@@ -149,75 +149,6 @@ def create_vector_database(loaded_documents):
     and finally persists the embeddings into a Chroma vector database.
 
     """
-    # Initialize loaders for different file types
-    # loaders = {
-    #     "pdf": UnstructuredPDFLoader,
-    #     "md": UnstructuredMarkdownLoader,
-    #     "txt": TextLoader,
-    #     "csv": CSVLoader,
-    #     "py": PythonLoader,
-    #     "epub": UnstructuredEPubLoader,
-    #     "html": UnstructuredHTMLLoader,
-    #     "ppt": UnstructuredPowerPointLoader,
-    #     "pptx": UnstructuredPowerPointLoader,
-    #     "doc": UnstructuredWordDocumentLoader,
-    #     "docx": UnstructuredWordDocumentLoader,
-    #     "odt": UnstructuredODTLoader,
-    #     "ipynb": NotebookLoader
-    # }
-    # pdf_loader = DirectoryLoader("data/", glob="**/*.pdf", loader_cls=PyPDFLoader)
-    # markdown_loader = DirectoryLoader("data/", glob="**/*.md", loader_cls=UnstructuredMarkdownLoader)
-    # text_loader = DirectoryLoader("data/", glob="**/*.txt", loader_cls=TextLoader)
-    # csv_loader = DirectoryLoader("data/", glob="**/*.csv", loader_cls=CSVLoader)
-    # python_loader = DirectoryLoader("data/", glob="**/*.py", loader_cls=PythonLoader)
-    # epub_loader = DirectoryLoader("data/", glob="**/*.epub", loader_cls=UnstructuredEPubLoader)
-    # html_loader = DirectoryLoader("data/", glob="**/*.html", loader_cls=UnstructuredHTMLLoader)
-    # ppt_loader = DirectoryLoader("data/", glob="**/*.ppt", loader_cls=UnstructuredPowerPointLoader)
-    # pptx_loader = DirectoryLoader("data/", glob="**/*.pptx", loader_cls=UnstructuredPowerPointLoader)
-    # doc_loader = DirectoryLoader("data/", glob="**/*.doc", loader_cls=UnstructuredWordDocumentLoader)
-    # docx_loader = DirectoryLoader("data/", glob="**/*.docx", loader_cls=UnstructuredWordDocumentLoader)
-    # odt_loader = DirectoryLoader("data/", glob="**/*.odt", loader_cls=UnstructuredODTLoader)
-    # notebook_loader = DirectoryLoader("data/", glob="**/*.ipynb", loader_cls=NotebookLoader)
-    # FILE_LOADER_MAPPING = {
-    #     ".csv": (CSVLoader, {"encoding": "utf-8"}),
-    #     ".doc": (UnstructuredWordDocumentLoader, {}),
-    #     ".docx": (UnstructuredWordDocumentLoader, {}),
-    #     ".enex": (EverNoteLoader, {}),
-    #     ".epub": (UnstructuredEPubLoader, {}),
-    #     ".html": (UnstructuredHTMLLoader, {}),
-    #     ".md": (UnstructuredMarkdownLoader, {}),
-    #     ".odt": (UnstructuredODTLoader, {}),
-    #     ".pdf": (PyPDFLoader, {}),
-    #     ".ppt": (UnstructuredPowerPointLoader, {}),
-    #     ".pptx": (UnstructuredPowerPointLoader, {}),
-    #     ".txt": (TextLoader, {"encoding": "utf8"}),
-    #     ".ipynb": (NotebookLoader, {}),
-    #     ".py": (PythonLoader, {}),
-    #     # Add more mappings for other file extensions and loaders as needed
-    # }
-
-    # Load documents from uploaded files using the appropriate loaders
-    # loaded_documents = []
-    # for uploaded_file in uploaded_files:
-    #     # file_extension = os.path.splitext(uploaded_file.name)[-1].lower()[1:]
-    #     file_extension = os.path.splitext(uploaded_file.name)[-1][1:].lower()
-    #     if file_extension in loaders:
-    #         # Read the content of the uploaded file
-    #         file_content = uploaded_file.read()
-
-    #         # Pass the content to the loader for processing
-    #         loader = loaders[file_extension](file_content)
-    #         loaded_documents.extend(loader.load())
-    #         loader = loaders[file_extension](uploaded_file)
-    #         # loader = loader_cls.load(uploaded_file.name) # Pass the file path to the loader constructor
-    #         # # content = uploaded_file.read() # Read the file content
-    #         loaded_documents.extend(loader.load())
-
-    # all_loaders = [pdf_loader, markdown_loader, text_loader, csv_loader, python_loader, epub_loader, html_loader, ppt_loader, pptx_loader, doc_loader, docx_loader, odt_loader, notebook_loader]
-
-    # Load documents from all loaders
-    # for loader in all_loaders:
-    #     loaded_documents.extend(loader.load())
 
     # Split loaded documents into chunks
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=40)
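
Note: the block removed by this commit sketched, entirely in commented-out form, a per-file-type loader dispatch for user uploads. A minimal runnable sketch of that idea, assuming the langchain_community loaders used by app.py and assuming uploads arrive as Streamlit-style UploadedFile objects that have to be written to a temporary path first (these loader classes take a file path, not raw bytes):

import os
import tempfile

from langchain_community.document_loaders import (
    CSVLoader,
    PyPDFLoader,
    TextLoader,
    UnstructuredMarkdownLoader,
    UnstructuredWordDocumentLoader,
)

# Extension -> (loader class, loader kwargs), in the spirit of the removed
# FILE_LOADER_MAPPING (subset shown; extend as needed).
FILE_LOADER_MAPPING = {
    ".csv": (CSVLoader, {"encoding": "utf-8"}),
    ".docx": (UnstructuredWordDocumentLoader, {}),
    ".md": (UnstructuredMarkdownLoader, {}),
    ".pdf": (PyPDFLoader, {}),
    ".txt": (TextLoader, {"encoding": "utf-8"}),
}

def load_uploaded_files(uploaded_files):
    """Load uploaded files with the loader matching their extension."""
    loaded_documents = []
    for uploaded_file in uploaded_files:
        ext = os.path.splitext(uploaded_file.name)[-1].lower()
        if ext not in FILE_LOADER_MAPPING:
            continue  # skip unsupported file types
        loader_cls, loader_kwargs = FILE_LOADER_MAPPING[ext]
        # Path-based loaders cannot consume raw bytes, so write the upload
        # to a temporary file and hand the loader that path.
        with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
            tmp.write(uploaded_file.read())
            tmp_path = tmp.name
        try:
            loader = loader_cls(tmp_path, **loader_kwargs)
            loaded_documents.extend(loader.load())
        finally:
            os.remove(tmp_path)
    return loaded_documents

The temp-file step is what the commented-out version lacked: passing uploaded_file.read() bytes straight to a loader constructor, as old line 209 did, would not work with path-based loaders.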
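
The removed block also carried a second, folder-based alternative: one DirectoryLoader per file type scanning data/. A short sketch of that pattern for two of the removed loader types, using the same folder and glob patterns as the old code:

from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader, TextLoader

# One DirectoryLoader per file type, as in the removed (commented-out) variant.
pdf_loader = DirectoryLoader("data/", glob="**/*.pdf", loader_cls=PyPDFLoader)
text_loader = DirectoryLoader("data/", glob="**/*.txt", loader_cls=TextLoader)

loaded_documents = []
for loader in (pdf_loader, text_loader):
    loaded_documents.extend(loader.load())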
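
After this commit, create_vector_database keeps only the chunking step (line 154 of the new file) plus the embedding and Chroma persistence described in its docstring. A hedged sketch of how that remainder typically fits together; the embedding model and persist directory below are illustrative assumptions, not values taken from the diff:

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

def create_vector_database(loaded_documents):
    """Chunk the documents, embed the chunks, and persist a Chroma store."""
    # Same splitter parameters as the line kept by this commit.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=40)
    chunked_documents = text_splitter.split_documents(loaded_documents)

    # Embedding model and persist directory are assumptions for this sketch.
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vector_database = Chroma.from_documents(
        documents=chunked_documents,
        embedding=embeddings,
        persist_directory="chroma_db",
    )
    vector_database.persist()  # older Chroma needs this; newer chromadb persists automatically
    return vector_database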