Update app.py
Browse files
app.py
CHANGED
@@ -287,22 +287,11 @@ def chunk_documents(texts: List[str], metadatas: List[dict], dense_model: Optimu
|
|
287 |
dense_model,
|
288 |
breakpoint_threshold_type='standard_deviation'
|
289 |
)
|
290 |
-
|
291 |
-
documents = []
|
292 |
-
|
293 |
-
|
294 |
-
|
295 |
-
index = -1
|
296 |
-
for chunk in text_splitter.split_text(text):
|
297 |
-
metadata = copy.deepcopy(_metadatas[i])
|
298 |
-
if text_splitter._add_start_index:
|
299 |
-
index = text.find(chunk, index + 1)
|
300 |
-
metadata['start_index'] = index
|
301 |
-
documents.append(chunk)
|
302 |
-
metadatas_docs.append(metadata)
|
303 |
-
|
304 |
-
joblib.Parallel(n_jobs=joblib.cpu_count(), verbose=1, require='sharedmem')(
|
305 |
-
joblib.delayed(create_document)(text, i, _metadatas) for i, text in enumerate(texts))
|
306 |
|
307 |
start_dense = time.time()
|
308 |
dense_embeddings = dense_model.embed_documents(documents, 32, convert_to_numpy=True)
|
|
|
287 |
dense_model,
|
288 |
breakpoint_threshold_type='standard_deviation'
|
289 |
)
|
290 |
+
docs = text_splitter.create_documents(texts, metadatas)
|
291 |
+
# Unzip the chunks returned by create_documents() into parallel sequences of
# text and metadata. Iterate `docs` (the splitter output assigned just above),
# not `documents`, which is the name this very statement assigns. The
# conditional keeps the two-name unpacking valid when no chunks were produced,
# since zip(*[]) yields nothing to unpack.
documents, metadatas_docs = zip(*[(doc.page_content, doc.metadata) for doc in docs]) if docs else ((), ())
|
292 |
+
|
293 |
+
documents = list(documents)
|
294 |
+
metadatas_docs = list(metadatas_docs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
295 |
|
296 |
start_dense = time.time()
|
297 |
dense_embeddings = dense_model.embed_documents(documents, 32, convert_to_numpy=True)
|