Update app.py
app.py (CHANGED)
@@ -283,18 +283,26 @@ def load_models_and_documents():
 
 def chunk_documents(texts: List[str], metadatas: List[dict], dense_model: OptimumEncoder, sparse_model: SparseTextEmbedding):
     import time
-    text_splitter =
-        dense_model
+    text_splitter = SemanticChunker(
+        dense_model,
+        breakpoint_threshold_type='standard_deviation'
     )
-    … (old lines 289-297 not shown)
+    _metadatas = metadatas or [{}] * len(texts)
+    documents = []
+    metadatas_docs = []
+
+    def create_document(text: str, i: int, _metadatas):
+        index = -1
+        for chunk in text_splitter.split_text(text):
+            metadata = copy.deepcopy(_metadatas[i])
+            if text_splitter._add_start_index:
+                index = text.find(chunk, index + 1)
+                metadata['start_index'] = index
+            documents.append(chunk)
+            metadatas_docs.append(metadata)
+
+    joblib.Parallel(n_jobs=joblib.cpu_count(), verbose=1, require='sharedmem')(
+        joblib.delayed(create_document)(text, i, _metadatas) for i, text in enumerate(texts))
 
     start_dense = time.time()
     dense_embeddings = dense_model(documents, 32, convert_to_numpy=True)
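The new chunking path combines two pieces worth a standalone illustration.

LangChain's SemanticChunker embeds adjacent sentence groups and splits where the embedding distance jumps past a threshold; with breakpoint_threshold_type='standard_deviation' that threshold is derived from the standard deviation of those distances. A minimal sketch, assuming an off-the-shelf HuggingFaceEmbeddings wrapper stands in for the app's OptimumEncoder (the model name is chosen only for illustration):

    from langchain_experimental.text_splitter import SemanticChunker
    from langchain_huggingface import HuggingFaceEmbeddings  # assumed stand-in for OptimumEncoder

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    splitter = SemanticChunker(
        embeddings,
        breakpoint_threshold_type="standard_deviation",
        add_start_index=True,  # sets the _add_start_index flag checked in the diff above
    )
    chunks = splitter.split_text("Cats are small felines. They purr. GPUs execute kernels in parallel.")

The joblib.Parallel call passes require='sharedmem', which forces the threading backend so every worker appends into the same in-process documents and metadatas_docs lists rather than returning pickled copies from subprocesses. A self-contained sketch of that pattern (fake_split and the sample inputs are placeholders, not part of app.py):

    import copy
    import joblib

    texts = ["first doc. more text.", "second doc."]
    metadatas = [{"source": "a.txt"}, {"source": "b.txt"}]

    documents, metadatas_docs = [], []   # shared state, mutated by every worker

    def fake_split(text):                # stand-in for text_splitter.split_text
        return [s for s in text.split(". ") if s]

    def create_document(text, i):
        for chunk in fake_split(text):
            documents.append(chunk)      # appends land in the shared lists (threading backend)
            metadatas_docs.append(copy.deepcopy(metadatas[i]))

    # require='sharedmem' selects a backend with shared-memory semantics (threads)
    joblib.Parallel(n_jobs=joblib.cpu_count(), require='sharedmem')(
        joblib.delayed(create_document)(text, i) for i, text in enumerate(texts))

    print(documents)
    print(metadatas_docs)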