Update app.py
Browse files
app.py
CHANGED
@@ -341,18 +341,27 @@ def chunk_documents(texts, metadatas, dense_model, sparse_model):
|
|
341 |
dense_model,
|
342 |
breakpoint_threshold_type='standard_deviation'
|
343 |
)
|
344 |
-
|
345 |
-
|
346 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
347 |
|
348 |
-
|
349 |
-
|
350 |
-
metadatas_docs = [chunk.metadata for chunk in chunks]
|
351 |
|
352 |
-
dense_embeddings = dense_model.embed_documents(
|
353 |
-
sparse_embeddings = list(sparse_model.embed(
|
354 |
|
355 |
-
return
|
356 |
|
357 |
def on_change_documents_only():
|
358 |
st.session_state.qa_prompt = lambda query, context: (
|
|
|
341 |
dense_model,
|
342 |
breakpoint_threshold_type='standard_deviation'
|
343 |
)
|
344 |
+
_metadatas = metadatas or [{}] * len(texts)
|
345 |
+
documents = []
|
346 |
+
metadatas_docs = []
|
347 |
+
|
348 |
+
import threading  # local import: supplies the lock that guards the shared result lists


def create_document(text: str, i: int, _metadatas, _lock=threading.Lock()):
    """Split one source text into chunks and record them with their metadata.

    The chunks are appended to the enclosing ``documents`` list and their
    metadata dicts to the enclosing ``metadatas_docs`` list, so that
    ``documents[k]`` and ``metadatas_docs[k]`` always describe the same chunk.

    This helper is dispatched through ``joblib.Parallel(..., require='sharedmem')``,
    so several calls may run concurrently against the same two lists.
    Appending to them one after the other is not atomic as a pair: another
    worker can slip in between the two appends and shift one list relative to
    the other.  To prevent that, results are collected locally and published
    to both lists inside a single critical section.  As a side benefit, all
    chunks of one text stay contiguous in the output.

    Args:
        text: The source text to split.
        i: Index of ``text``; selects the matching entry of ``_metadatas``.
        _metadatas: Per-text metadata dicts.  Each chunk receives its own deep
            copy, so the caller's dicts are never mutated.
        _lock: Internal lock shared by all calls of this function object.  The
            default is created once, when this ``def`` statement executes.
            Callers are not expected to pass it.

    Returns:
        None.  Results are published by extending ``documents`` and
        ``metadatas_docs``.
    """
    chunks = []
    chunk_metadatas = []
    index = -1
    for chunk in text_splitter.split_text(text):
        metadata = copy.deepcopy(_metadatas[i])
        if text_splitter._add_start_index:
            # Search after the previous hit so repeated chunks get increasing
            # offsets; ``str.find`` yields -1 when the chunk is not found.
            index = text.find(chunk, index + 1)
            metadata['start_index'] = index
        chunks.append(chunk)
        chunk_metadatas.append(metadata)

    # Publish both lists atomically so chunk/metadata positions cannot drift
    # apart when other workers are appending at the same time.
    with _lock:
        documents.extend(chunks)
        metadatas_docs.extend(chunk_metadatas)
|
357 |
|
358 |
+
joblib.Parallel(n_jobs=joblib.cpu_count(), verbose=1, require='sharedmem')(
|
359 |
+
joblib.delayed(create_document)(text, i, _metadatas) for i, text in enumerate(texts))
|
|
|
360 |
|
361 |
+
dense_embeddings = dense_model.embed_documents(documents, 32)
|
362 |
+
sparse_embeddings = list(sparse_model.embed(documents, 32))
|
363 |
|
364 |
+
return documents, metadatas_docs, dense_embeddings, sparse_embeddings
|
365 |
|
366 |
def on_change_documents_only():
|
367 |
st.session_state.qa_prompt = lambda query, context: (
|