devve1 committed on
Commit
b8c4816
1 Parent(s): 310438e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -16
app.py CHANGED
@@ -287,22 +287,11 @@ def chunk_documents(texts: List[str], metadatas: List[dict], dense_model: Optimu
287
  dense_model,
288
  breakpoint_threshold_type='standard_deviation'
289
  )
290
- _metadatas = metadatas or [{}] * len(texts)
291
- documents = []
292
- metadatas_docs = []
293
-
294
- def create_document(text: str, i: int, _metadatas):
295
- index = -1
296
- for chunk in text_splitter.split_text(text):
297
- metadata = copy.deepcopy(_metadatas[i])
298
- if text_splitter._add_start_index:
299
- index = text.find(chunk, index + 1)
300
- metadata['start_index'] = index
301
- documents.append(chunk)
302
- metadatas_docs.append(metadata)
303
-
304
- joblib.Parallel(n_jobs=joblib.cpu_count(), verbose=1, require='sharedmem')(
305
- joblib.delayed(create_document)(text, i, _metadatas) for i, text in enumerate(texts))
306
 
307
  start_dense = time.time()
308
  dense_embeddings = dense_model.embed_documents(documents, 32, convert_to_numpy=True)
 
287
  dense_model,
288
  breakpoint_threshold_type='standard_deviation'
289
  )
290
+ docs = text_splitter.create_documents(texts, metadatas)
291
+ documents, metadatas_docs = zip(*[(doc.page_content, doc.metadata) for doc in documents])
292
+
293
+ documents = list(documents)
294
+ metadatas_docs = list(metadatas_docs)
 
 
 
 
 
 
 
 
 
 
 
295
 
296
  start_dense = time.time()
297
  dense_embeddings = dense_model.embed_documents(documents, 32, convert_to_numpy=True)