devve1 committed on
Commit
e8532d4
1 Parent(s): 50c35e3

Update app.py

Files changed (1)
app.py +19 -11
app.py CHANGED
@@ -283,18 +283,26 @@ def load_models_and_documents():
 
 def chunk_documents(texts: List[str], metadatas: List[dict], dense_model: OptimumEncoder, sparse_model: SparseTextEmbedding):
     import time
-    text_splitter = StatisticalChunker(
-        dense_model
+    text_splitter = SemanticChunker(
+        dense_model,
+        breakpoint_threshold_type='standard_deviation'
     )
-    start = time.time()
-    chunks = text_splitter(docs=texts, metadatas=metadatas)
-    end = time.time()
-    final = end - start
-    print(f'FINAL CHUNKING TIME: {final}')
-
-    documents_and_metadatas = [(chunk.content, chunk.metadata) for sub_chunk in chunks for chunk in sub_chunk]
-    documents, metadatas_docs = [list(t) for t in zip(*documents_and_metadatas)]
-    print(f'CHUNKS : {documents}')
+    _metadatas = metadatas or [{}] * len(texts)
+    documents = []
+    metadatas_docs = []
+
+    def create_document(text: str, i: int, _metadatas):
+        index = -1
+        for chunk in text_splitter.split_text(text):
+            metadata = copy.deepcopy(_metadatas[i])
+            if text_splitter._add_start_index:
+                index = text.find(chunk, index + 1)
+                metadata['start_index'] = index
+            documents.append(chunk)
+            metadatas_docs.append(metadata)
+
+    joblib.Parallel(n_jobs=joblib.cpu_count(), verbose=1, require='sharedmem')(
+        joblib.delayed(create_document)(text, i, _metadatas) for i, text in enumerate(texts))
 
     start_dense = time.time()
     dense_embeddings = dense_model(documents, 32, convert_to_numpy=True)
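
For context on the new splitter: SemanticChunker from langchain_experimental embeds each sentence and starts a new chunk wherever the distance between neighbouring sentence embeddings crosses a breakpoint threshold; with breakpoint_threshold_type='standard_deviation' that threshold sits a number of standard deviations above the mean distance. Below is a minimal, self-contained sketch of the same call pattern. ToyEmbeddings is a hypothetical stand-in for dense_model, since SemanticChunker only needs an object implementing LangChain's Embeddings interface:

from langchain_core.embeddings import Embeddings
from langchain_experimental.text_splitter import SemanticChunker

class ToyEmbeddings(Embeddings):
    """Hypothetical stand-in for dense_model; returns cheap 2-d vectors."""
    def embed_documents(self, texts):
        return [[len(t) / 100.0, t.count(' ') / 10.0 + 0.1] for t in texts]

    def embed_query(self, text):
        return self.embed_documents([text])[0]

splitter = SemanticChunker(
    ToyEmbeddings(),
    # split where the sentence-to-sentence embedding distance rises
    # above mean + k * stddev
    breakpoint_threshold_type='standard_deviation',
    add_start_index=True,  # exposed on the instance as _add_start_index
)
chunks = splitter.split_text(
    'Cats purr. Cats nap all day. GPUs multiply matrices. GPUs run hot.'
)
print(chunks)  # list[str]; boundaries follow embedding-distance jumps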
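
The new parallel loop depends on joblib's require='sharedmem', which pins execution to a thread-based backend so every worker appends into the same documents and metadatas_docs lists rather than into pickled per-process copies. A small sketch of that pattern in isolation, with illustrative names (work, results) that are not from the commit:

import joblib

results = []  # shared: threads mutate this one list in place

def work(x):
    results.append(x * x)  # list.append is atomic under the GIL

# require='sharedmem' forces a thread-based backend; a process-based
# backend would hand each worker a copy and results would stay empty.
joblib.Parallel(n_jobs=joblib.cpu_count(), require='sharedmem')(
    joblib.delayed(work)(x) for x in range(8)
)
print(sorted(results))  # [0, 1, 4, 9, 16, 25, 36, 49]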