devve1 committed on
Commit
892dfd9
1 Parent(s): 15f1836

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -9
app.py CHANGED
@@ -341,18 +341,27 @@ def chunk_documents(texts, metadatas, dense_model, sparse_model):
341
  dense_model,
342
  breakpoint_threshold_type='standard_deviation'
343
  )
344
- docs = []
345
- for (text, metadata) in zip(texts, metadatas):
346
- docs.append(Document(page_content=text, metadata=metadata))
 
 
 
 
 
 
 
 
 
 
347
 
348
- chunks = [doc for doc in text_splitter.transform_documents(docs)]
349
- texts_docs = [chunk.page_content for chunk in chunks]
350
- metadatas_docs = [chunk.metadata for chunk in chunks]
351
 
352
- dense_embeddings = dense_model.embed_documents(texts_docs, 32)
353
- sparse_embeddings = list(sparse_model.embed(texts_docs, 32))
354
 
355
- return texts_docs, metadatas_docs, dense_embeddings, sparse_embeddings
356
 
357
  def on_change_documents_only():
358
  st.session_state.qa_prompt = lambda query, context: (
 
341
  dense_model,
342
  breakpoint_threshold_type='standard_deviation'
343
  )
344
+ _metadatas = metadatas or [{}] * len(texts)
345
+ documents = []
346
+ metadatas_docs = []
347
+
348
+ def create_document(text: str, i: int, _metadatas):
349
+ index = -1
350
+ for chunk in text_splitter.split_text(text):
351
+ metadata = copy.deepcopy(_metadatas[i])
352
+ if text_splitter._add_start_index:
353
+ index = text.find(chunk, index + 1)
354
+ metadata['start_index'] = index
355
+ documents.append(chunk)
356
+ metadatas_docs.append(metadata)
357
 
358
+ joblib.Parallel(n_jobs=joblib.cpu_count(), verbose=1, require='sharedmem')(
359
+ joblib.delayed(create_document)(text, i, _metadatas) for i, text in enumerate(texts))
 
360
 
361
+ dense_embeddings = dense_model.embed_documents(documents, 32)
362
+ sparse_embeddings = list(sparse_model.embed(documents, 32))
363
 
364
+ return documents, metadatas_docs, dense_embeddings, sparse_embeddings
365
 
366
  def on_change_documents_only():
367
  st.session_state.qa_prompt = lambda query, context: (