devve1 committed
Commit 03c7545
1 Parent(s): 19bdcd8

Update app.py

Files changed (1)
  1. app.py +17 -9
app.py CHANGED
@@ -1,5 +1,7 @@
 import os
 import re
+import nltk
+import copy
 import time
 import joblib
 import msgpack
@@ -196,6 +198,9 @@ def load_models_and_documents():
         providers=provider
     )
 
+    nltk.download('punkt')
+    nltk.download('averaged_perceptron_tagger')
+
     client = QdrantClient(':memory:')
     collection_name = 'collection_demo'
 
@@ -307,26 +312,31 @@ def chunk_documents(docs, dense_model, sparse_model):
         breakpoint_threshold_type='standard_deviation'
     )
 
-    texts = []
+    texts, metadatas = [], []
     for doc in docs:
         texts.append(doc.page_content)
+        metadatas.append(doc.metadata)
 
+    _metadatas = metadatas or [{}] * len(texts)
     documents = []
 
-    def create_document(text: str, i: int):
+    def create_document(text: str, i: int, _metadatas: list):
         index = -1
         for chunk in text_splitter.split_text(text):
+            metadata = copy.deepcopy(_metadatas[i])
             if text_splitter._add_start_index:
                 index = text.find(chunk, index + 1)
-            documents.append(chunk)
+                metadata['start_index'] = index
+            new_doc = Document(page_content=chunk, metadata=metadata)
+            documents.append(new_doc)
 
     joblib.Parallel(n_jobs=joblib.cpu_count(), verbose=1, require='sharedmem')(
-        joblib.delayed(create_document)(text, i) for i, text in enumerate(texts))
+        joblib.delayed(create_document)(text, i, _metadatas) for i, text in enumerate(texts))
 
-    # documents = [doc.page_content for doc in text_splitter.transform_documents(list(docs))]
+    docs = [doc.page_content for doc in documents]
 
-    dense_embeddings = dense_model.embed_documents(documents,32)
-    sparse_embeddings = list(sparse_model.embed(documents, 32))
+    dense_embeddings = dense_model.embed_documents(docs,32)
+    sparse_embeddings = list(sparse_model.embed(docs, 32))
 
     return documents, dense_embeddings, sparse_embeddings
 
@@ -335,8 +345,6 @@ if __name__ == '__main__':
         layout="wide"
     )
     st.title("Video Game Assistant")
-    num_cores = joblib.cpu_count()
-    print(num_cores)
 
     client, collection_name, llm, dense_model, sparse_model = load_models_and_documents()
 
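
For context on the chunk_documents hunk above: the updated code keeps each source document's metadata next to its text, wraps every chunk in a Document, and records the chunk's start offset under 'start_index' (guarded in app.py by text_splitter._add_start_index). The sketch below is a minimal, self-contained illustration of that pattern, not the code in app.py: the paragraph-based split_text and the small Document dataclass are stand-ins for the app's semantic text splitter and LangChain's Document class, the start_index guard is dropped, and the joblib shared-memory parallelism is left out for brevity.

import copy
from dataclasses import dataclass, field

@dataclass
class Document:
    # Stand-in for LangChain's Document: a chunk of text plus its metadata.
    page_content: str
    metadata: dict = field(default_factory=dict)

def split_text(text: str) -> list:
    # Stand-in splitter: app.py uses a semantic chunker; splitting on blank
    # lines keeps this example runnable on its own.
    return [part for part in text.split('\n\n') if part.strip()]

def chunk_documents(texts: list, metadatas: list) -> list:
    _metadatas = metadatas or [{}] * len(texts)
    documents = []
    for i, text in enumerate(texts):
        index = -1
        for chunk in split_text(text):
            # Each chunk gets its own copy of the source document's metadata,
            # plus the offset at which the chunk starts in the source text
            # (app.py only records this when the splitter tracks start indices).
            metadata = copy.deepcopy(_metadatas[i])
            index = text.find(chunk, index + 1)
            metadata['start_index'] = index
            documents.append(Document(page_content=chunk, metadata=metadata))
    return documents

if __name__ == '__main__':
    texts = ["Intro paragraph.\n\nWalkthrough paragraph.", "Single paragraph."]
    metadatas = [{'source': 'guide_a.txt'}, {'source': 'guide_b.txt'}]
    for doc in chunk_documents(texts, metadatas):
        print(doc.metadata, '->', doc.page_content)

In app.py the per-document loop is instead dispatched through joblib.Parallel with require='sharedmem', so the workers append to the same documents list, and the dense and sparse embeddings are then computed from the page_content of each resulting chunk.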