Update app.py
Browse files
app.py
CHANGED
@@ -1,5 +1,7 @@
|
|
1 |
import os
|
2 |
import re
|
|
|
|
|
3 |
import time
|
4 |
import joblib
|
5 |
import msgpack
|
@@ -196,6 +198,9 @@ def load_models_and_documents():
|
|
196 |
providers=provider
|
197 |
)
|
198 |
|
|
|
|
|
|
|
199 |
client = QdrantClient(':memory:')
|
200 |
collection_name = 'collection_demo'
|
201 |
|
@@ -307,26 +312,31 @@ def chunk_documents(docs, dense_model, sparse_model):
|
|
307 |
breakpoint_threshold_type='standard_deviation'
|
308 |
)
|
309 |
|
310 |
-
texts = []
|
311 |
for doc in docs:
|
312 |
texts.append(doc.page_content)
|
|
|
313 |
|
|
|
314 |
documents = []
|
315 |
|
316 |
-
def create_document(text: str, i: int):
|
317 |
index = -1
|
318 |
for chunk in text_splitter.split_text(text):
|
|
|
319 |
if text_splitter._add_start_index:
|
320 |
index = text.find(chunk, index + 1)
|
321 |
-
|
|
|
|
|
322 |
|
323 |
joblib.Parallel(n_jobs=joblib.cpu_count(), verbose=1, require='sharedmem')(
|
324 |
-
joblib.delayed(create_document)(text, i) for i, text in enumerate(texts))
|
325 |
|
326 |
-
|
327 |
|
328 |
-
dense_embeddings = dense_model.embed_documents(
|
329 |
-
sparse_embeddings = list(sparse_model.embed(
|
330 |
|
331 |
return documents, dense_embeddings, sparse_embeddings
|
332 |
|
@@ -335,8 +345,6 @@ if __name__ == '__main__':
|
|
335 |
layout="wide"
|
336 |
)
|
337 |
st.title("Video Game Assistant")
|
338 |
-
num_cores = joblib.cpu_count()
|
339 |
-
print(num_cores)
|
340 |
|
341 |
client, collection_name, llm, dense_model, sparse_model = load_models_and_documents()
|
342 |
|
|
|
1 |
import os
|
2 |
import re
|
3 |
+
import nltk
|
4 |
+
import copy
|
5 |
import time
|
6 |
import joblib
|
7 |
import msgpack
|
|
|
198 |
providers=provider
|
199 |
)
|
200 |
|
201 |
+
nltk.download('punkt')
|
202 |
+
nltk.download('averaged_perceptron_tagger')
|
203 |
+
|
204 |
client = QdrantClient(':memory:')
|
205 |
collection_name = 'collection_demo'
|
206 |
|
|
|
312 |
breakpoint_threshold_type='standard_deviation'
|
313 |
)
|
314 |
|
315 |
+
texts, metadatas = [], []
|
316 |
for doc in docs:
|
317 |
texts.append(doc.page_content)
|
318 |
+
metadatas.append(doc.metadata)
|
319 |
|
320 |
+
_metadatas = metadatas or [{}] * len(texts)
|
321 |
documents = []
|
322 |
|
323 |
+
def create_document(text: str, i: int, _metadatas: list):
    """Chunk ``text`` and append one ``Document`` per chunk to ``documents``.

    Args:
        text: Source string passed to ``text_splitter.split_text``.
        i: Position of ``text``'s metadata entry inside ``_metadatas``.
        _metadatas: Sequence of metadata mappings; entry ``i`` is deep-copied
            for every chunk so that chunks never share a mutable mapping.

    Side effects:
        Appends to the ``documents`` list from the enclosing scope; nothing
        is returned.
    """
    # Offset of the previous chunk's match; -1 means "nothing matched yet",
    # so the first search starts at position 0.
    cursor = -1
    for piece in text_splitter.split_text(text):
        chunk_meta = copy.deepcopy(_metadatas[i])
        if text_splitter._add_start_index:
            # Resume searching one character past the previous match.
            # ``str.find`` yields -1 when the chunk is not found verbatim,
            # and that value is stored as-is.
            cursor = text.find(piece, cursor + 1)
            chunk_meta['start_index'] = cursor
        documents.append(Document(page_content=piece, metadata=chunk_meta))
|
332 |
|
333 |
joblib.Parallel(n_jobs=joblib.cpu_count(), verbose=1, require='sharedmem')(
|
334 |
+
joblib.delayed(create_document)(text, i, _metadatas) for i, text in enumerate(texts))
|
335 |
|
336 |
+
docs = [doc.page_content for doc in documents]
|
337 |
|
338 |
+
dense_embeddings = dense_model.embed_documents(docs,32)
|
339 |
+
sparse_embeddings = list(sparse_model.embed(docs, 32))
|
340 |
|
341 |
return documents, dense_embeddings, sparse_embeddings
|
342 |
|
|
|
345 |
layout="wide"
|
346 |
)
|
347 |
st.title("Video Game Assistant")
|
|
|
|
|
348 |
|
349 |
client, collection_name, llm, dense_model, sparse_model = load_models_and_documents()
|
350 |
|