Spaces:
Running
on
T4
Running
on
T4
Update app.py
Browse files
app.py
CHANGED
@@ -27,7 +27,7 @@ from streamlit_navigation_bar import st_navbar
|
|
27 |
from ppt_chunker import ppt_chunk
|
28 |
from unstructured.cleaners.core import clean
|
29 |
from unstructured.partition.pptx import partition_pptx
|
30 |
-
from fastembed import
|
31 |
from unstructured.nlp.tokenize import download_nltk_packages
|
32 |
from scipy.sparse import csr_matrix, save_npz, load_npz, vstack
|
33 |
from langchain_experimental.text_splitter import SemanticChunker
|
@@ -67,23 +67,6 @@ icon_to_types = {
|
|
67 |
'Excel')
|
68 |
}
|
69 |
|
70 |
-
def make_points(texts: List[str], metadatas: List[dict], dense: List[List[float]], sparse: List[SparseEmbedding])-> List[PointStruct]:
|
71 |
-
points = []
|
72 |
-
for idx, (text, metadata, sparse_vector, dense_vector) in enumerate(zip(texts, metadatas, sparse, dense)):
|
73 |
-
sparse_vec = SparseVector(indices=sparse_vector.indices.tolist(), values=sparse_vector.values.tolist())
|
74 |
-
point = PointStruct(
|
75 |
-
id=idx,
|
76 |
-
vector={
|
77 |
-
"text-sparse": sparse_vec,
|
78 |
-
"text-dense": dense_vector,
|
79 |
-
},
|
80 |
-
payload={
|
81 |
-
"text": text,
|
82 |
-
"metadata": metadata
|
83 |
-
}
|
84 |
-
)
|
85 |
-
points.append(point)
|
86 |
-
return points
|
87 |
|
88 |
def transform_query(query: str) -> str:
|
89 |
""" For retrieval, add the prompt for query (not for documents).
|
@@ -325,14 +308,18 @@ def load_models_and_documents():
|
|
325 |
sparse_embeddings.append(embedding)
|
326 |
|
327 |
st.write('Ingesting saved documents on disk into our Qdrant Vector Database...')
|
328 |
-
|
329 |
client.upsert(
|
330 |
collection_name,
|
331 |
-
|
332 |
-
|
333 |
-
|
334 |
-
|
335 |
-
|
|
|
|
|
|
|
|
|
336 |
)
|
337 |
)
|
338 |
|
|
|
27 |
from ppt_chunker import ppt_chunk
|
28 |
from unstructured.cleaners.core import clean
|
29 |
from unstructured.partition.pptx import partition_pptx
|
30 |
+
from fastembed import SparseTextEmbedding
|
31 |
from unstructured.nlp.tokenize import download_nltk_packages
|
32 |
from scipy.sparse import csr_matrix, save_npz, load_npz, vstack
|
33 |
from langchain_experimental.text_splitter import SemanticChunker
|
|
|
67 |
'Excel')
|
68 |
}
|
69 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
|
71 |
def transform_query(query: str) -> str:
|
72 |
""" For retrieval, add the prompt for query (not for documents).
|
|
|
308 |
sparse_embeddings.append(embedding)
|
309 |
|
310 |
st.write('Ingesting saved documents on disk into our Qdrant Vector Database...')
|
311 |
+
|
312 |
client.upsert(
|
313 |
collection_name,
|
314 |
+
points=Batch(
|
315 |
+
payload={
|
316 |
+
'texts': docs_texts,
|
317 |
+
'metadatas': docs_metadatas
|
318 |
+
}
|
319 |
+
vectors={
|
320 |
+
'text-dense': dense_embeddings,
|
321 |
+
'text-sparse': sparse_embeddings
|
322 |
+
}
|
323 |
)
|
324 |
)
|
325 |
|