devve1 committed on
Commit
628be14
1 Parent(s): 6f55a57

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -18
app.py CHANGED
@@ -3,7 +3,6 @@ import re
3
  import lz4
4
  import time
5
  import uuid
6
- import vllm
7
  import torch
8
  import spacy
9
  import base64
@@ -20,10 +19,10 @@ from typing import List, Dict
20
  from ppt_chunker import ppt_chunk
21
  from outlines import models, generate
22
  from qdrant_client import QdrantClient
23
- from optimum_encoder import OptimumEncoder
24
  from unstructured.cleaners.core import clean
25
  from streamlit_navigation_bar import st_navbar
26
  from vllm.sampling_params import SamplingParams
 
27
  from fastembed import SparseTextEmbedding, SparseEmbedding
28
  from unstructured.nlp.tokenize import download_nltk_packages
29
  from huggingface_hub import snapshot_download, hf_hub_download
@@ -89,7 +88,7 @@ def transform_query(query: str) -> str:
89
  """
90
  return f'Represent this sentence for searching relevant passages: {query}'
91
 
92
- def query_hybrid_search(query: str, client: QdrantClient, collection_name: str, dense_model: OptimumEncoder, sparse_model: SparseTextEmbedding):
93
  dense_embeddings = dense_model.embed_query(transform_query(query))[0]
94
  sparse_embeddings = list(sparse_model.query_embed(query))[0]
95
 
@@ -103,7 +102,7 @@ def query_hybrid_search(query: str, client: QdrantClient, collection_name: str,
103
  with_vectors=False,
104
  with_payload=True,
105
  limit=10,
106
- score_threshold=0.9
107
  )
108
 
109
  def build_prompt_conv():
@@ -304,11 +303,15 @@ def load_models_and_documents():
304
  container = st.empty()
305
 
306
  with container.status("Load AI Models and Prepare Documents...", expanded=True) as status:
307
- st.write('Downloading and Loading MixedBread Mxbai Dense Embedding Model under ONNX with Nvidia CUDA as backend...')
308
 
309
- dense_model = OptimumEncoder(
310
- device="cuda",
311
- cache_dir=os.getenv('HF_HOME')
 
 
 
 
312
  )
313
 
314
  st.write('Downloading and Loading Qdrant BM42 Sparse Embedding Model under ONNX using the CPU...')
@@ -319,15 +322,10 @@ def load_models_and_documents():
319
  providers=['CPUExecutionProvider']
320
  )
321
 
322
- st.write('Downloading Mistral Nemo AI Model...')
323
-
324
- model_path = snapshot_download('casperhansen/mistral-nemo-instruct-2407-awq')
325
-
326
- st.write('Loading Mistral Nemo AI Model quantized with AWQ and using Outlines + vLLM Engine as backend...')
327
 
328
- llm = vllm.LLM(
329
- model=model_path,
330
- tokenizer=model_path,
331
  tensor_parallel_size=1,
332
  trust_remote_code=True,
333
  enforce_eager=True,
@@ -516,7 +514,8 @@ def chunk_documents(texts: List[str], metadatas: List[dict], dense_model: Optimu
516
  documents.append(doc.page_content)
517
 
518
  start_dense = time.time()
519
- dense_embeddings = dense_model.embed_documents(documents)
 
520
  end_dense = time.time()
521
  final_dense = end_dense - start_dense
522
  print(f'DENSE TIME: {final_dense}')
@@ -529,7 +528,7 @@ def chunk_documents(texts: List[str], metadatas: List[dict], dense_model: Optimu
529
  final_sparse = end_sparse - start_sparse
530
  print(f'SPARSE TIME: {final_sparse}')
531
 
532
- return payload_docs, dense_embeddings, sparse_embeddings
533
 
534
  def on_change_documents_only():
535
  if st.session_state.documents_only:
 
3
  import lz4
4
  import time
5
  import uuid
 
6
  import torch
7
  import spacy
8
  import base64
 
19
  from ppt_chunker import ppt_chunk
20
  from outlines import models, generate
21
  from qdrant_client import QdrantClient
 
22
  from unstructured.cleaners.core import clean
23
  from streamlit_navigation_bar import st_navbar
24
  from vllm.sampling_params import SamplingParams
25
+ from vllm import LLM, PoolingParams, PoolingType
26
  from fastembed import SparseTextEmbedding, SparseEmbedding
27
  from unstructured.nlp.tokenize import download_nltk_packages
28
  from huggingface_hub import snapshot_download, hf_hub_download
 
88
  """
89
  return f'Represent this sentence for searching relevant passages: {query}'
90
 
91
+ def query_hybrid_search(query: str, client: QdrantClient, collection_name: str, dense_model: LLM, sparse_model: SparseTextEmbedding):
92
  dense_embeddings = dense_model.embed_query(transform_query(query))[0]
93
  sparse_embeddings = list(sparse_model.query_embed(query))[0]
94
 
 
102
  with_vectors=False,
103
  with_payload=True,
104
  limit=10,
105
+ score_threshold=0.95
106
  )
107
 
108
  def build_prompt_conv():
 
303
  container = st.empty()
304
 
305
  with container.status("Load AI Models and Prepare Documents...", expanded=True) as status:
306
+ st.write('Downloading and Loading MixedBread Mxbai Dense Embedding Model with vLLM as backend...')
307
 
308
+ dense_model = LLM(
309
+ model='mixedbread-ai/mxbai-embed-large-v1',
310
+ enforce_eager=True,
311
+ max_model_len=512,
312
+ max_num_seqs=32,
313
+ tensor_parallel_size=1,
314
+ dtype=torch.float16
315
  )
316
 
317
  st.write('Downloading and Loading Qdrant BM42 Sparse Embedding Model under ONNX using the CPU...')
 
322
  providers=['CPUExecutionProvider']
323
  )
324
 
325
+ st.write('Downloading and Loading Mistral Nemo AI Model quantized with AWQ and using Outlines + vLLM Engine as backend...')
 
 
 
 
326
 
327
+ llm = LLM(
328
+ model='casperhansen/mistral-nemo-instruct-2407-awq',
 
329
  tensor_parallel_size=1,
330
  trust_remote_code=True,
331
  enforce_eager=True,
 
514
  documents.append(doc.page_content)
515
 
516
  start_dense = time.time()
517
+ dense_embeddings = dense_model.encode(documents, pooling_params=PoolingParams(pooling_type=PoolingType.MEAN))
518
+ print(f'DENSE EMBED : {dense_embeddings}')
519
  end_dense = time.time()
520
  final_dense = end_dense - start_dense
521
  print(f'DENSE TIME: {final_dense}')
 
528
  final_sparse = end_sparse - start_sparse
529
  print(f'SPARSE TIME: {final_sparse}')
530
 
531
+ return payload_docs, dense_embeddings[0].outputs.embedding, sparse_embeddings
532
 
533
  def on_change_documents_only():
534
  if st.session_state.documents_only: