Vitomir Jovanović committed on
Commit
a1d6c7a
·
1 Parent(s): e9fda99

Streamlit + Readme.md

Browse files
README.md CHANGED
@@ -10,4 +10,24 @@ pinned: false
10
  short_description: Semantic Search engine with Faiss
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  short_description: Semantic Search engine with Faiss
11
  ---
12
 
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
14
+
15
+ ### For local deployment run
16
+ ```
17
+ main.py
18
+ ```
19
+ which will start a Swagger app with endpoints on [localhost:8084](http://127.0.0.1:8084/docs). The first endpoint returns the top k prompts that are semantically most similar to the query prompt. The second endpoint returns all similarities with the query (only applicable for very small datasets).
20
+
21
+ ```
22
+ data_reader.py
23
+ ```
24
+ creates the data set of various prompts to be encoded into the vector database. The local database encodes only 6000 prompts.
25
+
26
+ The Faiss index used is small and not optimized; it is intended for experimental datasets. Search is brute force, not optimized.
27
+
28
+ ### Streamlit
29
+ ```
30
+ streamlit run app.py
31
+ ```
32
+ should be run to start the Streamlit app; it can be accessed locally at http://localhost:8501.
33
+
app.py CHANGED
@@ -6,13 +6,22 @@ from models.Query import Query, SimilarPrompt, SearchResponse, PromptVector, Vec
6
  from sentence_transformers import SentenceTransformer
7
  import os
8
 
9
- # Path to your prompts data (you need to upload this file to your Hugging Face space)
10
- prompt_path = "models/prompts_data.jsonl" # Update this to the correct path in your space
11
-
12
- # Initialize search engine and model
13
- prompts = load_prompts_from_jsonl(prompt_path)
14
- search_engine = PromptSearchEngine()
15
- search_engine.add_prompts_to_vector_database(prompts)
 
 
 
 
 
 
 
 
 
16
 
17
  # Streamlit App Interface
18
  st.title("Prompt Search Engine")
@@ -27,17 +36,12 @@ k = st.number_input("Number of similar prompts to retrieve:", min_value=1, max_v
27
  # Button to trigger search
28
  if st.button("Search Prompts"):
29
  if query_input:
30
- query = Query(prompt=query_input)
31
- similar_prompts, distances = search_engine.most_similar(query.prompt, top_k=k)
32
 
33
  # Format and display search results
34
- response = [
35
- SimilarPrompt(prompt=prompt, distance=float(distance))
36
- for prompt, distance in zip(similar_prompts, distances)
37
- ]
38
- st.write("Search Results:")
39
- for result in response:
40
- st.write(f"Prompt: {result.prompt}, Distance: {result.distance}")
41
  else:
42
  st.error("Please enter a prompt.")
43
 
@@ -47,54 +51,8 @@ st.write("### Vector Similarities")
47
 
48
  if st.button("Retrieve All Vector Similarities"):
49
  if query_input:
50
- query = Query(prompt=query_input)
51
- query_embedding = search_engine.model.encode([query.prompt]) # Encode the prompt to a vector
52
  all_similarities = search_engine.cosine_similarity(query_embedding, search_engine.index)
53
-
54
- # Format and display vector similarities
55
- response = [
56
- PromptVector(vector=index, distance=float(distance))
57
- for index, distance in enumerate(all_similarities)
58
- ]
59
- st.write("Vector Similarities:")
60
- for result in response:
61
- st.write(f"Vector Index: {result.vector}, Distance: {result.distance}")
62
  else:
63
- st.error("Please enter a prompt.")
64
-
65
-
66
-
67
- # # streamlit_app.py
68
- # import streamlit as st
69
- # import requests
70
-
71
- # # Streamlit app title
72
- # st.title("Top K Search with Vector DataBase")
73
-
74
- # # FastAPI endpoint URL
75
- # # url = "http://localhost:8084/search/"
76
- # url = "https://huggingface.co/search/"
77
-
78
- # # Input fields in Streamlit
79
- # id = st.text_input("Enter ID:", value="1")
80
- # prompt = st.text_input("Enter your prompt:")
81
- # k = st.number_input("Top K results:", min_value=1, max_value=100, value=3)
82
-
83
- # # Trigger the search when the button is clicked
84
- # if st.button("Search"):
85
- # # Construct the request payload
86
- # payload = {
87
- # "id": id,
88
- # "prompt": prompt,
89
- # "k": k
90
- # }
91
-
92
- # # Make the POST request
93
- # response = requests.post(url, json=payload)
94
-
95
- # # Handle the response
96
- # if response.status_code == 200:
97
- # results = response.json()
98
- # st.write(results)
99
- # else:
100
- # st.error(f"Error: {response.status_code} - {response.text}")
 
6
  from sentence_transformers import SentenceTransformer
7
  import os
8
 
9
+ # Cache the prompts data to avoid reloading every time
10
+ @st.cache_data
11
+ def load_prompts():
12
+ prompt_path = "models/prompts_data.jsonl"
13
+ return load_prompts_from_jsonl(prompt_path)
14
+
15
+ # Cache the search engine initialization
16
+ @st.cache_resource
17
+ def get_search_engine():
18
+ search_engine = PromptSearchEngine()
19
+ prompts = load_prompts()
20
+ search_engine.add_prompts_to_vector_database(prompts)
21
+ return search_engine
22
+
23
+ # Initialize search engine only once
24
+ search_engine = get_search_engine()
25
 
26
  # Streamlit App Interface
27
  st.title("Prompt Search Engine")
 
36
  # Button to trigger search
37
  if st.button("Search Prompts"):
38
  if query_input:
39
+ similar_prompts, distances = search_engine.most_similar(query_input, top_k=k)
 
40
 
41
  # Format and display search results
42
+ st.write(f"Search Results: ")
43
+ for i, (prompt, distance) in enumerate(zip(similar_prompts, distances)):
44
+ st.write(f"{i+1}. Prompt: {prompt}, Distance: {distance}")
 
 
 
 
45
  else:
46
  st.error("Please enter a prompt.")
47
 
 
51
 
52
  if st.button("Retrieve All Vector Similarities"):
53
  if query_input:
54
+ query_embedding = search_engine.model.encode([query_input]) # Encode the prompt to a vector
 
55
  all_similarities = search_engine.cosine_similarity(query_embedding, search_engine.index)
56
+ st.write(f"Vector Similarities: {all_similarities}")
 
 
 
 
 
 
 
 
57
  else:
58
+ st.error("Please enter a prompt.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main.py CHANGED
@@ -30,7 +30,7 @@ def read_root():
30
 
31
  @app.post("/search/")
32
  async def search_prompts(query: Query, k: int = 3):
33
- print(f'Prompt: {query.prompt}')
34
  similar_prompts, distances = search_engine.most_similar(query.prompt, top_k=k)
35
  print(f'Similar Prompts {similar_prompts}')
36
  print(f'Distances {distances}')
@@ -48,6 +48,9 @@ async def all_vectors(query: Query):
48
 
49
  query_embedding = search_engine.model.encode([query.prompt]) # Encode the prompt to a vector
50
  all_similarities = search_engine.cosine_similarity(query_embedding, search_engine.index)
 
 
 
51
  response = [
52
  PromptVector(vector=index, distance=float(distance))
53
  for index, distance in enumerate(all_similarities)
@@ -56,7 +59,7 @@ async def all_vectors(query: Query):
56
 
57
  if __name__ == "__main__":
58
  # Server Config
59
- SERVER_HOST_IP = socket.gethostbyname(socket.gethostname())
60
- # SERVER_HOST_IP = socket.gethostbyname("localhost") # for local deployment
61
  SERVER_PORT = int(8084)
62
  uvicorn.run(app, host=SERVER_HOST_IP, port=SERVER_PORT)
 
30
 
31
  @app.post("/search/")
32
  async def search_prompts(query: Query, k: int = 3):
33
+ print(f'Prompt: {query}')
34
  similar_prompts, distances = search_engine.most_similar(query.prompt, top_k=k)
35
  print(f'Similar Prompts {similar_prompts}')
36
  print(f'Distances {distances}')
 
48
 
49
  query_embedding = search_engine.model.encode([query.prompt]) # Encode the prompt to a vector
50
  all_similarities = search_engine.cosine_similarity(query_embedding, search_engine.index)
51
+ print(f'Prompt: {query}')
52
+ print(f'All Vector Similarities: {all_similarities}')
53
+ print(40*'****')
54
  response = [
55
  PromptVector(vector=index, distance=float(distance))
56
  for index, distance in enumerate(all_similarities)
 
59
 
60
  if __name__ == "__main__":
61
  # Server Config
62
+ # SERVER_HOST_IP = socket.gethostbyname(socket.gethostname())
63
+ SERVER_HOST_IP = socket.gethostbyname("localhost") # for local deployment
64
  SERVER_PORT = int(8084)
65
  uvicorn.run(app, host=SERVER_HOST_IP, port=SERVER_PORT)
models/__pycache__/data_reader.cpython-312.pyc CHANGED
Binary files a/models/__pycache__/data_reader.cpython-312.pyc and b/models/__pycache__/data_reader.cpython-312.pyc differ
 
models/__pycache__/prompt_search_engine.cpython-312.pyc CHANGED
Binary files a/models/__pycache__/prompt_search_engine.cpython-312.pyc and b/models/__pycache__/prompt_search_engine.cpython-312.pyc differ
 
models/data_reader.py CHANGED
@@ -32,11 +32,13 @@ def read_data(jsonl_file_path):
32
  print(row)
33
 
34
  def load_prompts_from_jsonl(file_path):
 
35
  prompts = []
36
  with open(file_path, 'r') as f:
37
  for line in f:
38
  data = json.loads(line) # Each line is a JSON object
39
  prompts.append(data) # Extract the 'prompt' field
 
40
  return prompts
41
 
42
 
 
32
  print(row)
33
 
34
  def load_prompts_from_jsonl(file_path):
35
+ print('Loading prompts from:', file_path)
36
  prompts = []
37
  with open(file_path, 'r') as f:
38
  for line in f:
39
  data = json.loads(line) # Each line is a JSON object
40
  prompts.append(data) # Extract the 'prompt' field
41
+ print("Data loaded successfully.")
42
  return prompts
43
 
44
 
models/prompt_search_engine.py CHANGED
@@ -6,6 +6,7 @@ import faiss
6
 
7
  class PromptSearchEngine:
8
  def __init__(self, model_name='bert-base-nli-mean-tokens'):
 
9
  self.model = SentenceTransformer(model_name)
10
  # Initialize FAISS index with right number of dimensions
11
  self.embedding_dimension = self.model.get_sentence_embedding_dimension()
@@ -14,13 +15,16 @@ class PromptSearchEngine:
14
 
15
 
16
  def add_prompts_to_vector_database(self, prompts):
 
17
  embeddings = self.model.encode(prompts)
18
  self.index.add(np.array(embeddings).astype('float32'))
19
  self.prompts_track.extend(prompts)
 
20
 
21
 
22
  def most_similar(self, query, top_k=5):
23
- # Encode the query
 
24
  query_embedding = self.model.encode([query]).astype('float32')
25
 
26
  # Optimized search, but we have to change the index type
@@ -37,7 +41,7 @@ class PromptSearchEngine:
37
  Args: query_vector: The query vector to compare against the corpus vectors. corpus_vectors: The set of corpus vectors to compare against the query vector.
38
  Returns: The cosine similarity between the query vector and the corpus vectors.
39
  """
40
-
41
  query_vector = np.array(query_vector).astype('float32')
42
  query_norm = query_vector / np.linalg.norm(query_vector)
43
 
 
6
 
7
  class PromptSearchEngine:
8
  def __init__(self, model_name='bert-base-nli-mean-tokens'):
9
+ print("Search engine started!")
10
  self.model = SentenceTransformer(model_name)
11
  # Initialize FAISS index with right number of dimensions
12
  self.embedding_dimension = self.model.get_sentence_embedding_dimension()
 
15
 
16
 
17
  def add_prompts_to_vector_database(self, prompts):
18
+ print("Data encoding started...")
19
  embeddings = self.model.encode(prompts)
20
  self.index.add(np.array(embeddings).astype('float32'))
21
  self.prompts_track.extend(prompts)
22
+ print("Data encoding completed!")
23
 
24
 
25
  def most_similar(self, query, top_k=5):
26
+ # Encode the query
27
+ print('Finding the most similar vectors')
28
  query_embedding = self.model.encode([query]).astype('float32')
29
 
30
  # Optimized search, but we have to change the index type
 
41
  Args: query_vector: The query vector to compare against the corpus vectors. corpus_vectors: The set of corpus vectors to compare against the query vector.
42
  Returns: The cosine similarity between the query vector and the corpus vectors.
43
  """
44
+ print('Searching for all similarities...')
45
  query_vector = np.array(query_vector).astype('float32')
46
  query_norm = query_vector / np.linalg.norm(query_vector)
47