Jokica17 committed on
Commit
cd20a25
·
1 Parent(s): fc456a9

Added backend `app` module and core engine logic:

Browse files

- `vectorizer.py` for prompt vectorization
- `scorer.py` for cosine similarity calculations
- `engine.py` to manage the prompt search engine workflow
- `api.py` to expose a RESTful API for prompt search functionality
- included `requirements.txt` to define app module dependencies

Files changed (6) hide show
  1. app/__init__.py +0 -0
  2. app/api.py +21 -0
  3. app/engine.py +41 -0
  4. app/requirements.txt +4 -0
  5. app/scorer.py +20 -0
  6. app/vectorizer.py +23 -0
app/__init__.py ADDED
File without changes
app/api.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, Query, HTTPException
2
+
3
# ASGI application. The search engine instance is expected to be attached to
# `app.state.search_engine` by whatever code starts the service.
app = FastAPI()


@app.get("/search")
def search(query: str, n: int = Query(5, ge=1, le=10)):
    """
    Endpoint for querying the search engine.
    Args:
        query (str): The search query.
        n (int): Number of results to return (default: 5, allowed range 1-10).
    Returns:
        dict: The original query under "query" and the engine's matches under "results".
    Raises:
        HTTPException: 500 if no search engine has been attached to the app state.
    """
    # `app.state` raises AttributeError for attributes that were never set, so a
    # plain attribute access would bypass the check below; use getattr with a
    # default so the "not initialized" case is reported cleanly.
    search_engine = getattr(app.state, "search_engine", None)
    if search_engine is None:
        raise HTTPException(status_code=500, detail="Search engine not initialized.")

    results = search_engine.most_similar(query, n)
    return {"query": query, "results": results}
app/engine.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from sentence_transformers import SentenceTransformer
3
+ from typing import Sequence, List, Tuple
4
+
5
+ from app.vectorizer import Vectorizer
6
+ from app.scorer import cosine_similarity
7
+
8
+
9
class PromptSearchEngine:
    """Similarity search over a fixed corpus of prompts using embedding vectors."""

    def __init__(self, prompts: Sequence[str]) -> None:
        """
        Initialize search engine by vectorizing prompt corpus.
        Vectorized prompt corpus should be used to find the top n most
        similar prompts w.r.t. user's input prompt.
        Args:
            prompts: The sequence of raw prompts from the dataset.
        """
        self.prompts = prompts
        model = SentenceTransformer("all-MiniLM-L6-v2")
        self.vectorizer = Vectorizer(model)
        # Embed the whole corpus once up front; queries are compared against it.
        self.corpus_vectors = self.vectorizer.transform(prompts)

    def most_similar(self, query: str, n: int = 5) -> List[Tuple[float, str]]:
        """
        Return top n most similar prompts from corpus.
        The query is vectorized with the engine's Vectorizer and compared to
        every corpus vector with cosine_similarity.
        Args:
            query: The raw query prompt input from the user.
            n: The number of similar prompts returned from the corpus.
                Values <= 0 yield an empty list; values larger than the
                corpus size yield the whole corpus.
        Returns:
            The list of (similarity score, prompt) pairs for the top n most
            similar prompts, ordered from most to least similar. Returned
            prompts are verbatim.
        """
        # Without this guard, n == 0 would slice as [-0:] and return the whole
        # corpus, and a negative n would return an arbitrary tail slice.
        if n <= 0 or len(self.prompts) == 0:
            return []

        query_vector = self.vectorizer.transform([query])
        similarities = cosine_similarity(query_vector, self.corpus_vectors)

        # argsort is ascending: take the last n indices and reverse for best-first.
        top_indices = np.argsort(similarities)[-n:][::-1]

        # Convert numpy scalars to plain Python float/int before returning/indexing.
        return [
            (float(similarities[idx]), self.prompts[int(idx)])
            for idx in top_indices
        ]
app/requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ datasets
4
+ sentence-transformers
app/scorer.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
+
4
def cosine_similarity(
    query_vector: np.ndarray,
    corpus_vectors: np.ndarray
) -> np.ndarray:
    """
    Calculate cosine similarity between a query vector and each corpus vector.
    Args:
        query_vector: Vectorized prompt query of shape (1, D) (a flat (D,)
            vector is accepted as well).
        corpus_vectors: Vectorized prompt corpus of shape (N, D).
    Returns:
        The vector of shape (N,) with values in range [-1, 1] where 1 is max
        similarity i.e., two vectors point in the same direction. Pairs that
        involve a zero-length vector have no defined angle and score 0.0.
    """
    query = np.asarray(query_vector).reshape(-1)
    corpus = np.asarray(corpus_vectors)

    dot_products = corpus @ query
    denominators = np.linalg.norm(query) * np.linalg.norm(corpus, axis=1)

    # Divide only where the denominator is non-zero. A plain division would
    # yield NaN/inf for zero vectors, and NaN sorts last in argsort, which
    # would make such rows look like the best matches.
    similarities = np.zeros_like(denominators)
    np.divide(dot_products, denominators, out=similarities, where=denominators > 0)

    # Floating-point rounding can push values marginally outside [-1, 1].
    return np.clip(similarities, -1.0, 1.0)
app/vectorizer.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from typing import Sequence
3
+
4
+
5
class Vectorizer:
    """Thin wrapper that turns raw prompts into embedding vectors via a model."""

    def __init__(self, model) -> None:
        """
        Initialize the vectorizer with a pre-trained embedding model.
        Args:
            model: The pre-trained embedding model to use for transforming
                prompts. It must expose an `encode(prompts, show_progress_bar=...)`
                method.
        """
        self.model = model

    def transform(
        self,
        prompts: Sequence[str],
        *,
        show_progress_bar: bool = True,
    ) -> np.ndarray:
        """
        Transform texts into numerical vectors using the specified model.
        Args:
            prompts: The sequence of raw corpus prompts. A single bare string
                is treated as a one-element sequence.
            show_progress_bar: Forwarded to the model's `encode`; pass False
                to keep single-query calls quiet. Defaults to True.
        Returns:
            Vectorized prompts as a numpy array with one row per prompt.
        """
        # A bare string is itself a Sequence[str]; wrap it so the result keeps
        # one row per prompt instead of collapsing to a single flat vector.
        if isinstance(prompts, str):
            prompts = [prompts]

        # Using 'encode' method for SentenceTransformer model; may need updating for other models (e.g. 'embed')
        embeddings = self.model.encode(prompts, show_progress_bar=show_progress_bar)
        # asarray avoids a redundant copy when the model already returns an ndarray.
        return np.asarray(embeddings)